This commit was manufactured by cvs2svn to create tag 'r211c1'.
[python/dscho.git] / Doc / lib / libre.tex
blob414f1b5bf59a25585206cbba1a73838ab6296529
1 \section{\module{re} ---
2 Regular expression operations}
3 \declaremodule{standard}{re}
4 \moduleauthor{Andrew M. Kuchling}{amk1@bigfoot.com}
5 \moduleauthor{Fredrik Lundh}{effbot@telia.com}
6 \sectionauthor{Andrew M. Kuchling}{amk1@bigfoot.com}
9 \modulesynopsis{Regular expression search and match operations with a
10 Perl-style expression syntax.}
13 This module provides regular expression matching operations similar to
14 those found in Perl. Regular expression pattern strings may not
15 contain null bytes, but can specify the null byte using the
16 \code{\e\var{number}} notation. Both patterns and strings to be
17 searched can be Unicode strings as well as 8-bit strings. The
18 \module{re} module is always available.
20 Regular expressions use the backslash character (\character{\e}) to
21 indicate special forms or to allow special characters to be used
22 without invoking their special meaning. This collides with Python's
23 usage of the same character for the same purpose in string literals;
24 for example, to match a literal backslash, one might have to write
25 \code{'\e\e\e\e'} as the pattern string, because the regular expression
26 must be \samp{\e\e}, and each backslash must be expressed as
27 \samp{\e\e} inside a regular Python string literal.
29 The solution is to use Python's raw string notation for regular
30 expression patterns; backslashes are not handled in any special way in
31 a string literal prefixed with \character{r}. So \code{r"\e n"} is a
32 two-character string containing \character{\e} and \character{n},
33 while \code{"\e n"} is a one-character string containing a newline.
34 Usually patterns will be expressed in Python code using this raw
35 string notation.
37 \strong{Implementation note:}
38 The \module{re}\refstmodindex{pre} module has two distinct
39 implementations: \module{sre} is the default implementation and
40 includes Unicode support, but may run into stack limitations for some
41 patterns. Though this will be fixed for a future release of Python,
42 the older implementation (without Unicode support) is still available
43 as the \module{pre}\refstmodindex{pre} module.
46 \begin{seealso}
47 \seetitle{Mastering Regular Expressions}{Book on regular expressions
48 by Jeffrey Friedl, published by O'Reilly. The Python
49 material in this book dates from before the \refmodule{re}
50 module, but it covers writing good regular expression
51 patterns in great detail.}
52 \end{seealso}
55 \subsection{Regular Expression Syntax \label{re-syntax}}
57 A regular expression (or RE) specifies a set of strings that matches
58 it; the functions in this module let you check if a particular string
59 matches a given regular expression (or if a given regular expression
60 matches a particular string, which comes down to the same thing).
62 Regular expressions can be concatenated to form new regular
63 expressions; if \emph{A} and \emph{B} are both regular expressions,
64 then \emph{AB} is also a regular expression. If a string \emph{p}
65 matches A and another string \emph{q} matches B, the string \emph{pq}
66 will match AB. Thus, complex expressions can easily be constructed
67 from simpler primitive expressions like the ones described here. For
68 details of the theory and implementation of regular expressions,
69 consult the Friedl book referenced above, or almost any textbook about
70 compiler construction.
72 A brief explanation of the format of regular expressions follows. For
73 further information and a gentler presentation, consult the Regular
74 Expression HOWTO, accessible from \url{http://www.python.org/doc/howto/}.
76 Regular expressions can contain both special and ordinary characters.
77 Most ordinary characters, like \character{A}, \character{a}, or \character{0},
78 are the simplest regular expressions; they simply match themselves.
79 You can concatenate ordinary characters, so \regexp{last} matches the
80 string \code{'last'}. (In the rest of this section, we'll write REs in
81 \regexp{this special style}, usually without quotes, and strings to be
82 matched \code{'in single quotes'}.)
84 Some characters, like \character{|} or \character{(}, are special. Special
85 characters either stand for classes of ordinary characters, or affect
86 how the regular expressions around them are interpreted.
88 The special characters are:
90 \begin{list}{}{\leftmargin 0.7in \labelwidth 0.65in}
92 \item[\character{.}] (Dot.) In the default mode, this matches any
93 character except a newline. If the \constant{DOTALL} flag has been
94 specified, this matches any character including a newline.
96 \item[\character{\^}] (Caret.) Matches the start of the string, and in
97 \constant{MULTILINE} mode also matches immediately after each newline.
99 \item[\character{\$}] Matches the end of the string, and in
100 \constant{MULTILINE} mode also matches before a newline.
101 \regexp{foo} matches both 'foo' and 'foobar', while the regular
102 expression \regexp{foo\$} matches only 'foo'.
104 \item[\character{*}] Causes the resulting RE to
105 match 0 or more repetitions of the preceding RE, as many repetitions
106 as are possible. \regexp{ab*} will
107 match 'a', 'ab', or 'a' followed by any number of 'b's.
109 \item[\character{+}] Causes the
110 resulting RE to match 1 or more repetitions of the preceding RE.
111 \regexp{ab+} will match 'a' followed by any non-zero number of 'b's; it
112 will not match just 'a'.
114 \item[\character{?}] Causes the resulting RE to
115 match 0 or 1 repetitions of the preceding RE. \regexp{ab?} will
116 match either 'a' or 'ab'.
117 \item[\code{*?}, \code{+?}, \code{??}] The \character{*}, \character{+}, and
118 \character{?} qualifiers are all \dfn{greedy}; they match as much text as
119 possible. Sometimes this behaviour isn't desired; if the RE
120 \regexp{<.*>} is matched against \code{'<H1>title</H1>'}, it will match the
121 entire string, and not just \code{'<H1>'}.
122 Adding \character{?} after the qualifier makes it perform the match in
123 \dfn{non-greedy} or \dfn{minimal} fashion; as \emph{few} characters as
124 possible will be matched. Using \regexp{.*?} in the previous
125 expression will match only \code{'<H1>'}.
127 \item[\code{\{\var{m},\var{n}\}}] Causes the resulting RE to match from
128 \var{m} to \var{n} repetitions of the preceding RE, attempting to
129 match as many repetitions as possible. For example, \regexp{a\{3,5\}}
130 will match from 3 to 5 \character{a} characters. Omitting \var{n}
131 specifies an infinite upper bound; you can't omit \var{m}.
133 \item[\code{\{\var{m},\var{n}\}?}] Causes the resulting RE to
134 match from \var{m} to \var{n} repetitions of the preceding RE,
135 attempting to match as \emph{few} repetitions as possible. This is
136 the non-greedy version of the previous qualifier. For example, on the
137 6-character string \code{'aaaaaa'}, \regexp{a\{3,5\}} will match 5
138 \character{a} characters, while \regexp{a\{3,5\}?} will only match 3
139 characters.
141 \item[\character{\e}] Either escapes special characters (permitting
142 you to match characters like \character{*}, \character{?}, and so
143 forth), or signals a special sequence; special sequences are discussed
144 below.
146 If you're not using a raw string to
147 express the pattern, remember that Python also uses the
148 backslash as an escape sequence in string literals; if the escape
149 sequence isn't recognized by Python's parser, the backslash and
150 subsequent character are included in the resulting string. However,
151 if Python would recognize the resulting sequence, the backslash should
152 be repeated twice. This is complicated and hard to understand, so
153 it's highly recommended that you use raw strings for all but the
154 simplest expressions.
156 \item[\code{[]}] Used to indicate a set of characters. Characters can
157 be listed individually, or a range of characters can be indicated by
158 giving two characters and separating them by a \character{-}. Special
159 characters are not active inside sets. For example, \regexp{[akm\$]}
160 will match any of the characters \character{a}, \character{k},
161 \character{m}, or \character{\$}; \regexp{[a-z]}
162 will match any lowercase letter, and \code{[a-zA-Z0-9]} matches any
163 letter or digit. Character classes such as \code{\e w} or \code{\e S}
164 (defined below) are also acceptable inside a range. If you want to
165 include a \character{]} or a \character{-} inside a set, precede it with a
166 backslash, or place it as the first character. The
167 pattern \regexp{[]]} will match \code{']'}, for example.
169 You can match the characters not within a range by \dfn{complementing}
170 the set. This is indicated by including a
171 \character{\^} as the first character of the set; \character{\^} elsewhere will
172 simply match the \character{\^} character. For example, \regexp{[{\^}5]}
173 will match any character except \character{5}.
175 \item[\character{|}]\code{A|B}, where A and B can be arbitrary REs,
176 creates a regular expression that will match either A or B. An
177 arbitrary number of REs can be separated by the \character{|} in this
178 way. This can be used inside groups (see below) as well. REs
179 separated by \character{|} are tried from left to right, and the first
180 one that allows the complete pattern to match is considered the
181 accepted branch. This means that if \code{A} matches, \code{B} will
182 never be tested, even if it would produce a longer overall match. In
183 other words, the \character{|} operator is never greedy. To match a
184 literal \character{|}, use \regexp{\e|}, or enclose it inside a
185 character class, as in \regexp{[|]}.
187 \item[\code{(...)}] Matches whatever regular expression is inside the
188 parentheses, and indicates the start and end of a group; the contents
189 of a group can be retrieved after a match has been performed, and can
190 be matched later in the string with the \regexp{\e \var{number}} special
191 sequence, described below. To match the literals \character{(} or
192 \character{)}, use \regexp{\e(} or \regexp{\e)}, or enclose them
193 inside a character class: \regexp{[(] [)]}.
195 \item[\code{(?...)}] This is an extension notation (a \character{?}
196 following a \character{(} is not meaningful otherwise). The first
197 character after the \character{?}
198 determines what the meaning and further syntax of the construct is.
199 Extensions usually do not create a new group;
200 \regexp{(?P<\var{name}>...)} is the only exception to this rule.
201 Following are the currently supported extensions.
203 \item[\code{(?iLmsux)}] (One or more letters from the set \character{i},
204 \character{L}, \character{m}, \character{s}, \character{u},
205 \character{x}.) The group matches the empty string; the letters set
206 the corresponding flags (\constant{re.I}, \constant{re.L},
207 \constant{re.M}, \constant{re.S}, \constant{re.U}, \constant{re.X})
208 for the entire regular expression. This is useful if you wish to
209 include the flags as part of the regular expression, instead of
210 passing a \var{flag} argument to the \function{compile()} function.
212 Note that the \regexp{(?x)} flag changes how the expression is parsed.
213 It should be used first in the expression string, or after one or more
214 whitespace characters. If there are non-whitespace characters before
215 the flag, the results are undefined.
217 \item[\code{(?:...)}] A non-grouping version of regular parentheses.
218 Matches whatever regular expression is inside the parentheses, but the
219 substring matched by the
220 group \emph{cannot} be retrieved after performing a match or
221 referenced later in the pattern.
223 \item[\code{(?P<\var{name}>...)}] Similar to regular parentheses, but
224 the substring matched by the group is accessible via the symbolic group
225 name \var{name}. Group names must be valid Python identifiers. A
226 symbolic group is also a numbered group, just as if the group were not
227 named. So the group named 'id' in the example below can also be
228 referenced as the numbered group 1.
230 For example, if the pattern is
231 \regexp{(?P<id>[a-zA-Z_]\e w*)}, the group can be referenced by its
232 name in arguments to methods of match objects, such as \code{m.group('id')}
233 or \code{m.end('id')}, and also by name in pattern text
234 (e.g. \regexp{(?P=id)}) and replacement text (e.g. \code{\e g<id>}).
236 \item[\code{(?P=\var{name})}] Matches whatever text was matched by the
237 earlier group named \var{name}.
239 \item[\code{(?\#...)}] A comment; the contents of the parentheses are
240 simply ignored.
242 \item[\code{(?=...)}] Matches if \regexp{...} matches next, but doesn't
243 consume any of the string. This is called a lookahead assertion. For
244 example, \regexp{Isaac (?=Asimov)} will match \code{'Isaac~'} only if it's
245 followed by \code{'Asimov'}.
247 \item[\code{(?!...)}] Matches if \regexp{...} doesn't match next. This
248 is a negative lookahead assertion. For example,
249 \regexp{Isaac (?!Asimov)} will match \code{'Isaac~'} only if it's \emph{not}
250 followed by \code{'Asimov'}.
252 \item[\code{(?<=...)}] Matches if the current position in the string
253 is preceded by a match for \regexp{...} that ends at the current
254 position. This is called a positive lookbehind assertion.
255 \regexp{(?<=abc)def} will match \samp{abcdef}, since the lookbehind
256 will back up 3 characters and check if the contained pattern matches.
257 The contained pattern must only match strings of some fixed length,
258 meaning that \regexp{abc} or \regexp{a|b} are allowed, but \regexp{a*}
259 isn't.
261 \item[\code{(?<!...)}] Matches if the current position in the string
262 is not preceded by a match for \regexp{...}. This
263 is called a negative lookbehind assertion. Similar to positive lookbehind
264 assertions, the contained pattern must only match strings of some
265 fixed length.
267 \end{list}
269 The special sequences consist of \character{\e} and a character from the
270 list below. If the ordinary character is not on the list, then the
271 resulting RE will match the second character. For example,
272 \regexp{\e\$} matches the character \character{\$}.
274 \begin{list}{}{\leftmargin 0.7in \labelwidth 0.65in}
276 \item[\code{\e \var{number}}] Matches the contents of the group of the
277 same number. Groups are numbered starting from 1. For example,
278 \regexp{(.+) \e 1} matches \code{'the the'} or \code{'55 55'}, but not
279 \code{'the end'} (note
280 the space after the group). This special sequence can only be used to
281 match one of the first 99 groups. If the first digit of \var{number}
282 is 0, or \var{number} is 3 octal digits long, it will not be interpreted
283 as a group match, but as the character with octal value \var{number}.
284 Inside the \character{[} and \character{]} of a character class, all numeric
285 escapes are treated as characters.
287 \item[\code{\e A}] Matches only at the start of the string.
289 \item[\code{\e b}] Matches the empty string, but only at the
290 beginning or end of a word. A word is defined as a sequence of
291 alphanumeric characters, so the end of a word is indicated by
292 whitespace or a non-alphanumeric character. Inside a character range,
293 \regexp{\e b} represents the backspace character, for compatibility with
294 Python's string literals.
296 \item[\code{\e B}] Matches the empty string, but only when it is
297 \emph{not} at the beginning or end of a word.
299 \item[\code{\e d}]Matches any decimal digit; this is
300 equivalent to the set \regexp{[0-9]}.
302 \item[\code{\e D}]Matches any non-digit character; this is
303 equivalent to the set \regexp{[{\^}0-9]}.
305 \item[\code{\e s}]Matches any whitespace character; this is
306 equivalent to the set \regexp{[ \e t\e n\e r\e f\e v]}.
308 \item[\code{\e S}]Matches any non-whitespace character; this is
309 equivalent to the set \regexp{[\^\ \e t\e n\e r\e f\e v]}.
311 \item[\code{\e w}]When the \constant{LOCALE} and \constant{UNICODE}
312 flags are not specified,
313 matches any alphanumeric character; this is equivalent to the set
314 \regexp{[a-zA-Z0-9_]}. With \constant{LOCALE}, it will match the set
315 \regexp{[0-9_]} plus whatever characters are defined as letters for
316 the current locale. If \constant{UNICODE} is set, this will match the
317 characters \regexp{[0-9_]} plus whatever is classified as alphanumeric
318 in the Unicode character properties database.
320 \item[\code{\e W}]When the \constant{LOCALE} and \constant{UNICODE}
321 flags are not specified, matches any non-alphanumeric character; this
322 is equivalent to the set \regexp{[{\^}a-zA-Z0-9_]}. With
323 \constant{LOCALE}, it will match any character not in the set
324 \regexp{[0-9_]}, and not defined as a letter for the current locale.
325 If \constant{UNICODE} is set, this will match anything other than
326 \regexp{[0-9_]} and characters marked as alphanumeric in the Unicode
327 character properties database.
329 \item[\code{\e Z}]Matches only at the end of the string.
331 \item[\code{\e \e}] Matches a literal backslash.
333 \end{list}
336 \subsection{Matching vs. Searching \label{matching-searching}}
337 \sectionauthor{Fred L. Drake, Jr.}{fdrake@acm.org}
339 Python offers two different primitive operations based on regular
340 expressions: match and search. If you are accustomed to Perl's
341 semantics, the search operation is what you're looking for. See the
342 \function{search()} function and corresponding method of compiled
343 regular expression objects.
345 Note that match may differ from search using a regular expression
346 beginning with \character{\^}: \character{\^} matches only at the
347 start of the string, or in \constant{MULTILINE} mode also immediately
348 following a newline. The ``match'' operation succeeds only if the
349 pattern matches at the start of the string regardless of mode, or at
350 the starting position given by the optional \var{pos} argument
351 regardless of whether a newline precedes it.
353 % Examples from Tim Peters:
354 \begin{verbatim}
355 re.compile("a").match("ba", 1) # succeeds
356 re.compile("^a").search("ba", 1) # fails; 'a' not at start
357 re.compile("^a").search("\na", 1) # fails; 'a' not at start
358 re.compile("^a", re.M).search("\na", 1) # succeeds
359 re.compile("^a", re.M).search("ba", 1) # fails; no preceding \n
360 \end{verbatim}
363 \subsection{Module Contents}
364 \nodename{Contents of Module re}
366 The module defines the following functions and constants, and an exception:
369 \begin{funcdesc}{compile}{pattern\optional{, flags}}
370 Compile a regular expression pattern into a regular expression
371 object, which can be used for matching using its \function{match()} and
372 \function{search()} methods, described below.
374 The expression's behaviour can be modified by specifying a
375 \var{flags} value. Values can be any of the following variables,
376 combined using bitwise OR (the \code{|} operator).
378 The sequence
380 \begin{verbatim}
381 prog = re.compile(pat)
382 result = prog.match(str)
383 \end{verbatim}
385 is equivalent to
387 \begin{verbatim}
388 result = re.match(pat, str)
389 \end{verbatim}
391 but the version using \function{compile()} is more efficient when the
392 expression will be used several times in a single program.
393 %(The compiled version of the last pattern passed to
394 %\function{re.match()} or \function{re.search()} is cached, so
395 %programs that use only a single regular expression at a time needn't
396 %worry about compiling regular expressions.)
397 \end{funcdesc}
399 \begin{datadesc}{I}
400 \dataline{IGNORECASE}
401 Perform case-insensitive matching; expressions like \regexp{[A-Z]} will match
402 lowercase letters, too. This is not affected by the current locale.
403 \end{datadesc}
405 \begin{datadesc}{L}
406 \dataline{LOCALE}
407 Make \regexp{\e w}, \regexp{\e W}, \regexp{\e b}, and
408 \regexp{\e B} dependent on the current locale.
409 \end{datadesc}
411 \begin{datadesc}{M}
412 \dataline{MULTILINE}
413 When specified, the pattern character \character{\^} matches at the
414 beginning of the string and at the beginning of each line
415 (immediately following each newline); and the pattern character
416 \character{\$} matches at the end of the string and at the end of each line
417 (immediately preceding each newline).
418 By default, \character{\^} matches only at the beginning of the string, and
419 \character{\$} only at the end of the string and immediately before the
420 newline (if any) at the end of the string.
421 \end{datadesc}
423 \begin{datadesc}{S}
424 \dataline{DOTALL}
425 Make the \character{.} special character match any character at all,
426 including a newline; without this flag, \character{.} will match
427 anything \emph{except} a newline.
428 \end{datadesc}
430 \begin{datadesc}{U}
431 \dataline{UNICODE}
432 Make \regexp{\e w}, \regexp{\e W}, \regexp{\e b}, and
433 \regexp{\e B} dependent on the Unicode character properties database.
434 \versionadded{2.0}
435 \end{datadesc}
437 \begin{datadesc}{X}
438 \dataline{VERBOSE}
439 This flag allows you to write regular expressions that look nicer.
440 Whitespace within the pattern is ignored,
441 except when in a character class or preceded by an unescaped
442 backslash, and, when a line contains a \character{\#} neither in a character
443 class nor preceded by an unescaped backslash, all characters from the
444 leftmost such \character{\#} through the end of the line are ignored.
445 % XXX should add an example here
446 \end{datadesc}
449 \begin{funcdesc}{search}{pattern, string\optional{, flags}}
450 Scan through \var{string} looking for a location where the regular
451 expression \var{pattern} produces a match, and return a
452 corresponding \class{MatchObject} instance.
453 Return \code{None} if no
454 position in the string matches the pattern; note that this is
455 different from finding a zero-length match at some point in the string.
456 \end{funcdesc}
458 \begin{funcdesc}{match}{pattern, string\optional{, flags}}
459 If zero or more characters at the beginning of \var{string} match
460 the regular expression \var{pattern}, return a corresponding
461 \class{MatchObject} instance. Return \code{None} if the string does not
462 match the pattern; note that this is different from a zero-length
463 match.
465 \strong{Note:} If you want to locate a match anywhere in
466 \var{string}, use \method{search()} instead.
467 \end{funcdesc}
469 \begin{funcdesc}{split}{pattern, string\optional{, maxsplit\code{ = 0}}}
470 Split \var{string} by the occurrences of \var{pattern}. If
471 capturing parentheses are used in \var{pattern}, then the text of all
472 groups in the pattern are also returned as part of the resulting list.
473 If \var{maxsplit} is nonzero, at most \var{maxsplit} splits
474 occur, and the remainder of the string is returned as the final
475 element of the list. (Incompatibility note: in the original Python
476 1.5 release, \var{maxsplit} was ignored. This has been fixed in
477 later releases.)
479 \begin{verbatim}
480 >>> re.split('\W+', 'Words, words, words.')
481 ['Words', 'words', 'words', '']
482 >>> re.split('(\W+)', 'Words, words, words.')
483 ['Words', ', ', 'words', ', ', 'words', '.', '']
484 >>> re.split('\W+', 'Words, words, words.', 1)
485 ['Words', 'words, words.']
486 \end{verbatim}
488 This function combines and extends the functionality of
489 the old \function{regsub.split()} and \function{regsub.splitx()}.
490 \end{funcdesc}
492 \begin{funcdesc}{findall}{pattern, string}
493 Return a list of all non-overlapping matches of \var{pattern} in
494 \var{string}. If one or more groups are present in the pattern,
495 return a list of groups; this will be a list of tuples if the pattern
496 has more than one group. Empty matches are included in the result.
497 \versionadded{1.5.2}
498 \end{funcdesc}
500 \begin{funcdesc}{sub}{pattern, repl, string\optional{, count\code{ = 0}}}
501 Return the string obtained by replacing the leftmost non-overlapping
502 occurrences of \var{pattern} in \var{string} by the replacement
503 \var{repl}. If the pattern isn't found, \var{string} is returned
504 unchanged. \var{repl} can be a string or a function; if a function,
505 it is called for every non-overlapping occurrence of \var{pattern}.
506 The function takes a single match object argument, and returns the
507 replacement string. For example:
509 \begin{verbatim}
510 >>> def dashrepl(matchobj):
511 ...     if matchobj.group(0) == '-': return ' '
512 ...     else: return '-'
513 >>> re.sub('-{1,2}', dashrepl, 'pro----gram-files')
514 'pro--gram files'
515 \end{verbatim}
517 The pattern may be a string or an RE object; if you need to specify
518 regular expression flags, you must use a RE object, or use
519 embedded modifiers in a pattern; e.g.
520 \samp{sub("(?i)b+", "x", "bbbb BBBB")} returns \code{'x x'}.
522 The optional argument \var{count} is the maximum number of pattern
523 occurrences to be replaced; \var{count} must be a non-negative integer, and
524 the default value of 0 means to replace all occurrences.
526 Empty matches for the pattern are replaced only when not adjacent to a
527 previous match, so \samp{sub('x*', '-', 'abc')} returns \code{'-a-b-c-'}.
529 If \var{repl} is a string, any backslash escapes in it are processed.
530 That is, \samp{\e n} is converted to a single newline character,
531 \samp{\e r} is converted to a carriage return, and so forth. Unknown escapes
532 such as \samp{\e j} are left alone. Backreferences, such as \samp{\e 6}, are
533 replaced with the substring matched by group 6 in the pattern.
535 In addition to character escapes and backreferences as described
536 above, \samp{\e g<name>} will use the substring matched by the group
537 named \samp{name}, as defined by the \regexp{(?P<name>...)} syntax.
538 \samp{\e g<number>} uses the corresponding group number; \samp{\e
539 g<2>} is therefore equivalent to \samp{\e 2}, but isn't ambiguous in a
540 replacement such as \samp{\e g<2>0}. \samp{\e 20} would be
541 interpreted as a reference to group 20, not a reference to group 2
542 followed by the literal character \character{0}.
543 \end{funcdesc}
545 \begin{funcdesc}{subn}{pattern, repl, string\optional{, count\code{ = 0}}}
546 Perform the same operation as \function{sub()}, but return a tuple
547 \code{(\var{new_string}, \var{number_of_subs_made})}.
548 \end{funcdesc}
550 \begin{funcdesc}{escape}{string}
551 Return \var{string} with all non-alphanumerics backslashed; this is
552 useful if you want to match an arbitrary literal string that may have
553 regular expression metacharacters in it.
554 \end{funcdesc}
556 \begin{excdesc}{error}
557 Exception raised when a string passed to one of the functions here
558 is not a valid regular expression (e.g., unmatched parentheses) or
559 when some other error occurs during compilation or matching. It is
560 never an error if a string contains no match for a pattern.
561 \end{excdesc}
564 \subsection{Regular Expression Objects \label{re-objects}}
566 Compiled regular expression objects support the following methods and
567 attributes:
569 \begin{methoddesc}[RegexObject]{search}{string\optional{, pos\optional{,
570 endpos}}}
571 Scan through \var{string} looking for a location where this regular
572 expression produces a match, and return a
573 corresponding \class{MatchObject} instance. Return \code{None} if no
574 position in the string matches the pattern; note that this is
575 different from finding a zero-length match at some point in the string.
577 The optional \var{pos} and \var{endpos} parameters have the same
578 meaning as for the \method{match()} method.
579 \end{methoddesc}
581 \begin{methoddesc}[RegexObject]{match}{string\optional{, pos\optional{,
582 endpos}}}
583 If zero or more characters at the beginning of \var{string} match
584 this regular expression, return a corresponding
585 \class{MatchObject} instance. Return \code{None} if the string does not
586 match the pattern; note that this is different from a zero-length
587 match.
589 \strong{Note:} If you want to locate a match anywhere in
590 \var{string}, use \method{search()} instead.
592 The optional second parameter \var{pos} gives an index in the string
593 where the search is to start; it defaults to \code{0}. This is not
594 completely equivalent to slicing the string; the \code{'\^'} pattern
595 character matches at the real beginning of the string and at positions
596 just after a newline, but not necessarily at the index where the search
597 is to start.
599 The optional parameter \var{endpos} limits how far the string will
600 be searched; it will be as if the string is \var{endpos} characters
601 long, so only the characters from \var{pos} to \var{endpos} will be
602 searched for a match.
603 \end{methoddesc}
605 \begin{methoddesc}[RegexObject]{split}{string\optional{,
606 maxsplit\code{ = 0}}}
607 Identical to the \function{split()} function, using the compiled pattern.
608 \end{methoddesc}
610 \begin{methoddesc}[RegexObject]{findall}{string}
611 Identical to the \function{findall()} function, using the compiled pattern.
612 \end{methoddesc}
614 \begin{methoddesc}[RegexObject]{sub}{repl, string\optional{, count\code{ = 0}}}
615 Identical to the \function{sub()} function, using the compiled pattern.
616 \end{methoddesc}
618 \begin{methoddesc}[RegexObject]{subn}{repl, string\optional{,
619 count\code{ = 0}}}
620 Identical to the \function{subn()} function, using the compiled pattern.
621 \end{methoddesc}
624 \begin{memberdesc}[RegexObject]{flags}
625 The flags argument used when the RE object was compiled, or
626 \code{0} if no flags were provided.
627 \end{memberdesc}
629 \begin{memberdesc}[RegexObject]{groupindex}
630 A dictionary mapping any symbolic group names defined by
631 \regexp{(?P<\var{id}>)} to group numbers. The dictionary is empty if no
632 symbolic groups were used in the pattern.
633 \end{memberdesc}
635 \begin{memberdesc}[RegexObject]{pattern}
636 The pattern string from which the RE object was compiled.
637 \end{memberdesc}
640 \subsection{Match Objects \label{match-objects}}
642 \class{MatchObject} instances support the following methods and attributes:
644 \begin{methoddesc}[MatchObject]{expand}{template}
645 Return the string obtained by doing backslash substitution on the
646 template string \var{template}, as done by the \method{sub()} method.
647 Escapes such as \samp{\e n} are converted to the appropriate
648 characters, and numeric backreferences (\samp{\e 1}, \samp{\e 2}) and named
649 backreferences (\samp{\e g<1>}, \samp{\e g<name>}) are replaced by the contents of the
650 corresponding group.
651 \end{methoddesc}
653 \begin{methoddesc}[MatchObject]{group}{\optional{group1, \moreargs}}
654 Returns one or more subgroups of the match. If there is a single
655 argument, the result is a single string; if there are
656 multiple arguments, the result is a tuple with one item per argument.
657 Without arguments, \var{group1} defaults to zero (i.e. the whole match
658 is returned).
659 If a \var{groupN} argument is zero, the corresponding return value is the
660 entire matching string; if it is in the inclusive range [1..99], it is
661 the string matching the corresponding parenthesized group. If a
662 group number is negative or larger than the number of groups defined
663 in the pattern, an \exception{IndexError} exception is raised.
664 If a group is contained in a part of the pattern that did not match,
665 the corresponding result is \code{None}. If a group is contained in a
666 part of the pattern that matched multiple times, the last match is
667 returned.
669 If the regular expression uses the \regexp{(?P<\var{name}>...)} syntax,
670 the \var{groupN} arguments may also be strings identifying groups by
671 their group name. If a string argument is not used as a group name in
672 the pattern, an \exception{IndexError} exception is raised.
674 A moderately complicated example:
676 \begin{verbatim}
677 m = re.match(r"(?P<int>\d+)\.(\d*)", '3.14')
678 \end{verbatim}
680 After performing this match, \code{m.group(1)} is \code{'3'}, as is
681 \code{m.group('int')}, and \code{m.group(2)} is \code{'14'}.
682 \end{methoddesc}
684 \begin{methoddesc}[MatchObject]{groups}{\optional{default}}
685 Return a tuple containing all the subgroups of the match, from 1 up to
686 however many groups are in the pattern. The \var{default} argument is
687 used for groups that did not participate in the match; it defaults to
688 \code{None}. (Incompatibility note: in the original Python 1.5
689 release, if the tuple was one element long, a string would be returned
690 instead. In later versions (from 1.5.1 on), a singleton tuple is
691 returned in such cases.)
692 \end{methoddesc}
694 \begin{methoddesc}[MatchObject]{groupdict}{\optional{default}}
695 Return a dictionary containing all the \emph{named} subgroups of the
696 match, keyed by the subgroup name. The \var{default} argument is
697 used for groups that did not participate in the match; it defaults to
698 \code{None}.
699 \end{methoddesc}
701 \begin{methoddesc}[MatchObject]{start}{\optional{group}}
702 \funcline{end}{\optional{group}}
703 Return the indices of the start and end of the substring
704 matched by \var{group}; \var{group} defaults to zero (meaning the whole
705 matched substring).
706 Return \code{-1} if \var{group} exists but
707 did not contribute to the match. For a match object
708 \var{m}, and a group \var{g} that did contribute to the match, the
709 substring matched by group \var{g} (equivalent to
710 \code{\var{m}.group(\var{g})}) is
712 \begin{verbatim}
713 m.string[m.start(g):m.end(g)]
714 \end{verbatim}
716 Note that
717 \code{m.start(\var{group})} will equal \code{m.end(\var{group})} if
718 \var{group} matched a null string. For example, after \code{\var{m} =
719 re.search('b(c?)', 'cba')}, \code{\var{m}.start(0)} is 1,
720 \code{\var{m}.end(0)} is 2, \code{\var{m}.start(1)} and
721 \code{\var{m}.end(1)} are both 2, and \code{\var{m}.start(2)} raises
722 an \exception{IndexError} exception.
723 \end{methoddesc}
725 \begin{methoddesc}[MatchObject]{span}{\optional{group}}
726 For \class{MatchObject} \var{m}, return the 2-tuple
727 \code{(\var{m}.start(\var{group}), \var{m}.end(\var{group}))}.
728 Note that if \var{group} did not contribute to the match, this is
729 \code{(-1, -1)}. Again, \var{group} defaults to zero.
730 \end{methoddesc}
732 \begin{memberdesc}[MatchObject]{pos}
733 The value of \var{pos} which was passed to the
734 \method{search()} or \method{match()} method of the \class{RegexObject}. This is the index
735 into the string at which the RE engine started looking for a match.
736 \end{memberdesc}
738 \begin{memberdesc}[MatchObject]{endpos}
739 The value of \var{endpos} which was passed to the
740 \method{search()} or \method{match()} method of the \class{RegexObject}. This is the index
741 into the string beyond which the RE engine will not go.
742 \end{memberdesc}
744 \begin{memberdesc}[MatchObject]{lastgroup}
745 The name of the last matched capturing group, or \code{None} if the
746 group didn't have a name, or if no group was matched at all.
747 \end{memberdesc}
749 \begin{memberdesc}[MatchObject]{lastindex}
750 The integer index of the last matched capturing group, or \code{None}
751 if no group was matched at all.
752 \end{memberdesc}
754 \begin{memberdesc}[MatchObject]{re}
755 The regular expression object whose \method{match()} or
756 \method{search()} method produced this \class{MatchObject} instance.
757 \end{memberdesc}
759 \begin{memberdesc}[MatchObject]{string}
760 The string passed to \function{match()} or \function{search()}.
761 \end{memberdesc}