Lib/textwrap.py

   1 """Text wrapping and filling.
   2 """
   3
   4 # Copyright (C) 1999-2001 Gregory P. Ward.
   5 # Copyright (C) 2002 Python Software Foundation.
   6 # Written by Greg Ward <gward@python.net>
   7
   8 # XXX currently this module does not work very well with Unicode
   9 # strings.  See http://www.python.org/sf/622831 for updates.
  10
# Revision identifier; "$Id$" looks like a version-control keyword
# placeholder -- presumably expanded on checkout (confirm).
__revision__ = "$Id$"

import string, re

# Public names of this module, i.e. what "from textwrap import *" exports.
__all__ = ['TextWrapper', 'wrap', 'fill']
  16
  17 # Hardcode the recognized whitespace characters to the US-ASCII
  18 # whitespace characters.  The main reason for doing this is that in
  19 # ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
  20 # that character winds up in string.whitespace.  Respecting
  21 # string.whitespace in those cases would 1) make textwrap treat 0xa0 the
  22 # same as any other whitespace char, which is clearly wrong (it's a
  23 # *non-breaking* space), 2) possibly cause problems with Unicode,
  24 # since 0xa0 is not in range(128).
  25 _whitespace = '\t\n\x0b\x0c\r '
  26
  27 class TextWrapper:
  28     """
  29     Object for wrapping/filling text.  The public interface consists of
  30     the wrap() and fill() methods; the other methods are just there for
  31     subclasses to override in order to tweak the default behaviour.
  32     If you want to completely replace the main wrapping algorithm,
  33     you'll probably have to override _wrap_chunks().
  34
  35     Several instance attributes control various aspects of wrapping:
  36       width (default: 70)
  37         the maximum width of wrapped lines (unless break_long_words
  38         is false)
  39       initial_indent (default: "")
  40         string that will be prepended to the first line of wrapped
  41         output.  Counts towards the line's width.
  42       subsequent_indent (default: "")
  43         string that will be prepended to all lines save the first
  44         of wrapped output; also counts towards each line's width.
  45       expand_tabs (default: true)
  46         Expand tabs in input text to spaces before further processing.
  47         Each tab will become 1 .. 8 spaces, depending on its position in
  48         its line.  If false, each tab is treated as a single character.
  49       replace_whitespace (default: true)
  50         Replace all whitespace characters in the input text by spaces
  51         after tab expansion.  Note that if expand_tabs is false and
  52         replace_whitespace is true, every tab will be converted to a
  53         single space!
  54       fix_sentence_endings (default: false)
  55         Ensure that sentence-ending punctuation is always followed
  56         by two spaces.  Off by default because the algorithm is
  57         (unavoidably) imperfect.
  58       break_long_words (default: true)
  59         Break words longer than 'width'.  If false, those words will not
  60         be broken, and some lines might be longer than 'width'.
  61     """
  62
  63     whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))
  64
  65     unicode_whitespace_trans = {}
  66     uspace = ord(u' ')
  67     for x in map(ord, _whitespace):
  68         unicode_whitespace_trans[x] = uspace
  69
  70     # This funky little regex is just the trick for splitting
  71     # text up into word-wrappable chunks.  E.g.
  72     #   "Hello there -- you goof-ball, use the -b option!"
  73     # splits into
  74     #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
  75     # (after stripping out empty strings).
  76     wordsep_re = re.compile(r'(\s+|'                  # any whitespace
  77                             r'-*\w{2,}-(?=\w{2,})|'   # hyphenated words
  78                             r'(?<=\S)-{2,}(?=\w))')   # em-dash
  79
  80     # XXX will there be a locale-or-charset-aware version of
  81     # string.lowercase in 2.3?
  82     sentence_end_re = re.compile(r'[%s]'              # lowercase letter
  83                                  r'[\.\!\?]'          # sentence-ending punct.
  84                                  r'[\"\']?'           # optional end-of-quote
  85                                  % string.lowercase)
  86
  87
  88     def __init__ (self,
  89                   width=70,
  90                   initial_indent="",
  91                   subsequent_indent="",
  92                   expand_tabs=True,
  93                   replace_whitespace=True,
  94                   fix_sentence_endings=False,
  95                   break_long_words=True):
  96         self.width = width
  97         self.initial_indent = initial_indent
  98         self.subsequent_indent = subsequent_indent
  99         self.expand_tabs = expand_tabs
 100         self.replace_whitespace = replace_whitespace
 101         self.fix_sentence_endings = fix_sentence_endings
 102         self.break_long_words = break_long_words
 103
 104
 105     # -- Private methods -----------------------------------------------
 106     # (possibly useful for subclasses to override)
 107
 108     def _munge_whitespace(self, text):
 109         """_munge_whitespace(text : string) -> string
 110
 111         Munge whitespace in text: expand tabs and convert all other
 112         whitespace characters to spaces.  Eg. " foo\tbar\n\nbaz"
 113         becomes " foo    bar  baz".
 114         """
 115         if self.expand_tabs:
 116             text = text.expandtabs()
 117         if self.replace_whitespace:
 118             if isinstance(text, str):
 119                 text = text.translate(self.whitespace_trans)
 120             elif isinstance(text, unicode):
 121                 text = text.translate(self.unicode_whitespace_trans)
 122         return text
 123
 124
 125     def _split(self, text):
 126         """_split(text : string) -> [string]
 127
 128         Split the text to wrap into indivisible chunks.  Chunks are
 129         not quite the same as words; see wrap_chunks() for full
 130         details.  As an example, the text
 131           Look, goof-ball -- use the -b option!
 132         breaks into the following chunks:
 133           'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
 134           'use', ' ', 'the', ' ', '-b', ' ', 'option!'
 135         """
 136         chunks = self.wordsep_re.split(text)
 137         chunks = filter(None, chunks)
 138         return chunks
 139
 140     def _fix_sentence_endings(self, chunks):
 141         """_fix_sentence_endings(chunks : [string])
 142
 143         Correct for sentence endings buried in 'chunks'.  Eg. when the
 144         original text contains "... foo.\nBar ...", munge_whitespace()
 145         and split() will convert that to [..., "foo.", " ", "Bar", ...]
 146         which has one too few spaces; this method simply changes the one
 147         space to two.
 148         """
 149         i = 0
 150         pat = self.sentence_end_re
 151         while i < len(chunks)-1:
 152             if chunks[i+1] == " " and pat.search(chunks[i]):
 153                 chunks[i+1] = "  "
 154                 i += 2
 155             else:
 156                 i += 1
 157
 158     def _handle_long_word(self, chunks, cur_line, cur_len, width):
 159         """_handle_long_word(chunks : [string],
 160                              cur_line : [string],
 161                              cur_len : int, width : int)
 162
 163         Handle a chunk of text (most likely a word, not whitespace) that
 164         is too long to fit in any line.
 165         """
 166         space_left = width - cur_len
 167
 168         # If we're allowed to break long words, then do so: put as much
 169         # of the next chunk onto the current line as will fit.
 170         if self.break_long_words:
 171             cur_line.append(chunks[0][0:space_left])
 172             chunks[0] = chunks[0][space_left:]
 173
 174         # Otherwise, we have to preserve the long word intact.  Only add
 175         # it to the current line if there's nothing already there --
 176         # that minimizes how much we violate the width constraint.
 177         elif not cur_line:
 178             cur_line.append(chunks.pop(0))
 179
 180         # If we're not allowed to break long words, and there's already
 181         # text on the current line, do nothing.  Next time through the
 182         # main loop of _wrap_chunks(), we'll wind up here again, but
 183         # cur_len will be zero, so the next line will be entirely
 184         # devoted to the long word that we can't handle right now.
 185
 186     def _wrap_chunks(self, chunks):
 187         """_wrap_chunks(chunks : [string]) -> [string]
 188
 189         Wrap a sequence of text chunks and return a list of lines of
 190         length 'self.width' or less.  (If 'break_long_words' is false,
 191         some lines may be longer than this.)  Chunks correspond roughly
 192         to words and the whitespace between them: each chunk is
 193         indivisible (modulo 'break_long_words'), but a line break can
 194         come between any two chunks.  Chunks should not have internal
 195         whitespace; ie. a chunk is either all whitespace or a "word".
 196         Whitespace chunks will be removed from the beginning and end of
 197         lines, but apart from that whitespace is preserved.
 198         """
 199         lines = []
 200
 201         while chunks:
 202
 203             # Start the list of chunks that will make up the current line.
 204             # cur_len is just the length of all the chunks in cur_line.
 205             cur_line = []
 206             cur_len = 0
 207
 208             # Figure out which static string will prefix this line.
 209             if lines:
 210                 indent = self.subsequent_indent
 211             else:
 212                 indent = self.initial_indent
 213
 214             # Maximum width for this line.
 215             width = self.width - len(indent)
 216
 217             # First chunk on line is whitespace -- drop it, unless this
 218             # is the very beginning of the text (ie. no lines started yet).
 219             if chunks[0].strip() == '' and lines:
 220                 del chunks[0]
 221
 222             while chunks:
 223                 l = len(chunks[0])
 224
 225                 # Can at least squeeze this chunk onto the current line.
 226                 if cur_len + l <= width:
 227                     cur_line.append(chunks.pop(0))
 228                     cur_len += l
 229
 230                 # Nope, this line is full.
 231                 else:
 232                     break
 233
 234             # The current line is full, and the next chunk is too big to
 235             # fit on *any* line (not just this one).
 236             if chunks and len(chunks[0]) > width:
 237                 self._handle_long_word(chunks, cur_line, cur_len, width)
 238
 239             # If the last chunk on this line is all whitespace, drop it.
 240             if cur_line and cur_line[-1].strip() == '':
 241                 del cur_line[-1]
 242
 243             # Convert current line back to a string and store it in list
 244             # of all lines (return value).
 245             if cur_line:
 246                 lines.append(indent + ''.join(cur_line))
 247
 248         return lines
 249
 250
 251     # -- Public interface ----------------------------------------------
 252
 253     def wrap(self, text):
 254         """wrap(text : string) -> [string]
 255
 256         Reformat the single paragraph in 'text' so it fits in lines of
 257         no more than 'self.width' columns, and return a list of wrapped
 258         lines.  Tabs in 'text' are expanded with string.expandtabs(),
 259         and all other whitespace characters (including newline) are
 260         converted to space.
 261         """
 262         text = self._munge_whitespace(text)
 263         indent = self.initial_indent
 264         if len(text) + len(indent) <= self.width:
 265             return [indent + text]
 266         chunks = self._split(text)
 267         if self.fix_sentence_endings:
 268             self._fix_sentence_endings(chunks)
 269         return self._wrap_chunks(chunks)
 270
 271     def fill(self, text):
 272         """fill(text : string) -> string
 273
 274         Reformat the single paragraph in 'text' to fit in lines of no
 275         more than 'self.width' columns, and return a new string
 276         containing the entire wrapped paragraph.
 277         """
 278         return "\n".join(self.wrap(text))
 279
 280
 281 # -- Convenience interface ---------------------------------------------
 282
 283 def wrap(text, width=70, **kwargs):
 284     """Wrap a single paragraph of text, returning a list of wrapped lines.
 285
 286     Reformat the single paragraph in 'text' so it fits in lines of no
 287     more than 'width' columns, and return a list of wrapped lines.  By
 288     default, tabs in 'text' are expanded with string.expandtabs(), and
 289     all other whitespace characters (including newline) are converted to
 290     space.  See TextWrapper class for available keyword args to customize
 291     wrapping behaviour.
 292     """
 293     w = TextWrapper(width=width, **kwargs)
 294     return w.wrap(text)
 295
 296 def fill(text, width=70, **kwargs):
 297     """Fill a single paragraph of text, returning a new string.
 298
 299     Reformat the single paragraph in 'text' to fit in lines of no more
 300     than 'width' columns, and return a new string containing the entire
 301     wrapped paragraph.  As with wrap(), tabs are expanded and other
 302     whitespace characters converted to space.  See TextWrapper class for
 303     available keyword args to customize wrapping behaviour.
 304     """
 305     w = TextWrapper(width=width, **kwargs)
 306     return w.fill(text)