"""Text wrapping and filling.
"""
# Copyright (C) 1999-2001 Gregory P. Ward.
# Copyright (C) 2002, 2003 Python Software Foundation.
# Written by Greg Ward <gward@python.net>
__revision__ = "$Id$" | |
import string, re | |
# Bind _unicode to the interpreter's unicode type when it has one.
try:
    unicode
except NameError:
    # No unicode type available (e.g. Python built without Unicode
    # support): supply a placeholder class so that
    # isinstance(text, _unicode) checks remain valid and simply never
    # match.
    class _unicode(object):
        pass
else:
    _unicode = unicode
# Do the right thing with boolean values for all known Python versions
# (so this module can be copied to projects that don't depend on Python
# 2.3, e.g. Optik and Docutils) by uncommenting the block of code below.
#try:
#    True, False
#except NameError:
#    (True, False) = (1, 0)
# Public names exported by a star-import of this module.
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent']
# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
# The characters are: tab, newline, vertical tab, form feed, carriage
# return and space.
_whitespace = '\t\n\x0b\x0c\r '
class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words.  If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
    """

    # Translation table for byte strings: maps every character in
    # _whitespace to a plain space.
    whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))

    # Equivalent table for unicode strings, keyed by code point.
    unicode_whitespace_trans = {}
    uspace = ord(u' ')
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # This less funky little regex just split on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(\s+)')

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % string.lowercase)

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True):
        """Store the wrapping options described in the class docstring."""
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens

        # recompile the regexes for Unicode mode -- done in this clumsy way for
        # backwards compatibility because it's rather common to monkey-patch
        # the TextWrapper class' wordsep_re attribute.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".

        Tab expansion only happens when 'expand_tabs' is true and the
        conversion to spaces only when 'replace_whitespace' is true.
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # Byte strings and unicode strings need different kinds of
            # translation table; any other type is returned untranslated.
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            elif isinstance(text, _unicode):
                text = text.translate(self.unicode_whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is True, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        # Pick the pattern matching both the string type (the *_uni
        # variants are compiled with re.U) and the hyphen-breaking mode.
        if isinstance(text, _unicode):
            if self.break_on_hyphens:
                pat = self.wordsep_re_uni
            else:
                pat = self.wordsep_simple_re_uni
        else:
            if self.break_on_hyphens:
                pat = self.wordsep_re
            else:
                pat = self.wordsep_simple_re
        # Splitting on a capturing pattern yields empty strings between
        # adjacent separators; drop them.  A real list is required because
        # _wrap_chunks() reverses the chunks in place.
        return [chunk for chunk in pat.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.

        'chunks' is modified in place; nothing is returned.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                # Widen the single space after a sentence end to two
                # spaces (previously this re-assigned a single space,
                # which made the whole method a no-op).
                chunks[i+1] = "  "
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.

        'reversed_chunks' is the stack of remaining chunks (next chunk
        last) and 'cur_line' the chunks collected for the current line;
        both may be modified in place.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            cur_line.append(reversed_chunks[-1][:space_left])
            reversed_chunks[-1] = reversed_chunks[-1][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        Raises ValueError if 'self.width' is not positive.  The 'chunks'
        list is consumed: it is reversed in place and emptied.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                l = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + l <= width:
                    cur_line.append(chunks.pop())
                    cur_len += l

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
# -- Convenience interface ---------------------------------------------
def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    The paragraph in 'text' is reformatted so that no line is longer
    than 'width' columns, and the resulting lines are returned as a
    list.  By default, tabs in 'text' are expanded with
    string.expandtabs(), and all other whitespace characters (including
    newline) are converted to space.  Any extra keyword arguments are
    passed on to the TextWrapper constructor; see that class for the
    options available to customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    The paragraph in 'text' is reformatted so that no line is longer
    than 'width' columns, and the whole wrapped paragraph is returned
    as one new string.  As with wrap(), tabs are expanded and other
    whitespace characters converted to space.  Any extra keyword
    arguments are passed on to the TextWrapper constructor; see that
    class for the options available to customize wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
# -- Loosely related functionality -------------------------------------

# Matches a line that consists solely of spaces and/or tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Captures the (possibly empty) run of spaces/tabs at the start of every
# line that contains at least one non-whitespace character.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.  (This behaviour is
    new in Python 2.5; older versions of this module incorrectly
    expanded tabs before searching for common leading whitespace.)

    Lines that contain only spaces and tabs are emptied and do not
    take part in determining the common margin.
    """
    # Blank out whitespace-only lines first so they cannot shrink the
    # margin, then collect the indent of every remaining non-empty line.
    text = _whitespace_only_re.sub('', text)

    # The margin is the longest string of spaces and tabs that every
    # indent starts with, i.e. the common prefix of all indents.
    margin = None
    for indent in _leading_whitespace_re.findall(text):
        if margin is None:
            margin = indent
        else:
            # Shrink the margin to the prefix it shares with this indent.
            # This covers the "deeper" and "shallower" cases as well as
            # indents that merely share a leading part (e.g. "  \t" and
            # "   \t"), which used to wipe out the margin entirely.
            shared = 0
            for kept, seen in zip(margin, indent):
                if kept != seen:
                    break
                shared += 1
            margin = margin[:shared]
        if not margin:
            # Nothing in common any more; later lines cannot change that.
            break

    if margin:
        # margin consists only of spaces and tabs, so it is safe to embed
        # in the pattern unescaped.
        text = re.sub(r'(?m)^' + margin, '', text)
    return text
if __name__ == "__main__":
    # Manual smoke test for dedent(); swap in one of the commented-out
    # samples below to try other inputs.
    #sample = "\tfoo\n\tbar"
    #sample = " \thello there\n \t how are you?"
    sample = "Hello there.\n This is indented."
    print(dedent(sample))