"""A cleanup tool for HTML.
|
|
|
|
Removes unwanted tags and content. See the `Cleaner` class for
|
|
details.
|
|
"""
|
|
|
|
import re
|
|
import copy
|
|
try:
|
|
from urlparse import urlsplit
|
|
except ImportError:
|
|
# Python 3
|
|
from urllib.parse import urlsplit
|
|
from lxml import etree
|
|
from lxml.html import defs
|
|
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
|
|
from lxml.html import xhtml_to_html, _transform_result
|
|
|
|
try:
|
|
unichr
|
|
except NameError:
|
|
# Python 3
|
|
unichr = chr
|
|
try:
|
|
unicode
|
|
except NameError:
|
|
# Python 3
|
|
unicode = str
|
|
try:
|
|
bytes
|
|
except NameError:
|
|
# Python < 2.6
|
|
bytes = str
|
|
try:
|
|
basestring
|
|
except NameError:
|
|
basestring = (str, bytes)
|
|
|
|
|
|
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
|
|
'word_break', 'word_break_html']
|
|
|
|
# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
|
|
# Particularly the CSS cleaning; most of the tag cleaning is integrated now
|
|
# I have multiple kinds of schemes searched; but should schemes be
|
|
# whitelisted instead?
|
|
# max height?
|
|
# remove images? Also in CSS? background attribute?
|
|
# Some way to whitelist object, iframe, etc (e.g., if you want to
|
|
# allow *just* embedded YouTube movies)
|
|
# Log what was deleted and why?
|
|
# style="behavior: ..." might be bad in IE?
|
|
# Should we have something for just <meta http-equiv>? That's the worst of the
|
|
# metas.
|
|
# UTF-7 detections? Example:
|
|
# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
|
|
# you don't always have to have the charset set, if the page has no charset
|
|
# and there's UTF7-like code in it.
|
|
# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
|
|
|
|
|
|
# This is an IE-specific construct you can have in a stylesheet to
|
|
# run some Javascript:
|
|
_css_javascript_re = re.compile(
|
|
r'expression\s*\(.*?\)', re.S|re.I)
|
|
|
|
# Do I have to worry about @\nimport?
|
|
_css_import_re = re.compile(
|
|
r'@\s*import', re.I)
|
|
|
|
# All kinds of schemes besides just javascript: that can cause
|
|
# execution:
|
|
_is_javascript_scheme = re.compile(
|
|
r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
|
|
re.I).search
|
|
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
|
|
# FIXME: should data: be blocked?
|
|
|
|
# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
|
|
_conditional_comment_re = re.compile(
|
|
r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
|
|
|
|
_find_styled_elements = etree.XPath(
|
|
"descendant-or-self::*[@style]")
|
|
|
|
_find_external_links = etree.XPath(
|
|
("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
|
|
"descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
|
|
namespaces={'x':XHTML_NAMESPACE})
|
|
|
|
class Cleaner(object):
|
|
"""
|
|
Instances cleans the document of each of the possible offending
|
|
elements. The cleaning is controlled by attributes; you can
|
|
override attributes in a subclass, or set them in the constructor.
|
|
|
|
``scripts``:
|
|
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
        as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove. Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill. Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control. Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

    def __init__(self, **kw):
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    "Unknown parameter: %s=%r" % (name, value))
            setattr(self, name, value)

    # Used to lookup the primary URL for a given tag that is up for
    # removal:
    _tag_link_attrs = dict(
        script='src',
        link='href',
        # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
        # From what I can tell, both attributes can contain a link:
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        # FIXME: there doesn't really seem like a general way to figure out what
        # links an <object> tag uses; links often go in <param> tags with values
        # that we don't really know. You'd have to have knowledge about specific
        # kinds of plugins (probably keyed off classid), and match against those.
        ##object=?,
        # FIXME: not looking at the action currently, because it is more complex
        # than that -- if you keep the form, you should keep the form controls.
        ##form='action',
        a='href',
        )

    def __call__(self, doc):
"""
|
|
Cleans the document.
|
|
"""
|
|
if hasattr(doc, 'getroot'):
|
|
# ElementTree instance, instead of an element
|
|
doc = doc.getroot()
|
|
# convert XHTML to HTML
|
|
xhtml_to_html(doc)
|
|
# Normalize a case that IE treats <image> like <img>, and that
|
|
# can confuse either this step or later steps.
|
|
for el in doc.iter('image'):
|
|
el.tag = 'img'
|
|
if not self.comments:
|
|
# Of course, if we were going to kill comments anyway, we don't
|
|
# need to worry about this
|
|
self.kill_conditional_comments(doc)
|
|
|
|
kill_tags = set(self.kill_tags or ())
|
|
remove_tags = set(self.remove_tags or ())
|
|
allow_tags = set(self.allow_tags or ())
|
|
|
|
if self.scripts:
|
|
kill_tags.add('script')
|
|
if self.safe_attrs_only:
|
|
safe_attrs = set(self.safe_attrs)
|
|
for el in doc.iter():
|
|
attrib = el.attrib
|
|
for aname in attrib.keys():
|
|
if aname not in safe_attrs:
|
|
del attrib[aname]
|
|
if self.javascript:
|
|
if not (self.safe_attrs_only and
|
|
self.safe_attrs == defs.safe_attrs):
|
|
# safe_attrs handles events attributes itself
|
|
for el in doc.iter():
|
|
attrib = el.attrib
|
|
for aname in attrib.keys():
|
|
if aname.startswith('on'):
|
|
del attrib[aname]
|
|
doc.rewrite_links(self._remove_javascript_link,
|
|
resolve_base_href=False)
|
|
if not self.style:
|
|
# If we're deleting style then we don't have to remove JS links
|
|
# from styles, otherwise...
|
|
for el in _find_styled_elements(doc):
|
|
old = el.get('style')
|
|
new = _css_javascript_re.sub('', old)
|
|
new = _css_import_re.sub('', new)
|
|
if self._has_sneaky_javascript(new):
|
|
# Something tricky is going on...
|
|
del el.attrib['style']
|
|
elif new != old:
|
|
el.set('style', new)
|
|
for el in list(doc.iter('style')):
|
|
if el.get('type', '').lower().strip() == 'text/javascript':
|
|
el.drop_tree()
|
|
continue
|
|
old = el.text or ''
|
|
new = _css_javascript_re.sub('', old)
|
|
                    # The imported CSS can do anything; we just can't allow @import:
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new
        if self.comments or self.processing_instructions:
            # FIXME: why either? I feel like there's some obscure reason
            # because you can put PIs in comments...? But I've already
            # forgotten it
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)
        if self.style:
            kill_tags.add('style')
            etree.strip_attributes(doc, 'style')
        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # We must get rid of included stylesheets if Javascript is not
            # allowed, as you can put Javascript in them
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # Note this kills alternate stylesheets as well
                    if not self.allow_element(el):
                        el.drop_tree()
        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            # FIXME: is <layer> really embedded?
            # We should get rid of any <param> tags not inside <applet>;
            # These are not really valid anyway.
            for el in list(doc.iter('param')):
                found_parent = False
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate contents that are in an iframe are a good fallback:
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
        if self.frames:
            # FIXME: ideally we should look at the frame links, but
            # generally frames don't mix properly with an HTML
            # fragment anyway.
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # We have to drop the parent-most tag, which we can't
            # do. Instead we'll rewrite it:
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # We have to drop the parent-most element, which we can't
            # do. Instead we'll clear it:
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        _kill.reverse() # start with innermost tags
        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    "It does not make sense to pass in both allow_tags and remove_unknown_tags")
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            if bad:
                if bad[0] is doc:
                    el = bad.pop(0)
                    el.tag = 'div'
                    el.attrib.clear()
                for el in bad:
                    el.drop_tag()
        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    rel = el.get('rel')
                    if rel:
                        if ('nofollow' in rel
                                and ' nofollow ' in (' %s ' % rel)):
                            continue
                        rel = '%s nofollow' % rel
                    else:
                        rel = 'nofollow'
                    el.set('rel', rel)

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.
        """
        return False

    def allow_element(self, el):
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        if (self.whitelist_tags is not None
                and el.tag not in self.whitelist_tags):
            return False
        scheme, netloc, path, query, fragment = urlsplit(url)
        netloc = netloc.lower().split(':', 1)[0]
        if scheme not in ('http', 'https'):
            return False
        if netloc in self.host_whitelist:
            return True
        return False
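
    # Subclassing sketch (illustrative only, not part of the shipped API):
    # override allow_embedded_url() when host_whitelist cannot express the
    # rule you need, e.g. allowing any subdomain of a single trusted host:
    #
    #     class YouTubeCleaner(Cleaner):
    #         def allow_embedded_url(self, el, url):
    #             scheme, netloc = urlsplit(url)[:2]
    #             return (scheme in ('http', 'https') and
    #                     netloc.lower().endswith('.youtube.com'))
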
    def kill_conditional_comments(self, doc):
        """
        IE conditional comments basically embed HTML that the parser
        doesn't normally see. We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        """
        bad = []
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        bad = []
        for el in doc.iter(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    def _remove_javascript_link(self, link):
        # links like "j a v a s c r i p t:" might be interpreted in IE
        new = _substitute_whitespace('', link)
        if _is_javascript_scheme(new):
            # FIXME: should this be None to delete?
            return ''
        return link

    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``. This
        checks for attempts to do things like that.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.
        """
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)


clean = Cleaner()
clean_html = clean.clean_html
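
# Example usage (an illustrative sketch, kept as a comment so importing this
# module stays side-effect free; the HTML snippets and option values below
# are made up for illustration):
#
#     from lxml.html.clean import Cleaner, clean_html
#
#     clean_html('<p onmouseover="evil()">hi<script>evil()</script></p>')
#     # -> the onmouseover attribute and the <script> element are stripped
#
#     # A customised cleaner; any class attribute documented above can be
#     # overridden as a keyword argument:
#     cleaner = Cleaner(style=True, links=True, add_nofollow=True,
#                       kill_tags=['noscript'])
#     result = cleaner.clean_html(untrusted_html_string)
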
############################################################
## Autolinking
############################################################

_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
    ]

_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

_avoid_classes = ['nolink']


def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    It will search for links identified by the given regular
    expressions (by default mailto and http(s) links).

    It won't link text in an element in avoid_elements, or an element
    with a class in avoid_classes. It won't link to anything with a
    host that matches one of the regular expressions in avoid_hosts
    (by default localhost, 127.0.0.1 and the example.com/org/net hosts).

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.
    """
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_name = class_name.split()
        for match_class in avoid_classes:
            if match_class in class_name:
                return
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if child.tail:
            text, tail_children = _link_text(
                child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
            if tail_children:
                child.tail = text
                index = el.index(child)
                el[index+1:index+1] = tail_children
    if el.text:
        text, pre_children = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if pre_children:
            el.text = text
            el[:0] = pre_children


def _link_text(text, link_regexes, avoid_hosts, factory):
    leading_text = ''
    links = []
    last_pos = 0
    while 1:
        best_match, best_pos = None, None
        for regex in link_regexes:
            regex_pos = last_pos
            while 1:
                match = regex.search(text, pos=regex_pos)
                if match is None:
                    break
                host = match.group('host')
                for host_regex in avoid_hosts:
                    if host_regex.search(host):
                        regex_pos = match.end()
                        break
                else:
                    break
            if match is None:
                continue
            if best_pos is None or match.start() < best_pos:
                best_match = match
                best_pos = match.start()
        if best_match is None:
            # No more matches
            if links:
                assert not links[-1].tail
                links[-1].tail = text
            else:
                assert not leading_text
                leading_text = text
            break
        link = best_match.group(0)
        end = best_match.end()
        if link.endswith('.') or link.endswith(','):
            # These punctuation marks shouldn't end a link
            end -= 1
            link = link[:-1]
        prev_text = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = prev_text
        else:
            assert not leading_text
            leading_text = prev_text
        anchor = factory('a')
        anchor.set('href', link)
        body = best_match.group('body')
        if not body:
            body = link
        if body.endswith('.') or body.endswith(','):
            body = body[:-1]
        anchor.text = body
        links.append(anchor)
        text = text[end:]
    return leading_text, links


def autolink_html(html, *args, **kw):
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    autolink(doc, *args, **kw)
    return _transform_result(result_type, doc)


autolink_html.__doc__ = autolink.__doc__
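
# Example usage (an illustrative sketch; the HTML string is made up):
#
#     from lxml.html.clean import autolink_html
#
#     autolink_html('<p>Docs at http://lxml.de/lxmlhtml.html now</p>')
#     # -> '<p>Docs at <a href="http://lxml.de/lxmlhtml.html">...</a> now</p>'
#     # Hosts matching the default avoid_hosts (localhost, 127.0.0.1 and the
#     # example.com/org/net domains) are deliberately left unlinked.
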
############################################################
## Word wrapping
############################################################

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']


def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<pre>``, ``<textarea>`` and ``<code>``.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character. This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203; comes from:
    # http://www.cs.tut.fi/~jkorpela/html/nobr.html
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        dont_break = False
        class_name = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_name:
                dont_break = True
                break
        if dont_break:
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)


def word_break_html(html, *args, **kw):
    result_type = type(html)
    doc = fromstring(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)
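
# Example usage (an illustrative sketch; the string below is made up):
#
#     from lxml.html.clean import word_break_html
#
#     word_break_html('<p>%s</p>' % ('x' * 50))
#     # A zero-width space (&#8203;) is inserted into the 50-character run,
#     # which is longer than the default max_width of 40, so that browsers
#     # can wrap it; text inside <pre>, <textarea> and <code> is untouched.
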
def _break_text(text, max_width, break_character):
    words = text.split()
    for word in words:
        if len(word) > max_width:
            replacement = _insert_break(word, max_width, break_character)
            text = text.replace(word, replacement)
    return text


_break_prefer_re = re.compile(r'[^a-z]', re.I)


def _insert_break(word, width, break_character):
    orig_word = word
    result = ''
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Only walk back up to 10 characters to find a nice break:
            if last_break.end() > width-10:
                # FIXME: should the break character be at the end of the
                # chunk, or the beginning of the next chunk?
                start = word[:last_break.end()]
        result += start + break_character
        word = word[len(start):]
    result += word
    return result