cleanup
This commit is contained in:
parent
f6b5d6bde8
commit
ddecba1d1f
1 changed files with 16 additions and 12 deletions
26
ox/html.py
26
ox/html.py
|
@ -6,7 +6,7 @@ import string
|
||||||
from htmlentitydefs import name2codepoint
|
from htmlentitydefs import name2codepoint
|
||||||
|
|
||||||
|
|
||||||
# Configuration for urlize() function
|
# Configuration for add_links() function
|
||||||
LEADING_PUNCTUATION = ['(', '<', '&lt;']
|
LEADING_PUNCTUATION = ['(', '<', '&lt;']
|
||||||
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;', "'", '"']
|
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;', "'", '"']
|
||||||
|
|
||||||
|
@ -45,15 +45,17 @@ def linebreaks(value):
|
||||||
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
|
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
|
||||||
return '\n\n'.join(paras)
|
return '\n\n'.join(paras)
|
||||||
|
|
||||||
def stripTags(value):
|
def strip_tags(value):
|
||||||
"""
|
"""
|
||||||
Returns the given HTML with all tags stripped
|
Returns the given HTML with all tags stripped
|
||||||
|
|
||||||
>>> stripTags('some <h2>title</h2> <script>asdfasdf</script>')
|
>>> strip_tags('some <h2>title</h2> <script>asdfasdf</script>')
|
||||||
'some title asdfasdf'
|
'some title asdfasdf'
|
||||||
"""
|
"""
|
||||||
return re.sub(r'<[^>]*?>', '', value)
|
return re.sub(r'<[^>]*?>', '', value)
|
||||||
|
|
||||||
|
stripTags = strip_tags
|
||||||
|
|
||||||
def stripSpacesBetweenTags(value):
|
def stripSpacesBetweenTags(value):
|
||||||
"Returns the given HTML with spaces between tags normalized to a single space"
|
"Returns the given HTML with spaces between tags normalized to a single space"
|
||||||
return re.sub(r'>\s+<', '> <', value)
|
return re.sub(r'>\s+<', '> <', value)
|
||||||
|
@ -66,7 +68,7 @@ def fixAmpersands(value):
|
||||||
"Returns the given HTML with all unencoded ampersands encoded correctly"
|
"Returns the given HTML with all unencoded ampersands encoded correctly"
|
||||||
return unencoded_ampersands_re.sub('&amp;', value)
|
return unencoded_ampersands_re.sub('&amp;', value)
|
||||||
|
|
||||||
def urlize(text, trim_url_limit=None, nofollow=False):
|
def add_links(text, trim_url_limit=None, nofollow=False):
|
||||||
"""
|
"""
|
||||||
Converts any URLs in text into clickable links. Works on http://, https:// and
|
Converts any URLs in text into clickable links. Works on http://, https:// and
|
||||||
www. links. Links can have trailing punctuation (periods, commas, close-parens)
|
www. links. Links can have trailing punctuation (periods, commas, close-parens)
|
||||||
|
@ -97,7 +99,9 @@ def urlize(text, trim_url_limit=None, nofollow=False):
|
||||||
words[i] = lead + middle + trail
|
words[i] = lead + middle + trail
|
||||||
return ''.join(words)
|
return ''.join(words)
|
||||||
|
|
||||||
def cleanHtml(text):
|
urlize = add_links
|
||||||
|
|
||||||
|
def clean_html(text):
|
||||||
"""
|
"""
|
||||||
Cleans the given HTML. Specifically, it does the following:
|
Cleans the given HTML. Specifically, it does the following:
|
||||||
* Converts <b> and <i> to <strong> and <em>.
|
* Converts <b> and <i> to <strong> and <em>.
|
||||||
|
@ -133,13 +137,13 @@ def cleanHtml(text):
|
||||||
# references, a hexadecimal numeric reference, or a named reference).
|
# references, a hexadecimal numeric reference, or a named reference).
|
||||||
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
|
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
|
||||||
|
|
||||||
def decodeHtml(html):
|
def decode_html(html):
|
||||||
"""
|
"""
|
||||||
>>> decodeHtml('me &amp; you and &#36;&#38;%')
|
>>> decode_html('me &amp; you and &#36;&#38;%')
|
||||||
u'me & you and $&%'
|
u'me & you and $&%'
|
||||||
>>> decodeHtml('€')
|
>>> decode_html('€')
|
||||||
u'€'
|
u'€'
|
||||||
>>> decodeHtml('Anniversary of Daoud&apos;s Republic')
|
>>> decode_html('Anniversary of Daoud&apos;s Republic')
|
||||||
u'Anniversary of Daoud's Republic'
|
u'Anniversary of Daoud's Republic'
|
||||||
"""
|
"""
|
||||||
if type(html) != unicode:
|
if type(html) != unicode:
|
||||||
|
@ -164,7 +168,7 @@ def decodeHtml(html):
|
||||||
return match.group(0)
|
return match.group(0)
|
||||||
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
|
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
|
||||||
|
|
||||||
decode_html = decodeHtml
|
decodeHtml = decode_html
|
||||||
|
|
||||||
def highlight(text, query, hlClass="hl"):
|
def highlight(text, query, hlClass="hl"):
|
||||||
"""
|
"""
|
||||||
|
@ -280,7 +284,7 @@ def sanitize_html(html, tags=None, wikilinks=False):
|
||||||
for i in range(0, len(matches)):
|
for i in range(0, len(matches)):
|
||||||
html = html.replace('\t%d\t'%(i+1), matches[i])
|
html = html.replace('\t%d\t'%(i+1), matches[i])
|
||||||
html = html.replace('\n\n', '<br/><br/>')
|
html = html.replace('\n\n', '<br/><br/>')
|
||||||
html = urlize(html)
|
html = add_links(html)
|
||||||
return sanitize_fragment(html)
|
return sanitize_fragment(html)
|
||||||
|
|
||||||
def sanitize_fragment(html):
|
def sanitize_fragment(html):
|
||||||
|
|
Loading…
Reference in a new issue