diff --git a/ox/html.py b/ox/html.py
index 6b0d48c..6950e01 100644
--- a/ox/html.py
+++ b/ox/html.py
@@ -6,7 +6,7 @@ import string
from htmlentitydefs import name2codepoint
-# Configuration for urlize() function
+# Configuration for add_links() function
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;', "'", '"']
@@ -45,15 +45,17 @@ def linebreaks(value):
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
return '\n\n'.join(paras)
-def stripTags(value):
+def strip_tags(value):
"""
Returns the given HTML with all tags stripped
- >>> stripTags('some <h2>title</h2> <script>asdfasdf</script>')
+ >>> strip_tags('some <h2>title</h2> <script>asdfasdf</script>')
'some title asdfasdf'
"""
return re.sub(r'<[^>]*?>', '', value)
-
+
+stripTags = strip_tags
+
def stripSpacesBetweenTags(value):
"Returns the given HTML with spaces between tags normalized to a single space"
return re.sub(r'>\s+<', '> <', value)
@@ -66,7 +68,7 @@ def fixAmpersands(value):
"Returns the given HTML with all unencoded ampersands encoded correctly"
return unencoded_ampersands_re.sub('&amp;', value)
-def urlize(text, trim_url_limit=None, nofollow=False):
+def add_links(text, trim_url_limit=None, nofollow=False):
"""
Converts any URLs in text into clickable links. Works on http://, https:// and
www. links. Links can have trailing punctuation (periods, commas, close-parens)
@@ -97,7 +99,9 @@ def urlize(text, trim_url_limit=None, nofollow=False):
words[i] = lead + middle + trail
return ''.join(words)
-def cleanHtml(text):
+urlize = add_links
+
+def clean_html(text):
"""
Cleans the given HTML. Specifically, it does the following:
* Converts <b> and <i> to <strong> and <em>.
@@ -133,13 +137,13 @@ def cleanHtml(text):
# references, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
-def decodeHtml(html):
+def decode_html(html):
"""
- >>> decodeHtml('me &amp; you and &#36;&#38;%')
+ >>> decode_html('me &amp; you and &#36;&#38;%')
u'me & you and $&%'
- >>> decodeHtml('&#x80;')
+ >>> decode_html('&#x80;')
u'€'
- >>> decodeHtml('Anniversary of Daoud&apos;s Republic')
+ >>> decode_html('Anniversary of Daoud&apos;s Republic')
u'Anniversary of Daoud's Republic'
"""
if type(html) != unicode:
@@ -164,7 +168,7 @@ def decodeHtml(html):
return match.group(0)
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
-decode_html = decodeHtml
+decodeHtml = decode_html
def highlight(text, query, hlClass="hl"):
"""
@@ -280,7 +284,7 @@ def sanitize_html(html, tags=None, wikilinks=False):
for i in range(0, len(matches)):
html = html.replace('\t%d\t'%(i+1), matches[i])
html = html.replace('\n\n', '<br/><br/>')
- html = urlize(html)
+ html = add_links(html)
return sanitize_fragment(html)
def sanitize_fragment(html):