add htmldecode, trimString, import missing chardet
This commit is contained in:
parent
d54e0e5b27
commit
f0927aea2e
3 changed files with 34 additions and 1 deletions
|
@ -6,6 +6,8 @@ import sha
|
||||||
import time
|
import time
|
||||||
import urlparse
|
import urlparse
|
||||||
|
|
||||||
|
import chardet
|
||||||
|
|
||||||
import net
|
import net
|
||||||
from net import DEFAULT_HEADERS
|
from net import DEFAULT_HEADERS
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
# GPL written 2008 by j@pad.ma
|
# GPL written 2008 by j@pad.ma
|
||||||
import re
|
import re
|
||||||
import string
|
import string
|
||||||
|
from htmlentitydefs import name2codepoint
|
||||||
|
|
||||||
|
|
||||||
# Configuration for urlize() function
|
# Configuration for urlize() function
|
||||||
|
@ -116,6 +117,30 @@ def cleanHtml(text):
|
||||||
text = trailing_empty_content_re.sub('', text)
|
text = trailing_empty_content_re.sub('', text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
# This pattern matches a character entity reference (a decimal numeric
|
||||||
|
# reference, a hexadecimal numeric reference, or a named reference).
|
||||||
|
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
|
||||||
|
|
||||||
|
def htmldecode(text):
    """Decode HTML character references in the given text.

    Numeric references (decimal, e.g. ``&#38;``, or hexadecimal, e.g.
    ``&#x26;``) and named references listed in ``name2codepoint`` (e.g.
    ``&amp;``) are replaced by the character they denote.  Unknown names,
    and numeric references whose code point ``unichr`` rejects, are left
    in the text unchanged instead of aborting the whole decode.

    text -- the text to decode.  Anything that is not already a unicode
            string is converted with ``unicode(text)`` first; that
            conversion raises UnicodeDecodeError for byte strings the
            interpreter's default encoding cannot decode.

    Returns the decoded text as a unicode string.
    """
    if not isinstance(text, unicode):
        text = unicode(text)

    def entitydecode(match):
        # group(1) is the reference without the leading '&' and the
        # optional trailing ';'.
        entity = match.group(1)
        try:
            if entity.startswith('#x'):
                return unichr(int(entity[2:], 16))
            if entity.startswith('#'):
                return unichr(int(entity[1:]))
            if entity in name2codepoint:
                return unichr(name2codepoint[entity])
        except (ValueError, OverflowError):
            # The code point is outside the range unichr() supports
            # (e.g. "&#99999999;"); fall through and keep the raw text.
            pass
        return match.group(0)

    return charrefpat.sub(entitydecode, text)
|
||||||
|
|
||||||
def highlight(text, query, hlClass="hl"):
|
def highlight(text, query, hlClass="hl"):
|
||||||
if query:
|
if query:
|
||||||
text = text.replace('<br />', '|')
|
text = text.replace('<br />', '|')
|
||||||
|
|
|
@ -46,7 +46,13 @@ def truncateString(s, num):
|
||||||
if words:
|
if words:
|
||||||
ts += "..."
|
ts += "..."
|
||||||
return ts
|
return ts
|
||||||
|
|
||||||
|
def trimString(string, num):
    """Shorten a string to at most num characters, keeping its last 10.

    A string longer than num is cut in the middle: the first num - 13
    characters are kept, followed by '...' and the final 10 characters,
    so the result is exactly num characters long.

    When num is smaller than 13 there is no room for the ellipsis plus
    the 10-character tail, so the string is simply cut to its first num
    characters (a negative num yields an empty string).  Without this
    guard the negative slice index would count from the end of the
    string and the result could be longer than the input.

    string -- the text to shorten
    num    -- maximum length of the result

    Returns the string unchanged if it already fits.
    """
    if len(string) <= num:
        return string
    if num < 13:
        return string[:max(num, 0)]
    return string[:num - 13] + '...' + string[-10:]
|
||||||
|
|
||||||
def truncateWords(s, num):
|
def truncateWords(s, num):
|
||||||
"Truncates a string after a certain number of words."
|
"Truncates a string after a certain number of words."
|
||||||
length = int(num)
|
length = int(num)
|
||||||
|
|
Loading…
Reference in a new issue