add htmldecode, trimString, import missing chardet
This commit is contained in:
parent
d54e0e5b27
commit
f0927aea2e
3 changed files with 34 additions and 1 deletions
|
@ -6,6 +6,8 @@ import sha
|
|||
import time
|
||||
import urlparse
|
||||
|
||||
import chardet
|
||||
|
||||
import net
|
||||
from net import DEFAULT_HEADERS
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
# GPL written 2008 by j@pad.ma
|
||||
import re
|
||||
import string
|
||||
from htmlentitydefs import name2codepoint
|
||||
|
||||
|
||||
# Configuration for urlize() function
|
||||
|
@ -116,6 +117,30 @@ def cleanHtml(text):
|
|||
text = trailing_empty_content_re.sub('', text)
|
||||
return text
|
||||
|
||||
# This pattern matches a character entity reference (a decimal numeric
|
||||
# reference, a hexadecimal numeric reference, or a named reference).
|
||||
# group(1) captures the reference body without the leading '&' and without
# the trailing ';' (which the pattern treats as optional).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
|
||||
|
||||
def htmldecode(text):
    """Decode HTML character references in the given text.

    Every match of the module-level ``charrefpat`` pattern is replaced:
    decimal (``&#38;``) and hexadecimal (``&#x26;``) numeric references
    become the corresponding character, and named references are looked
    up in ``name2codepoint``. Unknown names and numeric references whose
    code point cannot be represented are left in the text unchanged.

    ``text`` is coerced with ``unicode()`` when it is not already a
    unicode object, so the result is always unicode.
    """
    # Coerce once up front; everything below can then rely on unichr().
    if not isinstance(text, unicode):
        text = unicode(text)

    def entitydecode(match):
        entity = match.group(1)
        try:
            if entity.startswith('#x'):
                return unichr(int(entity[2:], 16))
            if entity.startswith('#'):
                return unichr(int(entity[1:]))
        except (ValueError, OverflowError):
            # unichr() rejects code points outside its supported range;
            # keep the malformed reference verbatim instead of aborting
            # the whole substitution.
            return match.group(0)
        if entity in name2codepoint:
            return unichr(name2codepoint[entity])
        # Not a known entity name: leave the original text alone.
        return match.group(0)

    return charrefpat.sub(entitydecode, text)
|
||||
|
||||
def highlight(text, query, hlClass="hl"):
|
||||
if query:
|
||||
text = text.replace('<br />', '|')
|
||||
|
|
|
@ -47,6 +47,12 @@ def truncateString(s, num):
|
|||
ts += "..."
|
||||
return ts
|
||||
|
||||
def trimString(string, num):
    """Shorten ``string`` to at most ``num`` characters, keeping its tail.

    Strings of length ``num`` or less are returned unchanged. Longer ones
    are cut in the middle: the first ``num - 13`` characters, then
    ``'...'``, then the last 10 characters. When ``num`` is too small to
    fit that layout the kept head and tail shrink so the result never
    exceeds ``num`` characters; for ``num`` of 3 or less the string is
    simply cut to ``num`` characters (empty for ``num <= 0``).
    """
    if len(string) <= num:
        return string

    ellipsis = '...'
    if num <= len(ellipsis):
        # No room for the ellipsis plus any content: hard cut.
        return string[:max(num, 0)]

    # Keep up to 10 trailing characters, then give what is left to the head.
    # Clamping avoids the negative slice index that previously made the
    # result longer than the input when num < 13.
    tail_len = min(10, num - len(ellipsis))
    head_len = num - len(ellipsis) - tail_len
    return string[:head_len] + ellipsis + string[len(string) - tail_len:]
|
||||
|
||||
def truncateWords(s, num):
|
||||
"Truncates a string after a certain number of words."
|
||||
length = int(num)
|
||||
|
|
Loading…
Reference in a new issue