add htmldecode, trimString, import missing chardet

This commit is contained in:
j 2008-04-28 11:50:34 +02:00
parent d54e0e5b27
commit f0927aea2e
3 changed files with 34 additions and 1 deletions

View file

@ -6,6 +6,8 @@ import sha
import time
import urlparse
import chardet
import net
from net import DEFAULT_HEADERS

View file

@ -3,6 +3,7 @@
# GPL written 2008 by j@pad.ma
import re
import string
from htmlentitydefs import name2codepoint
# Configuration for urlize() function
@ -116,6 +117,30 @@ def cleanHtml(text):
text = trailing_empty_content_re.sub('', text)
return text
# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference).
# The trailing semicolon is optional, so "&amp" is matched as well as "&amp;".
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')


def htmldecode(text):
    """Decode HTML entities in the given text.

    Numeric references (``&#65;``, ``&#x41;``) and named references listed
    in ``name2codepoint`` (``&amp;``, ``&eacute;``) are replaced by the
    character they denote.  Anything that cannot be decoded -- an unknown
    entity name or a numeric reference outside the range the interpreter
    can represent -- is left in the output exactly as it was written.

    ``text`` is converted to a unicode string first if it is not one
    already; the return value is always a unicode string.
    """
    # Resolve the text type and code-point converter once.  The fallback
    # keeps the function usable where ``unicode``/``unichr`` do not exist.
    try:
        text_type, to_char = unicode, unichr
    except NameError:
        text_type, to_char = str, chr

    if not isinstance(text, text_type):
        text = text_type(text)

    def entitydecode(match):
        entity = match.group(1)
        if entity.startswith('#x'):
            codepoint = int(entity[2:], 16)
        elif entity.startswith('#'):
            codepoint = int(entity[1:])
        else:
            # Unknown names yield None and are passed through untouched.
            codepoint = name2codepoint.get(entity)
        if codepoint is None:
            return match.group(0)
        try:
            return to_char(codepoint)
        except (ValueError, OverflowError):
            # Out-of-range reference such as "&#99999999;": keep the
            # original text instead of aborting the whole decode.
            return match.group(0)

    return charrefpat.sub(entitydecode, text)
def highlight(text, query, hlClass="hl"):
if query:
text = text.replace('<br />', '|')

View file

@ -47,6 +47,12 @@ def truncateString(s, num):
ts += "..."
return ts
def trimString(string, num):
    """Shorten a string to at most ``num`` characters, keeping its tail.

    Strings that already fit are returned unchanged.  Longer strings are
    cut in the middle: the first ``num - 13`` characters, then ``...``,
    then the last 10 characters.  When ``num`` is too small for that
    layout the tail is shortened so the result still fits; if there is
    not even room for the ellipsis, the string is simply cut to ``num``
    characters.
    """
    if len(string) <= num:
        return string

    ellipsis = '...'
    tail_len = 10
    head_len = num - len(ellipsis) - tail_len
    if head_len >= 0:
        return string[:head_len] + ellipsis + string[-tail_len:]

    # A negative head length would make the slice count from the end of
    # the string and produce a result longer than ``num``; shrink the
    # tail instead.
    tail_len = num - len(ellipsis)
    if tail_len > 0:
        return ellipsis + string[-tail_len:]
    return string[:max(num, 0)]
def truncateWords(s, num):
"Truncates a string after a certain number of words."
length = int(num)