From f0927aea2ecd72d4404a1e1631c87d03127cf8a4 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 28 Apr 2008 11:50:34 +0200 Subject: [PATCH] add htmldecode, trimString, import missing chardet --- oxutils/cache.py | 2 ++ oxutils/html.py | 25 +++++++++++++++++++++++++ oxutils/text.py | 8 +++++++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/oxutils/cache.py b/oxutils/cache.py index 8ca9bf5..170641d 100644 --- a/oxutils/cache.py +++ b/oxutils/cache.py @@ -6,6 +6,8 @@ import sha import time import urlparse +import chardet + import net from net import DEFAULT_HEADERS diff --git a/oxutils/html.py b/oxutils/html.py index aaddbf6..23635fe 100644 --- a/oxutils/html.py +++ b/oxutils/html.py @@ -3,6 +3,7 @@ # GPL written 2008 by j@pad.ma import re import string +from htmlentitydefs import name2codepoint # Configuration for urlize() function @@ -116,6 +117,30 @@ def cleanHtml(text): text = trailing_empty_content_re.sub('', text) return text +# This pattern matches a character entity reference (a decimal numeric +# reference, a hexadecimal numeric reference, or a named reference). +charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?') + +def htmldecode(text): + """Decode HTML entities in the given text.""" + if type(text) != unicode: + text = unicode(text)[:] + if type(text) is unicode: + uchr = unichr + else: + uchr = lambda value: value > 255 and unichr(value) or chr(value) + def entitydecode(match, uchr=uchr): + entity = match.group(1) + if entity.startswith('#x'): + return uchr(int(entity[2:], 16)) + elif entity.startswith('#'): + return uchr(int(entity[1:])) + elif entity in name2codepoint: + return uchr(name2codepoint[entity]) + else: + return match.group(0) + return charrefpat.sub(entitydecode, text) + def highlight(text, query, hlClass="hl"): if query: text = text.replace('
', '|') diff --git a/oxutils/text.py b/oxutils/text.py index f26837f..79a553a 100644 --- a/oxutils/text.py +++ b/oxutils/text.py @@ -46,7 +46,13 @@ def truncateString(s, num): if words: ts += "..." return ts - + +def trimString(string, num): + "Truncates a string after a certain number of characters, adding ... at -10 characters" + if len(string) > num: + string = string[:num - 13] + '...' + string[-10:] + return string + def truncateWords(s, num): "Truncates a string after a certain number of words." length = int(num)