add htmldecode, trimString, import missing chardet

2008-04-28 11:50:34 +02:00 · 2008-04-28 11:50:34 +02:00 · f0927aea2e
commit f0927aea2e
parent d54e0e5b27
3 changed files with 34 additions and 1 deletions
--- a/oxutils/cache.py
+++ b/oxutils/cache.py
@ -6,6 +6,8 @@ import sha
 import time
 import urlparse

+import chardet
+
 import net
 from net import DEFAULT_HEADERS

--- a/oxutils/html.py
+++ b/oxutils/html.py
@ -3,6 +3,7 @@
 # GPL written 2008 by j@pad.ma
 import re
 import string
+from htmlentitydefs import name2codepoint


 # Configuration for urlize() function
@ -116,6 +117,30 @@ def cleanHtml(text):
  text = trailing_empty_content_re.sub('', text)
  return text

+# This pattern matches a character entity reference (a decimal numeric
+# reference, a hexadecimal numeric reference, or a named reference); the trailing ';' is optional.
+charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')  # group(1) is the entity body without '&' and ';'; only a lowercase 'x' introduces hex
+
+def htmldecode(text):
+  """Return text as unicode with HTML character entity references decoded; unknown named entities are left unchanged."""
+  if type(text) != unicode:  # coerce any non-unicode input to unicode before substituting
+   text = unicode(text)[:]
+  if type(text) is unicode:  # NOTE(review): after the coercion above this looks always true, so the else branch appears unreachable - confirm
+    uchr = unichr
+  else:
+    uchr = lambda value: value > 255 and unichr(value) or chr(value)  # byte-string variant: chr() up to 255, unichr() above
+  def entitydecode(match, uchr=uchr):  # replacement callback for charrefpat.sub; uchr is bound as a default argument
+    entity = match.group(1)
+    if entity.startswith('#x'):  # hexadecimal numeric reference, e.g. '#x41'
+      return uchr(int(entity[2:], 16))  # NOTE(review): presumably raises for code points outside the supported range - confirm
+    elif entity.startswith('#'):  # decimal numeric reference, e.g. '#65'
+      return uchr(int(entity[1:]))
+    elif entity in name2codepoint:  # named reference known to htmlentitydefs
+      return uchr(name2codepoint[entity])
+    else:
+      return match.group(0)  # unknown name: keep the matched text exactly as it appeared
+  return charrefpat.sub(entitydecode, text)
+
 def highlight(text, query, hlClass="hl"):
  if query:
    text = text.replace('<br />', '|')
--- a/oxutils/text.py
+++ b/oxutils/text.py
@ -46,7 +46,13 @@ def truncateString(s, num):
  if words:
    ts += "..."
  return ts
-    
+
+def trimString(string, num):
+  "Shortens a string longer than num characters to its first num - 13 and last 10 characters, joined by '...'."
+  if len(string) > num:  # strings of at most num characters are returned unchanged
+    string = string[:num - 13] + '...' + string[-10:]  # NOTE(review): for num < 13 the head slice index is negative, so the result is not capped at num characters - confirm the intended minimum
+  return string
+
 def truncateWords(s, num):
  "Truncates a string after a certain number of words."
  length = int(num)