
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""Scrape tools."""
import re
import time
import urllib
import urllib2
import djangohtml
# Default headers for HTTP requests: a browser-style User-Agent that the
# URL helpers in this module pass to urllib2.Request.
DEFAULT_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.2) '
                'Gecko/20061201 Firefox/2.0.0.2 (Ubuntu-feisty)',
}
# --------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------
def quote_plus(s):
  """A variant of urllib.quote_plus which handles ASCII and Unicode.

  Unicode text is encoded as UTF-8 before quoting.  Byte strings are
  quoted as they are: calling .encode() on a byte string would first
  decode it as ASCII and fail on any non-ASCII byte.

  Returns the quoted string, with spaces replaced by '+'.
  """
  try:
    from urllib import quote_plus as _quote_plus
  except ImportError:
    # Interpreters where the function lives in urllib.parse instead.
    from urllib.parse import quote_plus as _quote_plus
  if isinstance(s, type(u'')):
    s = s.encode('utf-8')
  return _quote_plus(s)
def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
  """Read the contents of the given str URL and return them as unicode.

  headers is a map of str -> str for HTTP request headers.  The body is
  decoded with the charset named in the Content-Type response header;
  latin-1 is used when the header is absent, names no charset, or names
  a charset Python does not know.

  blocking is accepted for interface compatibility only: this function
  always reads the whole page before returning.
  """
  f = open_url(url, headers)
  try:
    data = f.read()
    # getheader() returns None when the server sent no Content-Type.
    ctype = f.headers.getheader('content-type') or ''
  finally:
    f.close()
  charset = 'latin-1'
  parts = ctype.split('charset=')
  if len(parts) > 1:
    # Drop trailing parameters and quotes, e.g. charset="utf-8"; foo=bar
    charset = parts[1].split(';')[0].strip().strip('"\'') or charset
  try:
    return unicode(data, charset)
  except LookupError:
    # Unknown charset name: fall back to the same default as above.
    return unicode(data, 'latin-1')
def open_url(url, headers=DEFAULT_HEADERS):
  """Open the given str URL and return the object from urllib2.urlopen.

  Spaces in url are replaced by '%20' before the request is built.
  headers is a map of str -> str for HTTP request headers.  The caller
  is responsible for closing the returned object.
  """
  req = urllib2.Request(url.replace(' ', '%20'), None, headers)
  return urllib2.urlopen(req)
def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """Read the str contents of the given str URL.

  headers is a map of str -> str for HTTP request headers.  The page is
  fetched through open_url(), which also escapes spaces in the URL, and
  the raw, undecoded contents are returned.

  blocking is accepted for interface compatibility only: this function
  always reads the whole page before returning and never returns an
  iterator.
  """
  f = open_url(url, headers)
  try:
    return f.read()
  finally:
    # Release the connection even if the read fails.
    f.close()
def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """Open the given str URL and return the URL after redirection.

  Any '&src=rss' in the final URL is removed.  headers is a map of
  str -> str for HTTP request headers.  If the request fails for any
  reason, the original url is returned unchanged.

  blocking is accepted for interface compatibility and is not used.
  """
  try:
    f = urllib2.urlopen(urllib2.Request(url, None, headers))
    try:
      return f.url.replace('&src=rss', '')
    finally:
      f.close()
  except Exception:
    # Best effort: an unreachable or invalid URL is handed back as given.
    return url
def fix_url(url):
  """Given url str, trim redirect stuff and return actual URL.

  Currently this just returns the URL unmodified; the trimming logic
  below is disabled.
  """
  # if url.lower().find('http%3a//') > 0:
  #   return 'http://' + url[url.lower().rindex('http%3a//')+9:]
  # if url.find('http://') > 0:
  #   return url[url.rindex('http://'):]
  return url
# Matches one HTML entity reference; the trailing ';' is optional.
# Group 1 is a named entity (e.g. 'amp'), group 2 a decimal code point,
# group 3 a hexadecimal code point (after '#x' or '#X').
_html_entity_re = re.compile(r'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(?:([0-9]+)|[xX]([0-9a-fA-F]+)));?')
import htmlentitydefs
def html_entity_decode(s, encoding='utf-8'):
  """Return s as unicode with HTML entity references replaced.

  s may be a byte string, which is decoded using encoding, or unicode,
  which is used as is.  Named, decimal and hexadecimal references found
  by _html_entity_re are replaced by the character they denote.
  References with an unknown name or a code point that unichr() rejects
  are left in the output unchanged.
  """
  if not isinstance(s, unicode):
    s = s.decode(encoding)
  r = []
  p = 0
  for mo in _html_entity_re.finditer(s):
    # Text between the previous reference and this one.
    r.append(s[p:mo.start()])
    i = mo.lastindex
    e = mo.group(i)
    try:
      if i == 1:
        c = htmlentitydefs.name2codepoint[e]
      elif i == 2:
        c = int(e)
      elif i == 3:
        c = int(e, 16)
      else:
        # The pattern has exactly three groups and one always matches.
        raise AssertionError('unexpected match group %r' % (i,))
      r.append(unichr(c))
    except (KeyError, ValueError, OverflowError):
      # Unknown entity name or code point out of range: keep the text.
      r.append(mo.group(0))
    p = mo.end()
  r.append(s[p:])
  return u''.join(r)
def stripTags(s):
  """Return s entity-decoded, passed through djangohtml.strip_tags, and
  stripped of surrounding whitespace.

  A false value (None, empty string) yields u''.
  """
  if not s:
    return u''
  return djangohtml.strip_tags(htmldecode(s)).strip()
from htmlentitydefs import name2codepoint
# This pattern matches a character entity reference (a decimal numeric
# reference, a hexadecimal numeric reference, or a named reference); the
# trailing ';' is optional.  Group 1 is the reference without the leading
# '&' and the ';', e.g. '#38', '#x26' or 'amp'.
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def htmldecode(text):
  """Decode HTML entities in the given text.

  text is converted with unicode() when it is not already unicode, so
  the result is always unicode.  Decimal, hexadecimal and named
  references matched by charrefpat are replaced by their character.
  Unknown names, and code points that unichr() rejects, are left in
  the output unchanged.
  """
  if not isinstance(text, unicode):
    text = unicode(text)

  def entitydecode(match):
    entity = match.group(1)
    try:
      if entity.startswith('#x'):
        return unichr(int(entity[2:], 16))
      if entity.startswith('#'):
        return unichr(int(entity[1:]))
      if entity in name2codepoint:
        return unichr(name2codepoint[entity])
    except (ValueError, OverflowError):
      # Code point outside the range unichr() supports: keep the text.
      pass
    return match.group(0)

  return charrefpat.sub(entitydecode, text)