157 lines
4.2 KiB
Python
157 lines
4.2 KiB
Python
# -*- Mode: Python; -*-
|
|
# -*- coding: utf-8 -*-
|
|
# vi:si:et:sw=2:sts=2:ts=2
|
|
"""
|
|
scrape tools
|
|
"""
|
|
|
|
import re
|
|
import time
|
|
import urllib
|
|
import urllib2
|
|
|
|
import djangohtml
|
|
|
|
|
|
# Default headers for HTTP requests.
|
|
# The User-Agent string imitates an old Internet Explorer release.
# This dict is used as the default ``headers`` argument of the URL helpers
# in this module, so the same object is shared by every call: do not
# mutate it in place.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5)'}
|
|
|
|
# --------------------------------------------------------------------
|
|
# Functions
|
|
# --------------------------------------------------------------------
|
|
|
|
def quote_plus(s):
|
|
"""
|
|
A variant of urllib.quote_plus which handles ASCII and Unicode.
|
|
"""
|
|
return urllib.quote_plus(s.encode('utf-8'))
|
|
|
|
|
|
def _charset_from_content_type(ctype, default='latin-1'):
  """Return the charset named in a Content-Type header value, or default."""
  if not ctype:
    # Header missing altogether.
    return default
  # Parameter names are case-insensitive, and so are Python codec names,
  # so it is safe to work on the lowercased header.
  parts = ctype.lower().split('charset=')
  if len(parts) < 2:
    return default
  # Drop any following parameters and surrounding quotes/whitespace,
  # e.g. 'text/html; charset="utf-8"; foo=bar' -> 'utf-8'.
  charset = parts[1].split(';')[0].strip().strip('"\'')
  return charset or default


def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read the contents of the given str URL and return them as unicode.

  Here headers is a map of str -> str for HTTP request headers. The
  body is decoded with the charset announced in the response's
  Content-Type header; if the header is missing, names no charset, or
  names a codec Python does not know, latin-1 is used instead.

  blocking is accepted for interface compatibility only: it is ignored
  and the call always blocks until the whole page has been read.

  Errors raised by urllib2 while opening or reading the URL, and
  UnicodeDecodeError for undecodable data, propagate to the caller.
  """
  req = urllib2.Request(url, None, headers)
  f = urllib2.urlopen(req)
  try:
    data = f.read()
    # Fetch the header while the response is still open.
    ctype = f.headers.getheader('content-type')
  finally:
    # Release the connection even if read() fails.
    f.close()
  charset = _charset_from_content_type(ctype)
  try:
    return unicode(data, charset)
  except LookupError:
    # The server announced a codec name Python does not know.
    return unicode(data, 'latin-1')
|
|
|
|
def open_url(url, headers=DEFAULT_HEADERS):
  """
  Open the given str URL and return the response object from urllib2.

  Spaces in url are replaced by %20 before the request is built. Here
  headers is a map of str -> str for HTTP request headers. The caller
  is responsible for closing the returned object.
  """
  request = urllib2.Request(url.replace(' ', '%20'), None, headers)
  return urllib2.urlopen(request)
|
|
|
|
def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read the raw str contents of the given str URL.

  Here headers is a map of str -> str for HTTP request headers. The
  URL is opened through open_url, which takes care of replacing spaces
  by %20. The undecoded response body is returned.

  blocking is accepted for interface compatibility only: it is ignored
  and the call always blocks until the whole page has been read.

  Errors raised by urllib2 while opening or reading the URL propagate
  to the caller.
  """
  f = open_url(url, headers)
  try:
    return f.read()
  finally:
    # Release the connection even if read() fails.
    f.close()
|
|
|
|
|
|
def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Open the given str URL and return the URL after redirection.

  Here headers is a map of str -> str for HTTP request headers. Any
  '&src=rss' fragment is removed from the resulting URL. This is a
  best-effort lookup: if the request cannot be built or opened, the
  original url is returned unchanged instead of raising.

  blocking is accepted for interface compatibility only and is ignored.
  """
  try:
    req = urllib2.Request(url, None, headers)
    f = urllib2.urlopen(req)
    try:
      return f.url.replace('&src=rss', '')
    finally:
      # Only the final URL is needed; do not leave the response open.
      f.close()
  except Exception:
    # Deliberately broad so the lookup stays best-effort, but unlike a
    # bare except this lets KeyboardInterrupt and SystemExit through.
    return url
|
|
|
|
|
|
def fix_url(url):
  """
  Return the actual URL for url, a str that may carry redirect wrapping.

  No trimming is performed at present: url is handed back exactly as
  it was given.
  """
  # Disabled redirect-trimming logic, kept for reference:
  # if url.lower().find('http%3a//') > 0:
  #   return 'http://' + url[url.lower().rindex('http%3a//')+9:]
  # if url.find('http://') > 0:
  #   return url[url.rindex('http://'):]
  return url
|
|
|
|
|
|
# Matches one HTML character reference. Exactly one capture group is set
# per match: group 1 is an entity name (e.g. "amp"), group 2 is the digits
# of a decimal reference, group 3 is the digits of a hexadecimal reference
# (after "x" or "X"). The trailing ";" is optional.
_html_entity_re = re.compile(r'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(?:([0-9]+)|[xX]([0-9a-fA-F]+)));?')
|
|
import htmlentitydefs
|
|
|
|
def html_entity_decode(s, encoding = 'utf-8'):
  """
  Return s as unicode with its HTML character references resolved.

  s may be a byte str, which is first decoded using encoding, or a
  unicode object, which is used as it is. Named (&amp;), decimal
  (&#38;) and hexadecimal (&#x26;) references are replaced by the
  character they stand for. A reference that cannot be resolved (an
  unknown name, or a number that is not a valid code point) is left in
  the output unchanged.

  Raises UnicodeDecodeError if a byte str cannot be decoded with
  encoding.
  """
  if not isinstance(s, unicode):
    # Decode once up front rather than piecewise around each reference.
    s = s.decode(encoding)

  def resolve(mo):
    name, decimal, hexadecimal = mo.groups()
    try:
      if name is not None:
        codepoint = htmlentitydefs.name2codepoint[name]
      elif decimal is not None:
        codepoint = int(decimal)
      else:
        codepoint = int(hexadecimal, 16)
      return unichr(codepoint)
    except (KeyError, ValueError, OverflowError):
      # KeyError: unknown entity name. ValueError/OverflowError: number
      # outside the range unichr() accepts. Keep the original text.
      return mo.group(0)

  return _html_entity_re.sub(resolve, s)
|
|
|
|
def stripTags(s):
  """
  Return s with HTML entities decoded, tags removed and outer
  whitespace stripped.

  A false value for s (None, empty string) yields u''. Entities are
  decoded with htmldecode before djangohtml.strip_tags is applied.
  """
  if not s:
    return u''
  decoded = htmldecode(s)
  return djangohtml.strip_tags(decoded).strip()
|
|
|
|
from htmlentitydefs import name2codepoint
|
|
|
|
# This pattern matches a character entity reference (a decimal numeric
|
|
# reference, a hexadecimal numeric reference, or a named reference).
|
|
# group(1) is the body of the reference without the leading "&" and the
# optional trailing ";", e.g. "#38", "#x26" or "amp". group(2) is set only
# for numeric references and holds the part after "#".
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
|
|
|
|
def htmldecode(text):
  """
  Decode HTML entities in the given text and return the result as unicode.

  text that is not already unicode is converted with unicode(text), so
  a byte str must be plain ASCII (anything else raises
  UnicodeDecodeError). Decimal (&#38;), hexadecimal (&#x26;) and named
  (&amp;) references are replaced by their character. A reference that
  cannot be resolved (an unknown name, or a number that is not a valid
  code point) is left in the output unchanged.
  """
  if not isinstance(text, unicode):
    text = unicode(text)

  def entitydecode(match):
    entity = match.group(1)
    try:
      if entity.startswith('#x'):
        return unichr(int(entity[2:], 16))
      if entity.startswith('#'):
        return unichr(int(entity[1:]))
    except (ValueError, OverflowError):
      # Number outside the range unichr() accepts: keep the original text
      # instead of aborting the whole decode.
      return match.group(0)
    if entity in name2codepoint:
      return unichr(name2codepoint[entity])
    return match.group(0)

  return charrefpat.sub(entitydecode, text)
|
|
|