# scrapeit/scrapeit/utils.py
# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""
screape tools
"""
import re
import time
import urllib
import urllib2
import djangohtml
# Default headers for HTTP requests: presents a desktop Firefox 2 / Ubuntu
# user-agent string instead of the urllib2 default.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.6) Gecko/20061201 Firefox/2.0.0.6 (Ubuntu-feisty)'}
# --------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------
def quote_plus(s):
  """
  Percent-encode *s* for use in a query string, like urllib.quote_plus,
  but accepting unicode as well as ASCII str (encoded as UTF-8 first).
  """
  utf8_bytes = s.encode('utf-8')
  return urllib.quote_plus(utf8_bytes)
def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read the contents of the given str URL and return them as unicode.

  headers is a map of str -> str for HTTP request headers. The charset is
  taken from the response's Content-Type header, falling back to latin-1
  when the server declares none. NOTE: despite the name, the decode uses
  whatever charset the server declares, not necessarily UTF-8.

  The blocking parameter is accepted for interface compatibility only;
  a non-blocking mode is not implemented.
  """
  req = urllib2.Request(url, None, headers)
  f = urllib2.urlopen(req)
  try:
    data = f.read()
    # Read the header while the response is definitely valid; guard
    # against a missing Content-Type (getheader returns None).
    ctype = f.headers.getheader('content-type') or ''
  finally:
    # Always release the connection, even if read() raises.
    f.close()
  parts = ctype.split('charset=')
  if len(parts) > 1:
    # Trim any trailing parameters ("utf-8; foo=bar" -> "utf-8") so the
    # codec lookup below does not fail on a perfectly valid header.
    charset = parts[1].split(';')[0].strip()
  else:
    charset = 'latin-1'
  return unicode(data, charset)
def open_url(url, headers=DEFAULT_HEADERS):
  """
  Open *url* with the given request headers and return the response
  file-like object. Spaces in the URL are escaped as %20 first.
  """
  escaped = url.replace(' ', '%20')
  request = urllib2.Request(escaped, None, headers)
  return urllib2.urlopen(request)
def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read and return the str contents of the given str URL.

  headers is a map of str -> str for HTTP request headers. The blocking
  parameter is accepted for interface compatibility only; a non-blocking
  mode is not implemented.
  """
  # open_url already escapes spaces, so no need to pre-escape here.
  f = open_url(url, headers)
  try:
    data = f.read()
  finally:
    # Always release the connection, even if read() raises.
    f.close()
  return data
def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Open the given str URL and return the final URL after redirection,
  with any '&src=rss' tracking suffix removed.

  Best-effort: on any error the original URL is returned unchanged.
  """
  rurl = url
  try:
    req = urllib2.Request(url, None, headers)
    rurl = urllib2.urlopen(req).url
    rurl = rurl.replace('&src=rss', '')
  except Exception:
    # Narrowed from a bare except: keep the best-effort fallback but do
    # not swallow KeyboardInterrupt/SystemExit.
    rurl = url
  return rurl
def fix_url(url):
  """
  Given url str, trim redirect stuff and return actual URL.

  Currently a no-op: the historical redirect-stripping logic is kept
  below, disabled, for reference.
  """
  # if url.lower().find('http%3a//') > 0:
  #   return 'http://' + url[url.lower().rindex('http%3a//')+9:]
  # if url.find('http://') > 0:
  #   return url[url.rindex('http://'):]
  return url
# Matches one HTML entity reference: group 1 = named entity ("amp"),
# group 2 = decimal character reference, group 3 = hexadecimal character
# reference. The trailing ';' is optional.
_html_entity_re = re.compile(r'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(?:([0-9]+)|[xX]([0-9a-fA-F]+)));?')
import htmlentitydefs
def html_entity_decode(s, encoding = 'utf-8'):
  """
  Replace HTML entity references in byte string *s* with the characters
  they name and return the result as unicode.

  Handles named entities (&amp;), decimal (&#38;) and hexadecimal
  (&#x26;) references; the text between entities is decoded from
  *encoding*. Unknown named entities are left in the output verbatim.
  """
  r = []  # accumulated unicode fragments
  p = 0   # index of the first byte not yet consumed
  mo = _html_entity_re.search(s, p)
  while mo:
    # Literal text preceding the entity, decoded from *encoding*.
    r.append(s[p:mo.start()].decode(encoding))
    # lastindex tells us which alternative of _html_entity_re matched.
    i = mo.lastindex
    e = mo.group(i)
    try:
      if i == 1:
        # Named entity, e.g. 'amp' -> 38; may raise KeyError.
        c = htmlentitydefs.name2codepoint[e]
      elif i == 2:
        # Decimal character reference.
        c = int(e)
      elif i == 3:
        # Hexadecimal character reference.
        c = int(e, 16)
      else:
        assert 0
      r.append(unichr(c))
    except KeyError:
      # Unknown entity name: keep the original '&...' text verbatim.
      r.append(mo.group(0))
    p = mo.end()
    mo = _html_entity_re.search(s, p)
  # Trailing literal text after the last entity.
  r.append(s[p:].decode(encoding))
  return u''.join(r)
def stripTags(s):
  """
  Return *s* with HTML entities decoded and all tags removed, stripped
  of surrounding whitespace. Empty or None input yields u''.
  """
  if not s:
    return u''
  decoded = htmldecode(s)
  return djangohtml.strip_tags(decoded).strip()
# snake_case alias for PEP 8-style callers.
strip_tags = stripTags
from htmlentitydefs import name2codepoint
# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
# NOTE(review): only the lowercase '&#x..' hex form is matched here,
# unlike _html_entity_re above which also accepts '&#X..' — confirm
# whether that asymmetry is intentional.
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def htmldecode(text):
  """
  Decode HTML character references (&name;, &#nnn;, &#xhh;) in *text*
  and return the result as unicode.

  Unknown entity names are left in the output untouched.
  """
  # Coerce to unicode up front. After this, text is always an exact
  # unicode instance, so the original's `type(text) is unicode` branch
  # was always true and its bytes-path `uchr` lambda was dead code —
  # both removed, along with a pointless `[:]` copy.
  if type(text) != unicode:
    text = unicode(text)
  def entitydecode(match):
    entity = match.group(1)
    if entity.startswith('#x'):
      # Hexadecimal character reference.
      return unichr(int(entity[2:], 16))
    elif entity.startswith('#'):
      # Decimal character reference.
      return unichr(int(entity[1:]))
    elif entity in name2codepoint:
      # Named entity, e.g. 'amp'.
      return unichr(name2codepoint[entity])
    else:
      # Not a recognized entity: keep the original text verbatim.
      return match.group(0)
  return charrefpat.sub(entitydecode, text)