# -*- Mode: Python; -*-
# -*- coding: utf-8 -*-
# vi:si:et:sw=2:sts=2:ts=2
"""
Scrape tools: helpers for fetching URLs and for decoding/stripping HTML text.

This module is written for Python 2 (it relies on urllib2, htmlentitydefs,
unicode and unichr).
"""
import htmlentitydefs
import re
import time
import urllib
import urllib2
from htmlentitydefs import name2codepoint

import djangohtml

# Default headers for HTTP requests.
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.6) Gecko/20061201 Firefox/2.0.0.6 (Ubuntu-feisty)'}

# Extracts the charset parameter from a Content-Type header value, e.g.
# 'text/html; charset="UTF-8"; foo=bar' -> 'UTF-8'.
_charset_re = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)', re.IGNORECASE)

# --------------------------------------------------------------------
# Functions
# --------------------------------------------------------------------

def quote_plus(s):
  """
  A variant of urllib.quote_plus which handles ASCII and Unicode.

  unicode objects are encoded as UTF-8 before quoting. Byte strings are
  quoted as they are, so already-encoded non-ASCII bytes no longer trigger
  an implicit ASCII decode error.
  """
  if isinstance(s, unicode):
    s = s.encode('utf-8')
  return urllib.quote_plus(s)

def _charset_from_content_type(ctype, default='latin-1'):
  """
  Return the charset named in the Content-Type header value ctype.

  Returns default when ctype is empty/None or carries no charset parameter.
  """
  if not ctype:
    return default
  mo = _charset_re.search(ctype)
  if mo is None:
    return default
  return mo.group(1)

def read_url_utf8(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read the contents of the given str URL and return them as unicode.

  Despite the name, the body is decoded with the charset announced in the
  response's Content-Type header; latin-1 is used when the header is
  missing, has no charset parameter, or names a codec Python does not know.

  Here headers is a map of str -> str for HTTP request headers.
  blocking is accepted for backward compatibility only; it is ignored and
  the call always blocks until the page has been read.

  Errors raised by urllib2 while opening or reading, and UnicodeDecodeError
  when the body does not match the announced charset, propagate.
  """
  f = open_url(url, headers)
  try:
    data = f.read()
    ctype = f.headers.getheader('content-type')
  finally:
    f.close()
  charset = _charset_from_content_type(ctype)
  try:
    return unicode(data, charset)
  except LookupError:
    # The server announced an encoding with no matching Python codec.
    return unicode(data, 'latin-1')

def open_url(url, headers=DEFAULT_HEADERS):
  """
  Open the given str URL and return the response object from
  urllib2.urlopen.

  Spaces in url are replaced with %20 before the request is built.
  Here headers is a map of str -> str for HTTP request headers.
  The caller is responsible for closing the returned object.
  """
  url = url.replace(' ', '%20')
  req = urllib2.Request(url, None, headers)
  return urllib2.urlopen(req)

def read_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Read the contents of the given str URL and return them undecoded, as
  read from the response.

  Here headers is a map of str -> str for HTTP request headers.
  blocking is accepted for backward compatibility only; it is ignored and
  the call always blocks until the page has been read.
  """
  # open_url already escapes spaces in the URL.
  f = open_url(url, headers)
  try:
    return f.read()
  finally:
    f.close()

def get_url(url, headers=DEFAULT_HEADERS, blocking=True):
  """
  Open the given str URL and return the URL after redirection, with any
  '&src=rss' removed.

  This is best effort: if the request fails for any reason the original
  url is returned unchanged. blocking is accepted for backward
  compatibility only and is ignored.
  """
  try:
    f = open_url(url, headers)
    try:
      rurl = f.geturl()
    finally:
      # Only the final URL is needed; do not leave the connection open.
      f.close()
    return rurl.replace('&src=rss', '')
  except Exception:
    # Deliberately broad (network, HTTP and URL-format errors all mean
    # "no better URL known"), but unlike a bare except this lets
    # KeyboardInterrupt and SystemExit through.
    return url

def fix_url(url):
  """
  Given url str, trim redirect stuff and return actual URL.

  Currently this just returns the URL unmodified.
  """
  # NOTE: a disabled heuristic used to live here: when 'http%3a//' or
  # 'http://' occurred again after the start of the URL, everything before
  # the last occurrence was dropped.
  return url

_html_entity_re = re.compile(r'&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(?:([0-9]+)|[xX]([0-9a-fA-F]+)));?')

def html_entity_decode(s, encoding='utf-8'):
  """
  Return s as unicode with HTML character references replaced.

  Named (&amp;), decimal (&#38;) and hexadecimal (&#x26;) references are
  handled; the trailing semicolon is optional. References with an unknown
  name or a code point unichr cannot represent are left as they are.

  s may be a byte string, which is decoded using encoding, or unicode,
  which is used directly.
  """
  if not isinstance(s, unicode):
    s = s.decode(encoding)

  def _replace(mo):
    name, decimal, hexadecimal = mo.groups()
    try:
      if name is not None:
        codepoint = htmlentitydefs.name2codepoint[name]
      elif decimal is not None:
        codepoint = int(decimal)
      else:
        codepoint = int(hexadecimal, 16)
      return unichr(codepoint)
    except (KeyError, ValueError, OverflowError):
      # Unknown entity name or out-of-range code point: keep the text.
      return mo.group(0)

  return _html_entity_re.sub(_replace, s)

def stripTags(s):
  """
  Decode HTML entities in s, pass the result through
  djangohtml.strip_tags and return it with surrounding whitespace removed.

  Returns u'' when s is empty or None.
  """
  if not s:
    return u''
  # NOTE(review): entities are decoded before tags are stripped, so escaped
  # markup such as '&lt;b&gt;' is handed to strip_tags as '<b>' - confirm
  # this order is intended.
  return djangohtml.strip_tags(htmldecode(s)).strip()

strip_tags = stripTags

# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|[xX][\da-fA-F]+)|[\w.:-]+);?')

def htmldecode(text):
  """
  Decode HTML entities in the given text and return unicode.

  Byte strings are decoded as UTF-8, falling back to latin-1 when they are
  not valid UTF-8; any other non-unicode value is converted with unicode().
  References with an unknown name or a code point unichr cannot represent
  are left as they are.
  """
  if isinstance(text, str):
    try:
      text = text.decode('utf-8')
    except UnicodeDecodeError:
      text = text.decode('latin-1')
  elif not isinstance(text, unicode):
    text = unicode(text)

  def entitydecode(match):
    entity = match.group(1)
    try:
      if entity[:2] in ('#x', '#X'):
        return unichr(int(entity[2:], 16))
      if entity.startswith('#'):
        return unichr(int(entity[1:]))
      if entity in name2codepoint:
        return unichr(name2codepoint[entity])
    except (ValueError, OverflowError):
      # Code point outside what unichr supports: keep the reference text.
      return match.group(0)
    return match.group(0)

  return charrefpat.sub(entitydecode, text)