python-oxlib/oxlib/net.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import gzip
import StringIO
import urllib
import urllib2

from chardet.universaldetector import UniversalDetector


# Default headers for HTTP requests.
# Used as the default `headers` argument of every request helper in this
# module. The same dict object is shared by all of those defaults, so
# mutating it in place changes the headers of every later request.
# 'Accept-Encoding: gzip' asks for compressed responses; getUrl() inflates
# bodies that come back with a gzip Content-Encoding.
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9) Gecko/2008061015 Firefox/3.0',
    'Accept-Encoding': 'gzip'
}

def status(url, data=None, headers=DEFAULT_HEADERS):
    """Return the HTTP status code obtained by requesting `url`.

    `data` and `headers` are passed through to openUrl() unchanged.

    A urllib2.HTTPError raised while opening the URL is not propagated;
    its `code` is returned instead. Any other exception raised by
    openUrl() propagates to the caller.
    """
    try:
        f = openUrl(url, data, headers)
    except urllib2.HTTPError as e:
        return e.code
    try:
        return f.code
    finally:
        # Only the status is needed; release the connection right away
        # instead of leaving the response open.
        f.close()

def exists(url, data=None, headers=DEFAULT_HEADERS):
    """Return True if status() reports a code in the range 200-399.

    All arguments are forwarded to status(); any other code yields False.
    """
    code = status(url, data, headers)
    return 200 <= code < 400

def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
    """Return the response headers for `url` as a plain dict.

    `data` and `headers` are passed through to openUrl() unchanged.
    A 'Status' entry holding the status code as a string is added to the
    response's header object before it is converted to a dict.

    A urllib2.HTTPError raised while opening the URL is not propagated;
    the headers and code carried by the error are returned instead.
    """
    try:
        f = openUrl(url, data, headers)
    except urllib2.HTTPError as e:
        # Error responses still carry headers; report them with the code.
        e.headers['Status'] = "%s" % e.code
        return dict(e.headers)
    try:
        f.headers['Status'] = "%s" % f.code
        return dict(f.headers)
    finally:
        # Close the response even if building the result fails.
        f.close()

def openUrl(url, data=None, headers=DEFAULT_HEADERS):
    """Open `url` with urllib2 and return the resulting response object.

    Literal spaces in `url` are replaced with '%20' before the request is
    built. `data` and `headers` are handed to urllib2.Request as given.
    The caller is responsible for closing the returned object.
    """
    escaped = url.replace(' ', '%20')
    return urllib2.urlopen(urllib2.Request(escaped, data, headers))

def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
    """Fetch `url` and return the response body.

    `data` and `headers` are passed through to openUrl() unchanged.
    If the response declares a gzip Content-Encoding the body is
    decompressed before it is returned.

    When `returnHeaders` is true, a `(headers, body)` tuple is returned
    instead, where `headers` is a dict of the response headers plus a
    'Status' entry holding the status code as a string.

    Exceptions raised by openUrl() (including urllib2.HTTPError) are not
    caught here and propagate to the caller.
    """
    f = openUrl(url, data, headers)
    try:
        body = f.read()
    finally:
        # Release the connection even when reading fails part-way.
        f.close()
    # Content-coding names are case-insensitive, and 'x-gzip' is the legacy
    # alias for 'gzip', so accept both rather than an exact 'gzip' match.
    encoding = f.headers.get('content-encoding', None)
    if encoding and encoding.strip().lower() in ('gzip', 'x-gzip'):
        body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
    if returnHeaders:
        f.headers['Status'] = "%s" % f.code
        return dict(f.headers), body
    return body

def getUrlUnicode(url):
    """Fetch `url` with getUrl() and return its body as a unicode object.

    The charset is whatever getEncoding() reports for the body; when that
    is empty or None, 'latin-1' is used instead.
    """
    raw = getUrl(url)
    charset = getEncoding(raw) or 'latin-1'
    return unicode(raw, charset)

def getEncoding(data):
    """Return the encoding chardet's UniversalDetector reports for `data`.

    `data` is split on newlines and fed to the detector one piece at a
    time, stopping as soon as the detector flags itself as done so the
    whole input does not have to be scanned. The return value is the
    'encoding' entry of the detector result; callers such as
    getUrlUnicode() treat a falsy value as "not detected".
    """
    sniffer = UniversalDetector()
    for piece in data.split('\n'):
        sniffer.feed(piece)
        if sniffer.done:
            # Detector is confident enough; skip the rest of the input.
            break
    sniffer.close()
    return sniffer.result['encoding']
add some functions 2008-04-27 16:54:37 +00:00			`# -- coding: utf-8 --`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`# vi:si:et:sw=4:sts=4:ts=4`
move and rename some 2008-07-06 13:00:06 +00:00			`# GPL 2008`
adding gzip support to getUrl 2008-04-28 09:55:30 +00:00			`import gzip`
			`import StringIO`
add some functions 2008-04-27 16:54:37 +00:00			`import urllib`
			`import urllib2`

faster way to detect encoding, speeds up getUrlUnicode on large pages 2008-06-17 10:53:29 +00:00			`from chardet.universaldetector import UniversalDetector`
add some functions 2008-04-27 16:54:37 +00:00

			`# Default headers for HTTP requests.`
adding encoding to default headers (itunes.py may supply others) 2008-04-29 08:45:14 +00:00			`DEFAULT_HEADERS = {`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9) Gecko/2008061015 Firefox/3.0',`
			`'Accept-Encoding': 'gzip'`
adding encoding to default headers (itunes.py may supply others) 2008-04-29 08:45:14 +00:00			`}`
add some functions 2008-04-27 16:54:37 +00:00
add status, exists and getHeaders to net and cache, fix timeout bug in cache, now timeouts other then default actually work 2008-04-30 12:43:14 +00:00			`def status(url, data=None, headers=DEFAULT_HEADERS):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`try:`
			`f = openUrl(url, data, headers)`
			`s = f.code`
			`except urllib2.HTTPError, e:`
			`s = e.code`
			`return s`
add status, exists and getHeaders to net and cache, fix timeout bug in cache, now timeouts other then default actually work 2008-04-30 12:43:14 +00:00
			`def exists(url, data=None, headers=DEFAULT_HEADERS):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`s = status(url, data, headers)`
			`if s >= 200 and s < 400:`
			`return True`
			`return False`
add status, exists and getHeaders to net and cache, fix timeout bug in cache, now timeouts other then default actually work 2008-04-30 12:43:14 +00:00
			`def getHeaders(url, data=None, headers=DEFAULT_HEADERS):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`try:`
			`f = openUrl(url, data, headers)`
			`f.headers['Status'] = "%s" % f.code`
			`headers = f.headers`
			`f.close()`
			`except urllib2.HTTPError, e:`
			`e.headers['Status'] = "%s" % e.code`
			`headers = e.headers`
			`return dict(headers)`
add status, exists and getHeaders to net and cache, fix timeout bug in cache, now timeouts other then default actually work 2008-04-30 12:43:14 +00:00
add some functions 2008-04-27 16:54:37 +00:00			`def openUrl(url, data=None, headers=DEFAULT_HEADERS):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`url = url.replace(' ', '%20')`
			`req = urllib2.Request(url, data, headers)`
			`return urllib2.urlopen(req)`
add some functions 2008-04-27 16:54:37 +00:00
add status, exists and getHeaders to net and cache, fix timeout bug in cache, now timeouts other then default actually work 2008-04-30 12:43:14 +00:00			`def getUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`f = openUrl(url, data, headers)`
			`data = f.read()`
			`f.close()`
			`if f.headers.get('content-encoding', None) == 'gzip':`
			`data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()`
			`if returnHeaders:`
			`f.headers['Status'] = "%s" % f.code`
			`return dict(f.headers), data`
			`return data`
add some functions 2008-04-27 16:54:37 +00:00
			`def getUrlUnicode(url):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`data = getUrl(url)`
			`encoding = getEncoding(data)`
			`if not encoding:`
			`encoding = 'latin-1'`
			`return unicode(data, encoding)`
add some functions 2008-04-27 16:54:37 +00:00
faster way to detect encoding, speeds up getUrlUnicode on large pages 2008-06-17 10:53:29 +00:00			`def getEncoding(data):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`detector = UniversalDetector()`
			`for line in data.split('\n'):`
			`detector.feed(line)`
			`if detector.done:`
			`break`
			`detector.close()`
			`return detector.result['encoding']`
faster way to detect encoding, speeds up getUrlUnicode on large pages 2008-06-17 10:53:29 +00:00