python-ox/oxlib/net.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import os
import gzip
import StringIO
import urllib
import urllib2

from chardet.universaldetector import UniversalDetector


# Default headers for HTTP requests; used as the default ``headers``
# argument of status(), exists(), getHeaders(), openUrl() and readUrl().
DEFAULT_HEADERS = {
    # Identify as Firefox 3.5 on Linux.
    'User-Agent': 'Mozilla/5.0 (X11; U; Linux i386; en-US; rv:1.9.1.1) Gecko/20090716 Firefox/3.5',
    # Ask for gzip-compressed responses. readUrl() decompresses them;
    # openUrl() hands back the response without decoding the body.
    'Accept-Encoding': 'gzip'
}

def status(url, data=None, headers=DEFAULT_HEADERS):
    """Return the HTTP status code of a request to ``url``.

    url     -- address to request (passed to openUrl)
    data    -- optional request body, passed through to openUrl
    headers -- request headers, defaults to DEFAULT_HEADERS

    When the request raises ``urllib2.HTTPError`` the code carried by the
    exception is returned instead. Any other exception raised by openUrl
    propagates to the caller.
    """
    try:
        response = openUrl(url, data, headers)
    except urllib2.HTTPError as e:
        return e.code
    try:
        return response.code
    finally:
        # Only the status code is needed; release the connection right away
        # instead of leaving the response open.
        response.close()

def exists(url, data=None, headers=DEFAULT_HEADERS):
    """Return True if status() reports a code in the range 200-399.

    All arguments are handed to status() unchanged.
    """
    code = status(url, data, headers)
    return 200 <= code < 400

def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
    """Return the response headers of a request to ``url`` as a dict.

    url     -- address to request (passed to openUrl)
    data    -- optional request body, passed through to openUrl
    headers -- request headers, defaults to DEFAULT_HEADERS

    The returned dict has an extra 'Status' entry holding the status code
    as a string. When the request raises ``urllib2.HTTPError`` the headers
    and code carried by the exception are used instead. Any other exception
    raised by openUrl propagates to the caller.
    """
    try:
        response = openUrl(url, data, headers)
    except urllib2.HTTPError as e:
        e.headers['Status'] = "%s" % e.code
        return dict(e.headers)
    try:
        response.headers['Status'] = "%s" % response.code
        return dict(response.headers)
    finally:
        # Close the response even if building the result fails.
        response.close()

def openUrl(url, data=None, headers=DEFAULT_HEADERS):
    """Open ``url`` with urllib2 and return the response object.

    Spaces in the url are replaced by '%20' before the request is built;
    ``data`` and ``headers`` are handed to urllib2.Request as given.
    """
    request = urllib2.Request(url.replace(' ', '%20'), data, headers)
    return urllib2.urlopen(request)

def readUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
    """Fetch ``url`` and return the response body.

    url           -- address to request (passed to openUrl)
    data          -- optional request body, passed through to openUrl
    headers       -- request headers, defaults to DEFAULT_HEADERS
    returnHeaders -- when true, return ``(headers_dict, body)`` instead of
                     just the body; the dict gets an extra 'Status' entry
                     holding the status code as a string

    A body sent with a gzip Content-Encoding is decompressed before it is
    returned. Exceptions raised by openUrl or while reading propagate.
    """
    response = openUrl(url, data, headers)
    try:
        body = response.read()
    finally:
        # Close the response even when the read fails part-way through.
        response.close()
    # Content-coding names are case-insensitive in HTTP, so normalise
    # before comparing; a missing header yields None.
    content_encoding = response.headers.get('content-encoding', None) or ''
    if content_encoding.strip().lower() == 'gzip':
        body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
    if returnHeaders:
        response.headers['Status'] = "%s" % response.code
        return dict(response.headers), body
    return body

def readUrlUnicode(url):
    """Fetch ``url`` with readUrl and return the body as a unicode string.

    The encoding comes from getEncoding(); when that yields nothing,
    'latin-1' is used.
    """
    raw = readUrl(url)
    charset = getEncoding(raw) or 'latin-1'
    return unicode(raw, charset)

def getEncoding(data):
    """Return the name of the character encoding of ``data``.

    Two literal HTML meta ``content`` attributes are checked first (utf-8,
    then iso-8859-1). If neither occurs in ``data``, the text is fed line by
    line to chardet's UniversalDetector until it reports it is done, and the
    detector's 'encoding' result is returned.
    """
    declared = (
        ('content="text/html; charset=utf-8"', 'utf-8'),
        ('content="text/html; charset=iso-8859-1"', 'iso-8859-1'),
    )
    for marker, charset in declared:
        if marker in data:
            return charset

    sniffer = UniversalDetector()
    for chunk in data.split('\n'):
        sniffer.feed(chunk)
        # Stop feeding as soon as the detector has made up its mind.
        if sniffer.done:
            break
    sniffer.close()
    return sniffer.result['encoding']

def saveUrl(url, filename, overwrite=False):
    """Download ``url`` with readUrl and write the body to ``filename``.

    url       -- address to fetch
    filename  -- destination path; missing parent directories are created
    overwrite -- when false (the default) an existing file is left
                 untouched and nothing is downloaded

    Exceptions from readUrl or from the filesystem propagate to the caller.
    """
    if os.path.exists(filename) and not overwrite:
        return
    dirname = os.path.dirname(filename)
    # dirname is '' for a bare filename; os.makedirs('') would raise, and
    # the current directory needs no creating.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    data = readUrl(url)
    # readUrl returns the raw response body, so write it in binary mode to
    # avoid newline translation on platforms that distinguish text files.
    with open(filename, 'wb') as f:
        f.write(data)
add some functions 2008-04-27 16:54:37 +00:00			`# -- coding: utf-8 --`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`# vi:si:et:sw=4:sts=4:ts=4`
move and rename some 2008-07-06 13:00:06 +00:00			`# GPL 2008`
fix saveUrl 2009-07-13 10:30:37 +00:00			`import os`
adding gzip support to getUrl 2008-04-28 09:55:30 +00:00			`import gzip`
			`import StringIO`
add some functions 2008-04-27 16:54:37 +00:00			`import urllib`
			`import urllib2`

faster way to detect encoding, speeds up getUrlUnicode on large pages 2008-06-17 10:53:29 +00:00			`from chardet.universaldetector import UniversalDetector`
add some functions 2008-04-27 16:54:37 +00:00

			`# Default headers for HTTP requests.`
adding encoding to default headers (itunes.py may supply others) 2008-04-29 08:45:14 +00:00			`DEFAULT_HEADERS = {`
update user agent string to firefox 3.5.1 2009-07-18 08:50:51 +00:00			`'User-Agent': 'Mozilla/5.0 (X11; U; Linux i386; en-US; rv:1.9.1.1) Gecko/20090716 Firefox/3.5',`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`'Accept-Encoding': 'gzip'`
adding encoding to default headers (itunes.py may supply others) 2008-04-29 08:45:14 +00:00			`}`
add some functions 2008-04-27 16:54:37 +00:00
add status, exists and getHeaders to net and cache, fix timeout bug in cache, now timeouts other than default actually work 2008-04-30 12:43:14 +00:00			`def status(url, data=None, headers=DEFAULT_HEADERS):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`try:`
			`f = openUrl(url, data, headers)`
			`s = f.code`
			`except urllib2.HTTPError, e:`
			`s = e.code`
			`return s`
add status, exists and getHeaders to net and cache, fix timeout bug in cache, now timeouts other than default actually work 2008-04-30 12:43:14 +00:00
			`def exists(url, data=None, headers=DEFAULT_HEADERS):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`s = status(url, data, headers)`
			`if s >= 200 and s < 400:`
			`return True`
			`return False`
add status, exists and getHeaders to net and cache, fix timeout bug in cache, now timeouts other than default actually work 2008-04-30 12:43:14 +00:00
			`def getHeaders(url, data=None, headers=DEFAULT_HEADERS):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`try:`
			`f = openUrl(url, data, headers)`
			`f.headers['Status'] = "%s" % f.code`
			`headers = f.headers`
			`f.close()`
			`except urllib2.HTTPError, e:`
			`e.headers['Status'] = "%s" % e.code`
			`headers = e.headers`
			`return dict(headers)`
add status, exists and getHeaders to net and cache, fix timeout bug in cache, now timeouts other than default actually work 2008-04-30 12:43:14 +00:00
add some functions 2008-04-27 16:54:37 +00:00			`def openUrl(url, data=None, headers=DEFAULT_HEADERS):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`url = url.replace(' ', '%20')`
			`req = urllib2.Request(url, data, headers)`
			`return urllib2.urlopen(req)`
add some functions 2008-04-27 16:54:37 +00:00
change api, getUrl -> readUrl 2009-10-11 13:03:00 +00:00			`def readUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`f = openUrl(url, data, headers)`
			`data = f.read()`
			`f.close()`
			`if f.headers.get('content-encoding', None) == 'gzip':`
			`data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()`
			`if returnHeaders:`
			`f.headers['Status'] = "%s" % f.code`
			`return dict(f.headers), data`
			`return data`
add some functions 2008-04-27 16:54:37 +00:00
change api, getUrl -> readUrl 2009-10-11 13:03:00 +00:00			`def readUrlUnicode(url):`
			`data = readUrl(url)`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`encoding = getEncoding(data)`
			`if not encoding:`
			`encoding = 'latin-1'`
			`return unicode(data, encoding)`
add some functions 2008-04-27 16:54:37 +00:00
faster way to detect encoding, speeds up getUrlUnicode on large pages 2008-06-17 10:53:29 +00:00			`def getEncoding(data):`
get basic utf-8 case right 2009-03-14 21:02:20 +00:00			`if 'content="text/html; charset=utf-8"' in data:`
			`return 'utf-8'`
detect iso-8859-1 in html header 2009-07-15 13:53:40 +00:00			`elif 'content="text/html; charset=iso-8859-1"' in data:`
			`return 'iso-8859-1'`
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:21:21 +00:00			`detector = UniversalDetector()`
			`for line in data.split('\n'):`
			`detector.feed(line)`
			`if detector.done:`
			`break`
			`detector.close()`
			`return detector.result['encoding']`
faster way to detect encoding, speeds up getUrlUnicode on large pages 2008-06-17 10:53:29 +00:00
detect iso-8859-1 in html header 2009-07-15 13:53:40 +00:00			`def saveUrl(url, filename, overwrite=False):`
			`if not os.path.exists(filename) or overwrite:`
			`dirname = os.path.dirname(filename)`
			`if not os.path.exists(dirname):`
			`os.makedirs(dirname)`
found another getUrl 2009-10-15 12:57:07 +00:00			`data = readUrl(url)`
detect iso-8859-1 in html header 2009-07-15 13:53:40 +00:00			`f = open(filename, 'w')`
			`f.write(data)`
			`f.close()`