net/cache readUrl->read_url / Unicode -> unicode=True

format replace all CamelCase with under_score
j 2012-08-14 15:58:05 +02:00
parent c1d0fc6242
commit 2de989e188
33 changed files with 243 additions and 254 deletions
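
A minimal before/after sketch of the renamed API (the example URL is illustrative; the import path matches the ox.cache hunks below):

    # before this commit (CamelCase helpers, separate unicode variant):
    #     from ox.cache import readUrl, readUrlUnicode
    #     data = readUrl(url)            # raw bytes
    #     html = readUrlUnicode(url)     # decoded text
    # after this commit (under_score name, decoding via a keyword):
    from ox.cache import read_url

    url = 'http://example.com/'
    data = read_url(url)                 # still returns raw bytes by default
    html = read_url(url, unicode=True)   # decoded via detect_encoding(), 'latin-1' fallback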


@ -18,7 +18,7 @@ from utils import json
from .file import makedirs from .file import makedirs
import net import net
from net import DEFAULT_HEADERS, getEncoding from net import DEFAULT_HEADERS, detect_encoding
cache_timeout = 30*24*60*60 # default is 30 days cache_timeout = 30*24*60*60 # default is 30 days
@ -40,7 +40,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
>>> status('http://google.com/mysearch') >>> status('http://google.com/mysearch')
404 404
''' '''
headers = getHeaders(url, data, headers) headers = get_headers(url, data, headers)
return int(headers['status']) return int(headers['status'])
def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout): def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
@ -55,10 +55,10 @@ def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
return True return True
return False return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout): def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
url_headers = store.get(url, data, headers, timeout, "headers") url_headers = store.get(url, data, headers, timeout, "headers")
if not url_headers: if not url_headers:
url_headers = net.getHeaders(url, data, headers) url_headers = net.get_headers(url, data, headers)
store.set(url, data, -1, url_headers) store.set(url, data, -1, url_headers)
return url_headers return url_headers
@ -68,7 +68,7 @@ class InvalidResult(Exception):
self.result = result self.result = result
self.headers = headers self.headers = headers
def readUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None): def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
''' '''
url - url to load url - url to load
data - possible post data data - possible post data
@ -80,31 +80,29 @@ def readUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, vali
#FIXME: send last-modified / etag from cache and only update if needed #FIXME: send last-modified / etag from cache and only update if needed
if isinstance(url, unicode): if isinstance(url, unicode):
url = url.encode('utf-8') url = url.encode('utf-8')
result = store.get(url, data, headers, timeout) data = store.get(url, data, headers, timeout)
if not result: if not data:
#print "get data", url #print "get data", url
try: try:
url_headers, result = net.readUrl(url, data, headers, returnHeaders=True) url_headers, data = net.read_url(url, data, headers, return_headers=True)
except urllib2.HTTPError, e: except urllib2.HTTPError, e:
e.headers['Status'] = "%s" % e.code e.headers['Status'] = "%s" % e.code
url_headers = dict(e.headers) url_headers = dict(e.headers)
result = e.read() data = e.read()
if url_headers.get('content-encoding', None) == 'gzip': if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read() data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
if not valid or valid(result, url_headers): if not valid or valid(data, url_headers):
store.set(url, data, result, url_headers) store.set(url, data, data, url_headers)
else: else:
raise InvalidResult(result, url_headers) raise InvalidResult(data, url_headers)
return result if unicode:
encoding = detect_encoding(data)
if not encoding:
encoding = 'latin-1'
data = data.decode(encoding)
return data
def readUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _readUrl=readUrl, valid=None): def save_url(url, filename, overwrite=False):
data = _readUrl(url, data, headers, timeout, valid)
encoding = getEncoding(data)
if not encoding:
encoding = 'latin-1'
return unicode(data, encoding)
def saveUrl(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite: if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename) dirname = os.path.dirname(filename)
if not os.path.exists(dirname): if not os.path.exists(dirname):
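
A hedged usage sketch for the cache-level read_url above; the valid callback signature and the InvalidResult exception come from the hunk, while the URL and the check itself are made up:

    from ox import cache

    def valid(data, headers):
        # illustrative check: reject responses that do not look like an HTML page
        return '<html' in data.lower()

    try:
        html = cache.read_url('http://example.com/', timeout=60*60, valid=valid, unicode=True)
    except cache.InvalidResult, e:
        html = None   # e.result / e.headers carry the rejected response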


@ -217,15 +217,15 @@ def to36(q):
def from36(q): def from36(q):
return int(q, 36) return int(q, 36)
def intValue(strValue, default=u''): def int_value(strValue, default=u''):
""" """
>>> intValue('abc23') >>> int_value('abc23')
u'23' u'23'
>>> intValue(' abc23') >>> int_value(' abc23')
u'23' u'23'
>>> intValue('ab') >>> int_value('ab')
u'' u''
""" """
try: try:
@ -234,15 +234,15 @@ def intValue(strValue, default=u''):
val = default val = default
return val return val
def floatValue(strValue, default=u''): def float_value(strValue, default=u''):
""" """
>>> floatValue('abc23.4') >>> float_value('abc23.4')
u'23.4' u'23.4'
>>> floatValue(' abc23.4') >>> float_value(' abc23.4')
u'23.4' u'23.4'
>>> floatValue('ab') >>> float_value('ab')
u'' u''
""" """
try: try:
@ -251,46 +251,46 @@ def floatValue(strValue, default=u''):
val = default val = default
return val return val
def formatNumber(number, longName, shortName): def format_number(number, longName, shortName):
""" """
Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB) Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)
>>> formatNumber(123, 'Byte', 'B') >>> format_number(123, 'Byte', 'B')
'123 Bytes' '123 Bytes'
>>> formatNumber(1234, 'Byte', 'B') >>> format_number(1234, 'Byte', 'B')
'1 KB' '1 KB'
>>> formatNumber(1234567, 'Byte', 'B') >>> format_number(1234567, 'Byte', 'B')
'1.2 MB' '1.2 MB'
>>> formatNumber(1234567890, 'Byte', 'B') >>> format_number(1234567890, 'Byte', 'B')
'1.15 GB' '1.15 GB'
>>> formatNumber(1234567890123456789, 'Byte', 'B') >>> format_number(1234567890123456789, 'Byte', 'B')
'1,096.5166 PB' '1,096.5166 PB'
>>> formatNumber(-1234567890123456789, 'Byte', 'B') >>> format_number(-1234567890123456789, 'Byte', 'B')
'-1,096.5166 PB' '-1,096.5166 PB'
""" """
if abs(number) < 1024: if abs(number) < 1024:
return '%s %s%s' % (formatThousands(number), longName, number != 1 and 's' or '') return '%s %s%s' % (format_thousands(number), longName, number != 1 and 's' or '')
prefix = ['K', 'M', 'G', 'T', 'P'] prefix = ['K', 'M', 'G', 'T', 'P']
for i in range(5): for i in range(5):
if abs(number) < math.pow(1024, i + 2) or i == 4: if abs(number) < math.pow(1024, i + 2) or i == 4:
n = number / math.pow(1024, i + 1) n = number / math.pow(1024, i + 1)
return '%s %s%s' % (formatThousands('%.*f' % (i, n)), prefix[i], shortName) return '%s %s%s' % (format_thousands('%.*f' % (i, n)), prefix[i], shortName)
def formatThousands(number, separator = ','): def format_thousands(number, separator = ','):
""" """
Return the number with separators (1,000,000) Return the number with separators (1,000,000)
>>> formatThousands(1) >>> format_thousands(1)
'1' '1'
>>> formatThousands(1000) >>> format_thousands(1000)
'1,000' '1,000'
>>> formatThousands(1000000) >>> format_thousands(1000000)
'1,000,000' '1,000,000'
""" """
string = str(number).split('.') string = str(number).split('.')
@ -302,16 +302,16 @@ def formatThousands(number, separator = ','):
string[0] = ''.join(l) string[0] = ''.join(l)
return '.'.join(string) return '.'.join(string)
def formatBits(number): def format_bits(number):
return formatNumber(number, 'bit', 'b') return format_number(number, 'bit', 'b')
def formatBytes(number): def format_bytes(number):
return formatNumber(number, 'byte', 'B') return format_number(number, 'byte', 'B')
def formatPixels(number): def format_pixels(number):
return formatNumber(number, 'pixel', 'px') return format_number(number, 'pixel', 'px')
def formatCurrency(amount, currency="$"): def format_currency(amount, currency="$"):
if amount: if amount:
temp = "%.2f" % amount temp = "%.2f" % amount
profile=re.compile(r"(\d)(\d\d\d[.,])") profile=re.compile(r"(\d)(\d\d\d[.,])")
@ -336,9 +336,9 @@ def plural(amount, unit, plural='s'):
if plural == 's': if plural == 's':
unit = unit + plural unit = unit + plural
else: unit = plural else: unit = plural
return "%s %s" % (formatThousands(amount), unit) return "%s %s" % (format_thousands(amount), unit)
def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True): def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
''' '''
verbosity verbosity
0: D:HH:MM:SS 0: D:HH:MM:SS
@ -353,13 +353,13 @@ def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
milliseconds milliseconds
True: always display milliseconds True: always display milliseconds
False: never display milliseconds False: never display milliseconds
>>> formatDuration(1000 * 60 * 60 * 24 * 366) >>> format_duration(1000 * 60 * 60 * 24 * 366)
'1:001:00:00:00.000' '1:001:00:00:00.000'
>>> formatDuration(1000 * 60 * 60 * 24 * 366, years=False) >>> format_duration(1000 * 60 * 60 * 24 * 366, years=False)
'366:00:00:00.000' '366:00:00:00.000'
>>> formatDuration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2) >>> format_duration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
'1 year 2 seconds 3 milliseconds' '1 year 2 seconds 3 milliseconds'
>>> formatDuration(1000 * 30, hours=False, milliseconds=False) >>> format_duration(1000 * 30, hours=False, milliseconds=False)
'00:30' '00:30'
''' '''
if not ms and ms != 0: if not ms and ms != 0:
@ -403,7 +403,7 @@ def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
return duration return duration
def ms2runtime(ms, shortenLong=False): def ms2runtime(ms, shortenLong=False):
# deprecated - use formatDuration # deprecated - use format_duration
''' '''
>>> ms2runtime(5000) >>> ms2runtime(5000)
'5 seconds' '5 seconds'
@ -415,11 +415,11 @@ def ms2runtime(ms, shortenLong=False):
'13 hours 53 minutes' '13 hours 53 minutes'
''' '''
if shortenLong and ms > 1000 * 60 * 60 * 24 * 464: if shortenLong and ms > 1000 * 60 * 60 * 24 * 464:
return formatDuration(ms, verbosity=1, milliseconds=False) return format_duration(ms, verbosity=1, milliseconds=False)
return formatDuration(ms, verbosity=2, milliseconds=False) return format_duration(ms, verbosity=2, milliseconds=False)
def ms2playtime(ms, hours=False): def ms2playtime(ms, hours=False):
# deprecated - use formatDuration # deprecated - use format_duration
''' '''
>>> ms2playtime(5000) >>> ms2playtime(5000)
'00:05' '00:05'
@ -428,15 +428,15 @@ def ms2playtime(ms, hours=False):
>>> ms2playtime(50000000) >>> ms2playtime(50000000)
'13:53:20' '13:53:20'
''' '''
return formatDuration(ms, hours=False, years=False, milliseconds=False) return format_duration(ms, hours=False, years=False, milliseconds=False)
def ms2time(ms): def ms2time(ms):
# deprecated - use formatDuration # deprecated - use format_duration
''' '''
>>> ms2time(44592123) >>> ms2time(44592123)
'12:23:12.123' '12:23:12.123'
''' '''
return formatDuration(ms, years=False) return format_duration(ms, years=False)
def time2ms(timeString): def time2ms(timeString):
''' '''
@ -451,7 +451,7 @@ def time2ms(timeString):
ms = ms * 60 + float(_p) ms = ms * 60 + float(_p)
return int(ms * 1000) return int(ms * 1000)
def shiftTime(offset, timeString): def shift_time(offset, timeString):
newTime = time2ms(timeString) + offset newTime = time2ms(timeString) + offset
return ms2time(newTime) return ms2time(newTime)
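
The renamed formatting helpers keep their doctest behaviour; a quick sketch using expected values taken from the doctests above (format_duration is used as ox.format_duration elsewhere in this commit, so package-level access to the other helpers is assumed here):

    import ox

    print ox.format_bytes(1234567890)     # '1.15 GB'
    print ox.format_thousands(1000000)    # '1,000,000'
    print ox.format_duration(1000 * 30, hours=False, milliseconds=False)   # '00:30'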


@ -22,7 +22,7 @@ DEFAULT_HEADERS = {
def status(url, data=None, headers=DEFAULT_HEADERS): def status(url, data=None, headers=DEFAULT_HEADERS):
try: try:
f = openUrl(url, data, headers) f = open_url(url, data, headers)
s = f.code s = f.code
except urllib2.HTTPError, e: except urllib2.HTTPError, e:
s = e.code s = e.code
@ -34,9 +34,9 @@ def exists(url, data=None, headers=DEFAULT_HEADERS):
return True return True
return False return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS): def headers(url, data=None, headers=DEFAULT_HEADERS):
try: try:
f = openUrl(url, data, headers) f = open_url(url, data, headers)
f.headers['Status'] = "%s" % f.code f.headers['Status'] = "%s" % f.code
headers = f.headers headers = f.headers
f.close() f.close()
@ -45,30 +45,28 @@ def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
headers = e.headers headers = e.headers
return dict(headers) return dict(headers)
def openUrl(url, data=None, headers=DEFAULT_HEADERS): def open_url(url, data=None, headers=DEFAULT_HEADERS):
url = url.replace(' ', '%20') url = url.replace(' ', '%20')
req = urllib2.Request(url, data, headers) req = urllib2.Request(url, data, headers)
return urllib2.urlopen(req) return urllib2.urlopen(req)
def readUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False): def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
f = openUrl(url, data, headers) f = open_url(url, data, headers)
data = f.read() data = f.read()
f.close() f.close()
if f.headers.get('content-encoding', None) == 'gzip': if f.headers.get('content-encoding', None) == 'gzip':
data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read() data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
if returnHeaders: if unicode:
encoding = detect_encoding(data)
if not encoding:
encoding = 'latin-1'
data = data.decode(encoding)
if return_headers:
f.headers['Status'] = "%s" % f.code f.headers['Status'] = "%s" % f.code
return dict(f.headers), data return dict(f.headers), data
return data return data
def readUrlUnicode(url, data=None, headers=DEFAULT_HEADERS): def detect_encoding(data):
data = readUrl(url, data, headers)
encoding = getEncoding(data)
if not encoding:
encoding = 'latin-1'
return unicode(data, encoding)
def getEncoding(data):
if 'content="text/html; charset=utf-8"' in data: if 'content="text/html; charset=utf-8"' in data:
return 'utf-8' return 'utf-8'
elif 'content="text/html; charset=iso-8859-1"' in data: elif 'content="text/html; charset=iso-8859-1"' in data:
@ -81,7 +79,7 @@ def getEncoding(data):
detector.close() detector.close()
return detector.result['encoding'] return detector.result['encoding']
def saveUrl(url, filename, overwrite=False): def save_url(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite: if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename) dirname = os.path.dirname(filename)
if not os.path.exists(dirname): if not os.path.exists(dirname):
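
The corresponding low-level helpers in ox.net after the rename, sketched from the signatures in the hunk above; return_headers=True returns the headers dict with the injected status code, unicode=True decodes via detect_encoding() with a 'latin-1' fallback:

    import ox.net

    url = 'http://example.com/'
    headers, body = ox.net.read_url(url, return_headers=True)
    print headers.get('status')                 # HTTP status injected by read_url, as used by the cache module
    text = ox.net.read_url(url, unicode=True)   # decoded text instead of raw bytes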


@ -97,8 +97,8 @@ def encode(data):
for s in data: for s in data:
srt += '%d\r\n%s --> %s\r\n%s\r\n\r\n' % ( srt += '%d\r\n%s --> %s\r\n%s\r\n\r\n' % (
i, i,
ox.formatDuration(s['in']*1000, years=False).replace('.', ','), ox.format_duration(s['in']*1000, years=False).replace('.', ','),
ox.formatDuration(s['out']*1000, years=False).replace('.', ','), ox.format_duration(s['out']*1000, years=False).replace('.', ','),
s['value'].replace('\n', '\r\n').strip() s['value'].replace('\n', '\r\n').strip()
) )
i += 1 i += 1


@ -3,8 +3,8 @@
import re import re
import time import time
from ox import stripTags, findRe from ox import strip_tags, findRe
from ox.cache import readUrlUnicode from ox.cache import read_url
def getId(url): def getId(url):
@ -26,7 +26,7 @@ def getData(id):
data = { data = {
"url": getUrl(id) "url": getUrl(id)
} }
html = readUrlUnicode(data["url"]) html = read_url(data["url"], unicode=True)
data['aka'] = parseList(html, 'AKA') data['aka'] = parseList(html, 'AKA')
data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>') data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
data['countries'] = parseList(html, 'countries') data['countries'] = parseList(html, 'countries')
@ -40,18 +40,18 @@ def getData(id):
data['releasedate'] = parseList(html, 'release date') data['releasedate'] = parseList(html, 'release date')
data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip() data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
data['set'] = parseEntry(html, 'set in') data['set'] = parseEntry(html, 'set in')
data['synopsis'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip() data['synopsis'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['themes'] = parseList(html, 'themes') data['themes'] = parseList(html, 'themes')
data['types'] = parseList(html, 'types') data['types'] = parseList(html, 'types')
data['year'] = findRe(html, '<span class="year">.*?(\d+)') data['year'] = findRe(html, '<span class="year">.*?(\d+)')
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)] #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html) data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
#html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id) #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
#data['cast'] = parseTable(html) #data['cast'] = parseTable(html)
#html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id) #html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
#data['credits'] = parseTable(html) #data['credits'] = parseTable(html)
html = readUrlUnicode("http://allmovie.com/work/%s/review" % id) html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
data['review'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip() data['review'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
return data return data
def getUrl(id): def getUrl(id):
@ -59,26 +59,26 @@ def getUrl(id):
def parseEntry(html, title): def parseEntry(html, title):
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title) html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
return stripTags(html).strip() return strip_tags(html).strip()
def parseList(html, title): def parseList(html, title):
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower()) html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
r = map(lambda x: stripTags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html)) r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
if not r and html: if not r and html:
r = [stripTags(html)] r = [strip_tags(html)]
return r return r
def parseTable(html): def parseTable(html):
return map( return map(
lambda x: map( lambda x: map(
lambda x: stripTags(x).strip().replace('&nbsp;', ''), lambda x: strip_tags(x).strip().replace('&nbsp;', ''),
x.split('<td width="305">-') x.split('<td width="305">-')
), ),
findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1] findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
) )
def parseText(html, title): def parseText(html, title):
return stripTags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip() return strip_tags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
if __name__ == '__main__': if __name__ == '__main__':
print getData('129689') print getData('129689')


@ -3,14 +3,14 @@
import re import re
from urllib import quote from urllib import quote
from ox import findRe, stripTags, decodeHtml from ox import findRe, strip_tags, decodeHtml
from ox.cache import readUrlUnicode from ox.cache import read_url
def findISBN(title, author): def findISBN(title, author):
q = '%s %s' % (title, author) q = '%s %s' % (title, author)
url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q) url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
data = readUrlUnicode(url) data = read_url(url, unicode=True)
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data) links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/') id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
data = getData(id) data = getData(id)
@ -20,7 +20,7 @@ def findISBN(title, author):
def getData(id): def getData(id):
url = "http://www.amazon.com/title/dp/%s/" % id url = "http://www.amazon.com/title/dp/%s/" % id
data = readUrlUnicode(url) data = read_url(url, unicode=True)
def findData(key): def findData(key):
@ -44,9 +44,9 @@ def getData(id):
if not r['pages']: if not r['pages']:
r['pages'] = findData('Hardcover') r['pages'] = findData('Hardcover')
r['review'] = stripTags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip() r['review'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['description'] = stripTags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip() r['description'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data) r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
if r['cover']: if r['cover']:


@ -1,7 +1,7 @@
import json import json
import re import re
from ox.cache import readUrlUnicode from ox.cache import read_url
HEADERS = { HEADERS = {
'User-Agent': 'iTunes/10.4 (Macintosh; Intel Mac OS X 10.7) AppleWebKit/534.48.3', 'User-Agent': 'iTunes/10.4 (Macintosh; Intel Mac OS X 10.7) AppleWebKit/534.48.3',
@ -26,21 +26,21 @@ def getMovieData(title, director):
url += '&actorNames=&directorProducerName=' + director url += '&actorNames=&directorProducerName=' + director
url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1' url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'
HEADERS['Referer'] = url HEADERS['Referer'] = url
html = readUrlUnicode(url, headers=HEADERS) html = read_url(url, headers=HEADERS, unicode=True)
regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">' regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'
regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>' regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'
results = re.compile(regexp).findall(html) results = re.compile(regexp).findall(html)
if results: if results:
data['link'] = results[0][0] data['link'] = results[0][0]
data['poster'] = results[0][1].replace('140x140', '600x600') data['poster'] = results[0][1].replace('140x140', '600x600')
html = readUrlUnicode(data['link'], headers=HEADERS) html = read_url(data['link'], headers=HEADERS, unicode=True)
results = re.compile('video-preview-url="(.*?)"').findall(html) results = re.compile('video-preview-url="(.*?)"').findall(html)
if results: if results:
data['trailer'] = results[0] data['trailer'] = results[0]
# trailers section (preferred source for poster and trailer) # trailers section (preferred source for poster and trailer)
host = 'http://trailers.apple.com' host = 'http://trailers.apple.com'
url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title
js = json.loads(readUrlUnicode(url)[16:-4]) js = json.loads(read_url(url, unicode=True)[16:-4])
results = js['results'] results = js['results']
if results: if results:
url = host + results[0]['location'] url = host + results[0]['location']
@ -49,11 +49,11 @@ def getMovieData(title, director):
headers = { headers = {
'User-Agent': USER_AGENT 'User-Agent': USER_AGENT
} }
html = readUrlUnicode(url, headers=headers) html = read_url(url, headers=headers, unicode=True)
results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html) results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
if results: if results:
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg') data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
html = readUrlUnicode(url + 'includes/playlists/web.inc', headers=headers) html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
results = re.compile('"(' + host + '\S+\.mov)"').findall(html) results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
if results: if results:
data['trailer'] = results[-1] data['trailer'] = results[-1]


@ -12,7 +12,7 @@ def getUrl(id):
def getData(id): def getData(id):
data = {} data = {}
url = getUrl(id) url = getUrl(id)
details = cache.readUrl('%s?output=json' % url) details = cache.read_url('%s?output=json' % url)
details = json.loads(details) details = json.loads(details)
for key in ('title', 'description', 'runtime'): for key in ('title', 'description', 'runtime'):
data[key] = details['metadata'][key] data[key] = details['metadata'][key]


@ -3,8 +3,8 @@
import re import re
import ox.cache import ox.cache
from ox.cache import readUrlUnicode from ox.cache import read_url
from ox.html import stripTags from ox.html import strip_tags
from ox.text import findRe, removeSpecialCharacters from ox.text import findRe, removeSpecialCharacters
import imdb import imdb
@ -30,19 +30,19 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
"url": getUrl(id) "url": getUrl(id)
} }
try: try:
html = readUrlUnicode(data["url"], timeout=timeout) html = read_url(data["url"], timeout=timeout, unicode=True)
except: except:
html = ox.cache.readUrl(data["url"], timeout=timeout) html = ox.cache.read_url(data["url"], timeout=timeout)
data["number"] = findRe(html, "<li>Spine #(\d+)") data["number"] = findRe(html, "<li>Spine #(\d+)")
data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]") data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
data["title"] = data["title"].split(u' \u2014 The Television Version')[0] data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
data["director"] = stripTags(findRe(html, "<h2 class=\"director\">(.*?)</h2>")) data["director"] = strip_tags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
results = findRe(html, '<div class="left_column">(.*?)</div>') results = findRe(html, '<div class="left_column">(.*?)</div>')
results = re.compile("<li>(.*?)</li>").findall(results) results = re.compile("<li>(.*?)</li>").findall(results)
data["country"] = results[0] data["country"] = results[0]
data["year"] = results[1] data["year"] = results[1]
data["synopsis"] = stripTags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>")) data["synopsis"] = strip_tags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
result = findRe(html, "<div class=\"purchase\">(.*?)</div>") result = findRe(html, "<div class=\"purchase\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result: if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
@ -53,7 +53,7 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
if not "/boxsets/" in result: if not "/boxsets/" in result:
data["posters"] = [result] data["posters"] = [result]
else: else:
html_ = readUrlUnicode(result) html_ = read_url(result, unicode=True)
result = findRe(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id) result = findRe(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = findRe(result, "src=\"(.*?)\"") result = findRe(result, "src=\"(.*?)\"")
if result: if result:
@ -77,7 +77,7 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
def getIds(): def getIds():
ids = [] ids = []
html = readUrlUnicode("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine") html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
results = re.compile("\&amp;p=(\d+)\&").findall(html) results = re.compile("\&amp;p=(\d+)\&").findall(html)
pages = max(map(int, results)) pages = max(map(int, results))
for page in range(1, pages): for page in range(1, pages):
@ -88,13 +88,13 @@ def getIds():
def getIdsByPage(page): def getIdsByPage(page):
ids = [] ids = []
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
html = readUrlUnicode(url) html = read_url(url, unicode=True)
results = re.compile("films/(\d+)").findall(html) results = re.compile("films/(\d+)").findall(html)
for result in results: for result in results:
ids.append(result) ids.append(result)
results = re.compile("boxsets/(.*?)\"").findall(html) results = re.compile("boxsets/(.*?)\"").findall(html)
for result in results: for result in results:
html = readUrlUnicode("http://www.criterion.com/boxsets/" + result) html = read_url("http://www.criterion.com/boxsets/" + result, unicode=True)
results = re.compile("films/(\d+)").findall(html) results = re.compile("films/(\d+)").findall(html)
for result in results: for result in results:
ids.append(result) ids.append(result)


@ -2,7 +2,7 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from urllib import unquote from urllib import unquote
from ox.cache import readUrl from ox.cache import read_url
def getVideoUrl(url): def getVideoUrl(url):
@ -13,7 +13,7 @@ def getVideoUrl(url):
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0] >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv' 'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
''' '''
data = readUrl(url) data = read_url(url)
video = re.compile('''video", "(.*?)"''').findall(data) video = re.compile('''video", "(.*?)"''').findall(data)
for v in video: for v in video:
v = unquote(v).split('@@')[0] v = unquote(v).split('@@')[0]


@ -3,9 +3,9 @@
import re import re
import urllib import urllib
import ox import ox
from ox import stripTags, decodeHtml from ox import strip_tags, decodeHtml
from ox.utils import json from ox.utils import json
from ox.cache import readUrlUnicode from ox.cache import read_url
def find(query, timeout=ox.cache.cache_timeout): def find(query, timeout=ox.cache.cache_timeout):
@ -13,10 +13,10 @@ def find(query, timeout=ox.cache.cache_timeout):
query = query.encode('utf-8') query = query.encode('utf-8')
params = urllib.urlencode({'q': query}) params = urllib.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params url = 'http://duckduckgo.com/html/?' + params
data = readUrlUnicode(url, timeout=timeout) data = read_url(url, timeout=timeout, unicode=True)
results = [] results = []
regex = '<a .*?class="l le" href="(.+?)">(.*?)</a>.*?<div class="cra">(.*?)</div>' regex = '<a .*?class="l le" href="(.+?)">(.*?)</a>.*?<div class="cra">(.*?)</div>'
for r in re.compile(regex, re.DOTALL).findall(data): for r in re.compile(regex, re.DOTALL).findall(data):
results.append((stripTags(decodeHtml(r[1])), r[0], stripTags(decodeHtml(r[2])))) results.append((strip_tags(decodeHtml(r[1])), r[0], strip_tags(decodeHtml(r[2]))))
return results return results


@ -3,8 +3,8 @@
import re import re
import time import time
from ox import stripTags, findRe from ox import strip_tags, findRe
from ox.cache import readUrlUnicode from ox.cache import read_url
import google import google
@ -21,9 +21,9 @@ def getShowUrl(title):
return None return None
def getShowData(url): def getShowData(url):
data = readUrlUnicode(url) data = read_url(url, unicode=True)
r = {} r = {}
r['title'] = stripTags(findRe(data, '<h1>(.*?)</h1>')) r['title'] = strip_tags(findRe(data, '<h1>(.*?)</h1>'))
r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>') r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
r['episodes'] = {} r['episodes'] = {}
#1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear


@ -4,8 +4,8 @@
import re import re
from lxml.html import document_fromstring from lxml.html import document_fromstring
from ox.cache import readUrlUnicode from ox.cache import read_url
from ox import findRe, stripTags from ox import findRe, strip_tags
from ox.web.imdb import ImdbCombined from ox.web.imdb import ImdbCombined
@ -32,7 +32,7 @@ def getData(id, timeout=-1):
data = { data = {
"url": getUrl(id), "url": getUrl(id),
} }
html = readUrlUnicode(data['url'], timeout=timeout) html = read_url(data['url'], timeout=timeout, unicode=True)
doc = document_fromstring(html) doc = document_fromstring(html)
props = { props = {


@ -2,7 +2,7 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import json import json
from ox.cache import readUrlUnicode from ox.cache import read_url
from ox import findRe from ox import findRe
class Imdb(dict): class Imdb(dict):
@ -12,7 +12,7 @@ class Imdb(dict):
"http://graph.freebase.com/imdb.title.tt%s" % id "http://graph.freebase.com/imdb.title.tt%s" % id
might also be of interest at some point, right now not much info might also be of interest at some point, right now not much info
''' '''
data = readUrlUnicode(url) data = read_url(url, unicode=True)
try: try:
data = json.loads(data) data = json.loads(data)
except ValueError: except ValueError:


@ -4,13 +4,13 @@ import re
import urllib import urllib
import ox import ox
from ox import stripTags, decodeHtml from ox import strip_tags, decodeHtml
DEFAULT_MAX_RESULTS = 10 DEFAULT_MAX_RESULTS = 10
DEFAULT_TIMEOUT = 24*60*60 DEFAULT_TIMEOUT = 24*60*60
def readUrlUnicode(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT): def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
return ox.cache.readUrlUnicode(url, data, headers, timeout) return ox.cache.read_url(url, data, headers, timeout, unicode=True)
def quote_plus(s): def quote_plus(s):
if not isinstance(s, str): if not isinstance(s, str):
@ -28,13 +28,13 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
u'http://www.imdb.com/title/tt0133093/' u'http://www.imdb.com/title/tt0133093/'
""" """
url = 'http://google.com/search?q=%s' % quote_plus(query) url = 'http://google.com/search?q=%s' % quote_plus(query)
data = readUrlUnicode(url, timeout=timeout) data = read_url(url, timeout=timeout)
results = [] results = []
data = re.sub('<span class="f">(.*?)</span>', '\\1', data) data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
for a in re.compile( for a in re.compile(
'<a href="(\S+?)" class=l .*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>' '<a href="(\S+?)" class=l .*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>'
).findall(data): ).findall(data):
results.append((stripTags(decodeHtml(a[1])), a[0], stripTags(decodeHtml(a[2])))) results.append((strip_tags(decodeHtml(a[1])), a[0], strip_tags(decodeHtml(a[2]))))
if len(results) >= max_results: if len(results) >= max_results:
break break
return results return results


@ -8,7 +8,7 @@ import time
import unicodedata import unicodedata
import ox import ox
from ox import findRe, stripTags from ox import findRe, strip_tags
from ox.normalize import normalizeTitle, normalizeImdbId from ox.normalize import normalizeTitle, normalizeImdbId
import ox.cache import ox.cache
@ -16,12 +16,9 @@ from siteparser import SiteParser
import google import google
def readUrl(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None): def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy() headers = headers.copy()
return ox.cache.readUrl(url, data, headers, timeout) return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
def readUrlUnicode(url, timeout=ox.cache.cache_timeout):
return ox.cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
def getUrl(id): def getUrl(id):
return "http://www.imdb.com/title/tt%s/" % id return "http://www.imdb.com/title/tt%s/" % id
@ -61,7 +58,7 @@ class Imdb(SiteParser):
'page': 'combined', 'page': 'combined',
're': [ 're': [
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>', '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
lambda ll: [stripTags(l) for l in ll] lambda ll: [strip_tags(l) for l in ll]
], ],
'type': 'list' 'type': 'list'
}, },
@ -266,8 +263,8 @@ class Imdb(SiteParser):
} }
} }
def readUrlUnicode(self, url, timeout): def read_url(self, url, timeout):
return readUrlUnicode(url, timeout) return read_url(url, timeout, unicode=True)
def __init__(self, id, timeout=-1): def __init__(self, id, timeout=-1):
#use akas.imdb.com to always get original title: #use akas.imdb.com to always get original title:
@ -276,7 +273,7 @@ class Imdb(SiteParser):
super(Imdb, self).__init__(timeout) super(Imdb, self).__init__(timeout)
url = self.baseUrl + 'combined' url = self.baseUrl + 'combined'
page = self.readUrlUnicode(url, timeout=-1) page = self.read_url(url, timeout=-1)
if '<title>IMDb: Page not found</title>' in page \ if '<title>IMDb: Page not found</title>' in page \
or 'The requested URL was not found on our server.' in page: or 'The requested URL was not found on our server.' in page:
return return
@ -460,7 +457,7 @@ def getMovieIdByTitle(title, timeout=-1):
params['q'] = params['q'].encode('utf-8') params['q'] = params['q'].encode('utf-8')
params = urllib.urlencode(params) params = urllib.urlencode(params)
url = "http://akas.imdb.com/find?" + params url = "http://akas.imdb.com/find?" + params
data = readUrlUnicode(url, timeout=timeout) data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page #if search results in redirect, get id of current page
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />' r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
results = re.compile(r).findall(data) results = re.compile(r).findall(data)
@ -538,7 +535,7 @@ def getMovieId(title, director='', year='', timeout=-1):
url = "http://akas.imdb.com/find?" + params url = "http://akas.imdb.com/find?" + params
#print url #print url
data = readUrlUnicode(url, timeout=timeout) data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page #if search results in redirect, get id of current page
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />' r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
results = re.compile(r).findall(data) results = re.compile(r).findall(data)
@ -569,7 +566,7 @@ def getMoviePoster(imdbId):
info = ImdbCombined(imdbId) info = ImdbCombined(imdbId)
if 'posterId' in info: if 'posterId' in info:
url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId) url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = readUrl(url) data = read_url(url)
poster = findRe(data, 'img id="primary-img".*?src="(.*?)"') poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')
return poster return poster
elif 'series' in info: elif 'series' in info:
@ -578,7 +575,7 @@ def getMoviePoster(imdbId):
def maxVotes(): def maxVotes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc' url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = ox.cache.readUrl(url) data = ox.cache.read_url(url)
votes = max([int(v.replace(',', '')) votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)]) for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
return votes return votes


@ -2,8 +2,8 @@
# encoding: utf-8 # encoding: utf-8
import re import re
from ox.cache import readUrlUnicode from ox.cache import read_url
from ox.html import stripTags from ox.html import strip_tags
from ox.text import findRe from ox.text import findRe
@ -21,11 +21,11 @@ def getData(id):
data = { data = {
'url': getUrl(id) 'url': getUrl(id)
} }
html = readUrlUnicode(data['url']) html = read_url(data['url'], unicode=True)
data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})') data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
if not data['imdbId']: if not data['imdbId']:
data['imdbId'] = _id_map.get(id, '') data['imdbId'] = _id_map.get(id, '')
data['title'] = stripTags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">')) data['title'] = strip_tags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)') data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
data['posters'] = [] data['posters'] = []
poster = findRe(html, '<img src="(posters.*?)"') poster = findRe(html, '<img src="(posters.*?)"')
@ -36,11 +36,11 @@ def getData(id):
for result in results: for result in results:
result = result.replace('_xlg.html', '.html') result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result) url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = readUrlUnicode(url) html = read_url(url, unicode=True)
result = findRe(html, '<a href = (\w*?_xlg.html)') result = findRe(html, '<a href = (\w*?_xlg.html)')
if result: if result:
url = 'http://www.impawards.com/%s/%s' % (data['year'], result) url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = readUrlUnicode(url) html = read_url(url, unicode=True)
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"')) poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
else: else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)"')) poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)"'))
@ -61,7 +61,7 @@ def getId(url):
def getIds(): def getIds():
ids = [] ids = []
html = readUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 60*60) html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
pages = int(findRe(html, '<a href= page(.*?).html>')) + 1 pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
for page in range(pages, 0, -1): for page in range(pages, 0, -1):
for id in getIdsByPage(page): for id in getIdsByPage(page):
@ -71,7 +71,7 @@ def getIds():
def getIdsByPage(page): def getIdsByPage(page):
ids = [] ids = []
html = readUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1) html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html) results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
for result in results: for result in results:
url = 'http://impawards.com/%s' % result url = 'http://impawards.com/%s' % result
@ -80,7 +80,7 @@ def getIdsByPage(page):
def getUrl(id): def getUrl(id):
url = u"http://www.impawards.com/%s.html" % id url = u"http://www.impawards.com/%s.html" % id
html = readUrlUnicode(url) html = read_url(url, unicode=True)
if findRe(html, "No Movie Posters on This Page"): if findRe(html, "No Movie Posters on This Page"):
url = u"http://www.impawards.com/%s_ver1.html" % id url = u"http://www.impawards.com/%s_ver1.html" % id
return url return url


@ -3,8 +3,8 @@
import re import re
import urllib import urllib
from ox.cache import readUrl from ox.cache import read_url
from ox.html import decodeHtml, stripTags from ox.html import decodeHtml, strip_tags
from ox.text import findRe from ox.text import findRe
from ox.text import findString from ox.text import findString
@ -113,20 +113,20 @@ class ItunesAlbum:
def getId(self): def getId(self):
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist}) url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
xml = readUrl(url, headers = ITUNES_HEADERS) xml = read_url(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewAlbum\?id=(.*?)&') id = findRe(xml, 'viewAlbum\?id=(.*?)&')
return id return id
def getData(self): def getData(self):
data = {'id': self.id} data = {'id': self.id}
url = composeUrl('viewAlbum', {'id': self.id}) url = composeUrl('viewAlbum', {'id': self.id})
xml = readUrl(url, None, ITUNES_HEADERS) xml = read_url(url, None, ITUNES_HEADERS)
data['albumName'] = findRe(xml, '<B>(.*?)</B>') data['albumName'] = findRe(xml, '<B>(.*?)</B>')
data['artistName'] = findRe(xml, '<b>(.*?)</b>') data['artistName'] = findRe(xml, '<b>(.*?)</b>')
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"') data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['genre'] = findRe(xml, 'Genre:(.*?)<') data['genre'] = findRe(xml, 'Genre:(.*?)<')
data['releaseDate'] = findRe(xml, 'Released(.*?)<') data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>')) data['review'] = strip_tags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['tracks'] = [] data['tracks'] = []
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>') strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
for string in strings: for string in strings:
@ -144,14 +144,14 @@ class ItunesMovie:
def getId(self): def getId(self):
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director}) url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
xml = readUrl(url, headers = ITUNES_HEADERS) xml = read_url(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewMovie\?id=(.*?)&') id = findRe(xml, 'viewMovie\?id=(.*?)&')
return id return id
def getData(self): def getData(self):
data = {'id': self.id} data = {'id': self.id}
url = composeUrl('viewMovie', {'id': self.id}) url = composeUrl('viewMovie', {'id': self.id})
xml = readUrl(url, None, ITUNES_HEADERS) xml = read_url(url, None, ITUNES_HEADERS)
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w') f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
f.write(xml) f.write(xml)
f.close() f.close()


@ -1,15 +1,15 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from ox.cache import readUrl from ox.cache import read_url
from ox.html import decodeHtml from ox.html import decodeHtml
from ox.text import findRe from ox.text import findRe
def getLyrics(title, artist): def getLyrics(title, artist):
html = readUrl('http://lyricsfly.com/api/') html = read_url('http://lyricsfly.com/api/')
key = findRe(html, '<font color=green><b>(.*?)</b></font>') key = findRe(html, '<font color=green><b>(.*?)</b></font>')
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title) url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
xml = readUrl(url) xml = read_url(url)
lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com') lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = lyrics.replace('\n', '').replace('\r', '') lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip() lyrics = lyrics.replace('[br]', '\n').strip()


@ -4,8 +4,8 @@ import re
from urllib import quote from urllib import quote
from lxml.html import document_fromstring from lxml.html import document_fromstring
from ox.cache import readUrl, readUrlUnicode from ox.cache import read_url
from ox import findRe, stripTags from ox import findRe, strip_tags
def getUrl(id): def getUrl(id):
return 'http://www.metacritic.com/movie/%s' % id return 'http://www.metacritic.com/movie/%s' % id
@ -15,18 +15,18 @@ def getId(url):
def getUrlByImdb(imdb): def getUrlByImdb(imdb):
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
data = readUrl(url) data = read_url(url)
metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"') metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"')
return metacritic_url or None return metacritic_url or None
def getMetacriticShowUrl(title): def getMetacriticShowUrl(title):
title = quote(title) title = quote(title)
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
data = readUrl(url) data = read_url(url)
return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?') return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
def getData(url): def getData(url):
data = readUrlUnicode(url) data = read_url(url, unicode=True)
doc = document_fromstring(data) doc = document_fromstring(data)
score = filter(lambda s: s.attrib.get('property') == 'v:average', score = filter(lambda s: s.attrib.get('property') == 'v:average',
doc.xpath('//span[@class="score_value"]')) doc.xpath('//span[@class="score_value"]'))
@ -51,7 +51,7 @@ def getData(url):
'critic': authors[i], 'critic': authors[i],
'url': urls[i], 'url': urls[i],
'source': sources[i], 'source': sources[i],
'quote': stripTags(reviews[i]).strip(), 'quote': strip_tags(reviews[i]).strip(),
'score': scores[i], 'score': scores[i],
}) })


@ -5,8 +5,8 @@ import re
import socket import socket
from urllib import quote from urllib import quote
from ox.cache import readUrl, readUrlUnicode from ox.cache import read_url
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, int_value, normalizeNewlines
from ox.normalize import normalizeImdbId from ox.normalize import normalizeImdbId
import ox import ox
@ -31,7 +31,7 @@ def findMovie(query, max_results=10):
'''search for torrents on mininova '''search for torrents on mininova
''' '''
url = "http://www.mininova.org/search/%s/seeds" % quote(query) url = "http://www.mininova.org/search/%s/seeds" % quote(query)
data = readUrlUnicode(url) data = read_url(url, unicode=True)
return _parseResultsPage(data, max_results) return _parseResultsPage(data, max_results)
def findMovieByImdb(imdbId): def findMovieByImdb(imdbId):
@ -39,7 +39,7 @@ def findMovieByImdb(imdbId):
''' '''
results = [] results = []
imdbId = normalizeImdbId(imdbId) imdbId = normalizeImdbId(imdbId)
data = readUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId) data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
return _parseResultsPage(data) return _parseResultsPage(data)
def getId(mininovaId): def getId(mininovaId):
@ -55,7 +55,7 @@ def getId(mininovaId):
def exists(mininovaId): def exists(mininovaId):
mininovaId = getId(mininovaId) mininovaId = getId(mininovaId)
data = ox.net.readUrl("http://www.mininova.org/tor/%s" % mininovaId) data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
if not data or 'Torrent not found...' in data: if not data or 'Torrent not found...' in data:
return False return False
if 'tracker</a> of this torrent requires registration.' in data: if 'tracker</a> of this torrent requires registration.' in data:
@ -74,22 +74,22 @@ def getData(mininovaId):
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
data = readUrlUnicode(torrent['comment_link']) + readUrlUnicode(torrent['details_link']) data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
if '<h1>Torrent not found...</h1>' in data: if '<h1>Torrent not found...</h1>' in data:
return None return None
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data): for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
key = d[0].lower().strip() key = d[0].lower().strip()
key = _key_map.get(key, key) key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip())) value = decodeHtml(strip_tags(d[1].strip()))
torrent[key] = value torrent[key] = value
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>') torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})') torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>') torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
if torrent['description']: if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip() torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
t = readUrl(torrent[u'torrent_link']) t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t) torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent return torrent
@ -109,13 +109,13 @@ class Mininova(Torrent):
self['seeder'] = -1 self['seeder'] = -1
self['leecher'] = -1 self['leecher'] = -1
if len(ratio) == 2: if len(ratio) == 2:
val = intValue(ratio[0].replace(',','').strip()) val = int_value(ratio[0].replace(',','').strip())
if val: if val:
self['seeder'] = int(val) self['seeder'] = int(val)
val = intValue(ratio[1].replace(',','').strip()) val = int_value(ratio[1].replace(',','').strip())
if val: if val:
self['leecher'] = int(val) self['leecher'] = int(val)
val = intValue(self.data['downloads'].replace(',','').strip()) val = int_value(self.data['downloads'].replace(',','').strip())
if val: if val:
self['downloaded'] = int(val) self['downloaded'] = int(val)
else: else:


@ -3,7 +3,7 @@
import re import re
from ox.cache import readUrlUnicode from ox.cache import read_url
from ox import findRe from ox import findRe
def getData(id): def getData(id):
@ -24,7 +24,7 @@ def getId(url):
def getPostersByUrl(url, group=True, timeout=-1): def getPostersByUrl(url, group=True, timeout=-1):
posters = [] posters = []
html = readUrlUnicode(url, timeout=timeout) html = read_url(url, timeout=timeout, unicode=True)
if url in html: if url in html:
if group: if group:
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html) results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
@ -32,7 +32,7 @@ def getPostersByUrl(url, group=True, timeout=-1):
posters += getPostersByUrl(result, False) posters += getPostersByUrl(result, False)
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html) results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
for result in results: for result in results:
html = readUrlUnicode(result, timeout=timeout) html = read_url(result, timeout=timeout, unicode=True)
posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"')) posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
return posters return posters


@ -3,8 +3,8 @@
import re import re
import feedparser import feedparser
from ox.cache import readUrl, readUrlUnicode from ox.cache import read_url
from ox import findRe, stripTags from ox import findRe, strip_tags
from ox import langCode2To3, langTo3Code from ox import langCode2To3, langTo3Code
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"): def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
@@ -16,7 +16,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
if language: if language:
url += "sublanguageid-%s/" % language url += "sublanguageid-%s/" % language
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb) url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
data = readUrl(url) data = read_url(url)
if "title>opensubtitles.com - search results</title" in data: if "title>opensubtitles.com - search results</title" in data:
fd = feedparser.parse(data) fd = feedparser.parse(data)
opensubtitleId = None opensubtitleId = None
@@ -31,11 +31,11 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
def downloadSubtitleById(opensubtitle_id): def downloadSubtitleById(opensubtitle_id):
srts = {} srts = {}
data = readUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id) data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>' reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
for f in re.compile(reg_exp, re.DOTALL).findall(data): for f in re.compile(reg_exp, re.DOTALL).findall(data):
name = stripTags(f[1]).split('\n')[0] name = strip_tags(f[1]).split('\n')[0]
url = "http://www.opensubtitles.com%s" % f[0] url = "http://www.opensubtitles.com%s" % f[0]
srts[name] = readUrlUnicode(url) srts[name] = read_url(url, unicode=True)
return srts return srts
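A hedged usage sketch for the two opensubtitles helpers above, assuming the module lives at ox.web.opensubtitles and that findSubtitlesByImdb returns the matching opensubtitle id (or None):

    from ox.web.opensubtitles import findSubtitlesByImdb, downloadSubtitleById

    opensubtitle_id = findSubtitlesByImdb('0133093', parts=1, language='eng')
    if opensubtitle_id:
        # maps subtitle file names to decoded srt text
        srts = downloadSubtitleById(opensubtitle_id)
        for name in sorted(srts):
            print name, len(srts[name])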

View file

@@ -1,11 +1,11 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from ox.net import readUrlUnicode from ox.net import read_url
def getPosterUrl(id): def getPosterUrl(id):
url = 'http://piratecinema.org/posters/' url = 'http://piratecinema.org/posters/'
html = readUrlUnicode(url) html = read_url(url, unicode=True)
results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html) results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
for result in results: for result in results:
if result[1] == id: if result[1] == id:

View file

@@ -2,8 +2,8 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from ox.cache import getHeaders, readUrl, readUrlUnicode from ox.cache import get_headers, read_url
from ox import findRe, stripTags from ox import findRe, strip_tags
def getUrlByImdb(imdb): def getUrlByImdb(imdb):
@@ -14,7 +14,7 @@ def getUrlByImdb(imdb):
return u.url return u.url
''' '''
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
data = readUrl(url) data = read_url(url)
if "movie_title" in data: if "movie_title" in data:
movies = re.compile('(/m/.*?/)').findall(data) movies = re.compile('(/m/.*?/)').findall(data)
if movies: if movies:
@@ -25,13 +25,13 @@ def get_og(data, key):
return findRe(data, '<meta property="og:%s".*?content="(.*?)"' % key) return findRe(data, '<meta property="og:%s".*?content="(.*?)"' % key)
def getData(url): def getData(url):
data = readUrl(url) data = read_url(url)
r = {} r = {}
r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>') r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
if '(' in r['title']: if '(' in r['title']:
r['year'] = findRe(r['title'], '\((\d*?)\)') r['year'] = findRe(r['title'], '\((\d*?)\)')
r['title'] = stripTags(re.sub('\((\d*?)\)', '', r['title'])).strip() r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
r['summary'] = stripTags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip() r['summary'] = strip_tags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace(' ', ' ').replace(' ', ' ') r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')
if not r['summary']: if not r['summary']:
r['summary'] = get_og(data, 'description') r['summary'] = get_og(data, 'description')
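For the rottentomatoes module, a hedged usage sketch (module path assumed; 'title', 'year' and 'summary' are the keys populated by getData above):

    from ox.web.rottentomatoes import getUrlByImdb, getData

    url = getUrlByImdb('0133093')  # resolve an imdb id to a rottentomatoes page
    if url:
        info = getData(url)
        print info['title'], info.get('year', '')
        print info['summary']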

View file

@@ -2,16 +2,16 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from ..cache import readUrlUnicode from ..cache import read_url
from .. import stripTags, decodeHtml from .. import strip_tags, decodeHtml
from ..utils import datetime from ..utils import datetime
def cleanup(key, data, data_type): def cleanup(key, data, data_type):
if data: if data:
if isinstance(data[0], basestring): if isinstance(data[0], basestring):
#FIXME: some types need stripTags #FIXME: some types need strip_tags
#data = [stripTags(decodeHtml(p)).strip() for p in data] #data = [strip_tags(decodeHtml(p)).strip() for p in data]
data = [decodeHtml(p).strip() for p in data] data = [decodeHtml(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple): elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data] data = [cleanup(key, p, data_type) for p in data]
@@ -30,13 +30,13 @@ class SiteParser(dict):
def getUrl(self, page): def getUrl(self, page):
return "%s%s" % (self.baseUrl, page) return "%s%s" % (self.baseUrl, page)
def readUrlUnicode(self, url, timeout): def read_url(self, url, timeout):
return readUrlUnicode(url, timeout=timeout) return read_url(url, timeout=timeout, unicode=True)
def __init__(self, timeout=-1): def __init__(self, timeout=-1):
for key in self.regex: for key in self.regex:
url = self.getUrl(self.regex[key]['page']) url = self.getUrl(self.regex[key]['page'])
data = self.readUrlUnicode(url, timeout) data = self.read_url(url, timeout)
if isinstance(self.regex[key]['re'], basestring): if isinstance(self.regex[key]['re'], basestring):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data) data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type']) data = cleanup(key, data, self.regex[key]['type'])
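SiteParser subclasses stay declarative: they provide baseUrl and a regex mapping, and the constructor fetches each page through the renamed read_url method and cleans the matches. A rough sketch, with the module path assumed and the site, pattern and 'type' value invented for illustration (exact cleanup semantics assumed):

    from ox.web.siteparser import SiteParser

    class ExampleSite(SiteParser):
        baseUrl = 'http://www.example.com/'
        regex = {
            'title': {
                'page': 'film/123',
                're': '<h1>(.*?)</h1>',
                'type': 'string',
            },
        }

    # item = ExampleSite(timeout=24*60*60)
    # print item['title']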

View file

@@ -5,7 +5,7 @@ import re
import time import time
import ox.cache import ox.cache
from ox.html import decodeHtml, stripTags from ox.html import decodeHtml, strip_tags
import ox.net import ox.net
@@ -21,11 +21,11 @@ def getNews(year, month, day):
for section in sections: for section in sections:
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day) url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
if date == time.strftime('%d.%m.%Y', time.localtime()): if date == time.strftime('%d.%m.%Y', time.localtime()):
html = ox.net.readUrl(url) html = ox.net.read_url(url)
else: else:
html = ox.cache.readUrl(url) html = ox.cache.read_url(url)
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html): for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip() dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
try: try:
description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0]) description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
except: except:
@@ -104,12 +104,12 @@ def getIssue(year, week):
return None return None
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week) url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
contents = [] contents = []
data = ox.cache.readUrl(url) data = ox.cache.read_url(url)
items = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data) items = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
for item in items: for item in items:
item = item[1] item = item[1]
page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0]) page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
title = stripTags(item).strip() title = strip_tags(item).strip()
contents.append({'title': title, 'page': page}) contents.append({'title': title, 'page': page})
pageUrl = {} pageUrl = {}
pages = page + 2 pages = page + 2
@@ -163,7 +163,7 @@ def archiveIssues():
f.close() f.close()
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w) filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
if not os.path.exists(filename): if not os.path.exists(filename):
data = ox.cache.readUrl(issue['coverUrl']) data = ox.cache.read_url(issue['coverUrl'])
f = open(filename, 'w') f = open(filename, 'w')
f.write(data) f.write(data)
f.close() f.close()
@@ -172,7 +172,7 @@ def archiveIssues():
if url: if url:
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page) filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
if not os.path.exists(filename): if not os.path.exists(filename):
data = ox.cache.readUrl(url) data = ox.cache.read_url(url)
f = open(filename, 'w') f = open(filename, 'w')
f.write(data) f.write(data)
f.close() f.close()
@@ -243,7 +243,7 @@ def archiveNews():
f.close() f.close()
filename = dirname + '/' + new['imageUrl'].split('/')[-1] filename = dirname + '/' + new['imageUrl'].split('/')[-1]
if not os.path.exists(filename): if not os.path.exists(filename):
data = ox.cache.readUrl(new['imageUrl']) data = ox.cache.read_url(new['imageUrl'])
f = open(filename, 'w') f = open(filename, 'w')
f.write(data) f.write(data)
f.close() f.close()
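getNews only bypasses the cache for today's pages, which may still change; a condensed sketch of that choice (fetch_page is a hypothetical helper name):

    import time
    import ox.net
    import ox.cache

    def fetch_page(url, date):
        # today's page is fetched live, older pages come from the cache
        if date == time.strftime('%d.%m.%Y', time.localtime()):
            return ox.net.read_url(url)
        return ox.cache.read_url(url)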

View file

@@ -6,8 +6,7 @@ import socket
from urllib import quote, urlencode from urllib import quote, urlencode
from urllib2 import URLError from urllib2 import URLError
from ox.cache import readUrl, readUrlUnicode from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, normalizeNewlines
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from ox.normalize import normalizeImdbId from ox.normalize import normalizeImdbId
import ox import ox
@ -18,13 +17,10 @@ cache_timeout = 24*60*60 # cache search only for 24 hours
season_episode = re.compile("S..E..", re.IGNORECASE) season_episode = re.compile("S..E..", re.IGNORECASE)
def _readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None): def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy() headers = headers.copy()
headers['Cookie'] = 'language=en_EN' headers['Cookie'] = 'language=en_EN'
return cache.readUrl(url, data, headers, timeout) return cache.read_url(url, data, headers, timeout, unicode=unicode)
def _readUrlUnicode(url, timeout=cache.cache_timeout):
return cache.readUrlUnicode(url, _readUrl=_readUrl, timeout=timeout)
def findMovies(query, max_results=10): def findMovies(query, max_results=10):
results = [] results = []
@@ -37,7 +33,7 @@ def findMovies(query, max_results=10):
if not url.startswith('/'): if not url.startswith('/'):
url = "/" + url url = "/" + url
url = "http://thepiratebay.org" + url url = "http://thepiratebay.org" + url
data = _readUrlUnicode(url, timeout=cache_timeout) data = read_url(url, timeout=cache_timeout, unicode=True)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>''' regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data): for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0] torrentType = row[0]
@@ -83,7 +79,7 @@ def getData(piratebayId):
torrent[u'domain'] = 'thepiratebay.org' torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
data = _readUrlUnicode(torrent['comment_link']) data = read_url(torrent['comment_link'], unicode=True)
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>') torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']: if not torrent[u'title']:
return None return None
@@ -94,12 +90,12 @@ def getData(piratebayId):
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data): for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip() key = d[0].lower().strip()
key = _key_map.get(key, key) key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip())) value = decodeHtml(strip_tags(d[1].strip()))
torrent[key] = value torrent[key] = value
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>') torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']: if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip() torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
t = _readUrl(torrent[u'torrent_link']) t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t) torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent return torrent
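With the unicode flag on cache.read_url, thepiratebay no longer needs two private wrappers; the one surviving wrapper only injects the language cookie. A condensed before/after of the call sites, mirroring the hunks above:

    # before: one wrapper per return type
    data = _readUrlUnicode(url, timeout=cache_timeout)
    t = _readUrl(torrent[u'torrent_link'])

    # after: the single module-level wrapper, unicode on request
    data = read_url(url, timeout=cache_timeout, unicode=True)
    t = read_url(torrent[u'torrent_link'])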

View file

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from ox import intValue from ox import int_value
class Torrent(dict): class Torrent(dict):
@@ -25,7 +25,7 @@ class Torrent(dict):
for key in self._int_keys: for key in self._int_keys:
value = self.data.get(key, -1) value = self.data.get(key, -1)
if not isinstance(value, int): if not isinstance(value, int):
value = int(intValue(value)) value = int(int_value(value))
self[key] = value self[key] = value
self['infohash'] = self.data['torrent_info'].get('hash', '') self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1) self['size'] = self.data['torrent_info'].get('size', -1)

View file

@@ -3,8 +3,8 @@
import re import re
import time import time
from ox import stripTags, findRe from ox import strip_tags, findRe
from ox.cache import readUrlUnicode from ox.cache import read_url
def getEpisodeData(url): def getEpisodeData(url):
@@ -14,9 +14,9 @@ def getEpisodeData(url):
example: example:
getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html') getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
''' '''
data = readUrlUnicode(url) data = read_url(url, unicode=True)
r = {} r = {}
r['description'] = stripTags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0]) r['description'] = strip_tags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = findRe(data, '<h1>(.*?)</h1>') r['show'] = findRe(data, '<h1>(.*?)</h1>')
r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>') r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
#episode score #episode score

View file

@@ -4,13 +4,13 @@ import re
from StringIO import StringIO from StringIO import StringIO
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from ox.cache import readUrl, readUrlUnicode from ox.cache import read_url
from ox import findString, findRe from ox import findString, findRe
def getData(id): def getData(id):
url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
xml = readUrl(url) xml = read_url(url)
tree = ET.parse(StringIO(xml)) tree = ET.parse(StringIO(xml))
request_signature = tree.find('request_signature').text request_signature = tree.find('request_signature').text
request_signature_expires = tree.find('request_signature_expires').text request_signature_expires = tree.find('request_signature_expires').text

View file

@@ -4,7 +4,7 @@ import re
from urllib import urlencode from urllib import urlencode
from ox.utils import json from ox.utils import json
from ox.cache import readUrl, readUrlUnicode from ox.cache import read_url
from ox import findRe, decodeHtml from ox import findRe, decodeHtml
@@ -47,7 +47,7 @@ def getUrlByAllmovieId(allmovieId):
def getWikiData(wikipediaUrl): def getWikiData(wikipediaUrl):
url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=') url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
url = "%s&action=raw" % url url = "%s&action=raw" % url
data = readUrl(url).decode('utf-8') data = read_url(url).decode('utf-8')
return data return data
def getMovieData(wikipediaUrl): def getMovieData(wikipediaUrl):
@@ -106,7 +106,7 @@ def getMovieData(wikipediaUrl):
def getImageUrl(name): def getImageUrl(name):
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20') url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
data = readUrlUnicode(url) data = read_url(url, unicode=True)
url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"') url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
if not url: if not url:
url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"') url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"')
@@ -133,9 +133,9 @@ def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json', query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')} 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query) url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
data = readUrl(url) data = read_url(url)
if not data: if not data:
data = readUrl(url, timeout=0) data = read_url(url, timeout=0)
result = json.loads(data) result = json.loads(data)
results = [] results = []
if result and 'query' in result: if result and 'query' in result:
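find retries an empty cached response with timeout=0 to force a fresh request; a condensed sketch of that cache-busting pattern (api_search is a hypothetical helper name):

    from urllib import urlencode
    from ox.cache import read_url
    from ox.utils import json

    def api_search(query, max_results=10):
        params = {'action': 'query', 'list': 'search', 'format': 'json',
                  'srlimit': max_results, 'srwhat': 'text',
                  'srsearch': query.encode('utf-8')}
        url = 'http://en.wikipedia.org/w/api.php?' + urlencode(params)
        data = read_url(url)
        if not data:
            # an empty cache entry: retry and bypass the cache
            data = read_url(url, timeout=0)
        return json.loads(data) if data else {}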

View file

@@ -5,7 +5,7 @@ import re
from xml.dom.minidom import parseString from xml.dom.minidom import parseString
import feedparser import feedparser
from ox.cache import readUrl, cache_timeout from ox.cache import read_url, cache_timeout
def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout): def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout):
@@ -33,7 +33,7 @@ def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout):
def find(query, max_results=10, offset=1, orderBy='relevance'): def find(query, max_results=10, offset=1, orderBy='relevance'):
query = quote(query) query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results) url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
data = readUrl(url) data = read_url(url)
fd = feedparser.parse(data) fd = feedparser.parse(data)
videos = [] videos = []
for item in fd.entries: for item in fd.entries:
@@ -48,7 +48,7 @@ def find(query, max_results=10, offset=1, orderBy='relevance'):
def info(id): def info(id):
info = {} info = {}
url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
data = readUrl(url) data = read_url(url)
xml = parseString(data) xml = parseString(data)
info['url'] = 'http://www.youtube.com/watch?v=%s' % id info['url'] = 'http://www.youtube.com/watch?v=%s' % id
info['title'] = xml.getElementsByTagName('title')[0].firstChild.data info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
@@ -62,21 +62,21 @@ def info(id):
info['keywords'] = xml.getElementsByTagName('media:keywords')[0].firstChild.data.split(', ') info['keywords'] = xml.getElementsByTagName('media:keywords')[0].firstChild.data.split(', ')
url = "http://www.youtube.com/watch?v=%s" % id url = "http://www.youtube.com/watch?v=%s" % id
data = readUrl(url) data = read_url(url)
match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data) match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
if match: if match:
info['license'] = match[0].strip() info['license'] = match[0].strip()
info['license'] = re.sub('<.+?>', '', info['license']).strip() info['license'] = re.sub('<.+?>', '', info['license']).strip()
url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1"%id url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1"%id
data = readUrl(url) data = read_url(url)
xml = parseString(data) xml = parseString(data)
languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')] languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
if languages: if languages:
info['subtitles'] = {} info['subtitles'] = {}
for language in languages: for language in languages:
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language) url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
data = readUrl(url) data = read_url(url)
xml = parseString(data) xml = parseString(data)
subs = [] subs = []
for t in xml.getElementsByTagName('text'): for t in xml.getElementsByTagName('text'):
@@ -101,7 +101,7 @@ def videos(id, format=''):
'mp4': 'video/mp4' 'mp4': 'video/mp4'
}.get(format) }.get(format)
url = "http://www.youtube.com/watch?v=%s" % id url = "http://www.youtube.com/watch?v=%s" % id
data = readUrl(url) data = read_url(url)
match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data) match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data)
streams = {} streams = {}
for x in match[0].split(','): for x in match[0].split(','):