net/cache readUrl->read_url / Unicode -> unicode=True
format replace all CamelCase with under_score
parent c1d0fc6242
commit 2de989e188
33 changed files with 243 additions and 254 deletions
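A minimal caller-side sketch of the cache API migration (hypothetical URL; the old readUrl/readUrlUnicode helpers and the new read_url signature are taken from the ox/cache.py diff below):

    import ox.cache

    # before: ox.cache.readUrl(url) / ox.cache.readUrlUnicode(url)
    data = ox.cache.read_url('http://example.com/feed')                # cached, raw bytes
    text = ox.cache.read_url('http://example.com/feed', unicode=True)  # decoded via detect_encoding()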
ox/cache.py (38 changed lines)

@@ -18,7 +18,7 @@ from utils import json
 from .file import makedirs

 import net
-from net import DEFAULT_HEADERS, getEncoding
+from net import DEFAULT_HEADERS, detect_encoding

 cache_timeout = 30*24*60*60 # default is 30 days

@@ -40,7 +40,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
 >>> status('http://google.com/mysearch')
 404
 '''
-headers = getHeaders(url, data, headers)
+headers = get_headers(url, data, headers)
 return int(headers['status'])

 def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
@@ -55,10 +55,10 @@ def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
 return True
 return False

-def getHeaders(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
 url_headers = store.get(url, data, headers, timeout, "headers")
 if not url_headers:
-url_headers = net.getHeaders(url, data, headers)
+url_headers = net.get_headers(url, data, headers)
 store.set(url, data, -1, url_headers)
 return url_headers

@@ -68,7 +68,7 @@ class InvalidResult(Exception):
 self.result = result
 self.headers = headers

-def readUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None):
+def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
 '''
 url - url to load
 data - possible post data
@@ -80,31 +80,29 @@ def readUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, vali
 #FIXME: send last-modified / etag from cache and only update if needed
 if isinstance(url, unicode):
 url = url.encode('utf-8')
-result = store.get(url, data, headers, timeout)
+data = store.get(url, data, headers, timeout)
-if not result:
+if not data:
 #print "get data", url
 try:
-url_headers, result = net.readUrl(url, data, headers, returnHeaders=True)
+url_headers, data = net.read_url(url, data, headers, return_headers=True)
 except urllib2.HTTPError, e:
 e.headers['Status'] = "%s" % e.code
 url_headers = dict(e.headers)
-result = e.read()
+data = e.read()
 if url_headers.get('content-encoding', None) == 'gzip':
-result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
+data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
-if not valid or valid(result, url_headers):
+if not valid or valid(data, url_headers):
-store.set(url, data, result, url_headers)
+store.set(url, data, data, url_headers)
 else:
-raise InvalidResult(result, url_headers)
+raise InvalidResult(data, url_headers)
-return result
+if unicode:
+encoding = detect_encoding(data)
-def readUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _readUrl=readUrl, valid=None):
-data = _readUrl(url, data, headers, timeout, valid)
-encoding = getEncoding(data)
 if not encoding:
 encoding = 'latin-1'
-return unicode(data, encoding)
+data = data.decode(encoding)
+return data

-def saveUrl(url, filename, overwrite=False):
+def save_url(url, filename, overwrite=False):
 if not os.path.exists(filename) or overwrite:
 dirname = os.path.dirname(filename)
 if not os.path.exists(dirname):
ox/format.py (84 changed lines)

@@ -217,15 +217,15 @@ def to36(q):
 def from36(q):
 return int(q, 36)

-def intValue(strValue, default=u''):
+def int_value(strValue, default=u''):
 """
->>> intValue('abc23')
+>>> int_value('abc23')
 u'23'

->>> intValue(' abc23')
+>>> int_value(' abc23')
 u'23'

->>> intValue('ab')
+>>> int_value('ab')
 u''
 """
 try:
@@ -234,15 +234,15 @@ def intValue(strValue, default=u''):
 val = default
 return val

-def floatValue(strValue, default=u''):
+def float_value(strValue, default=u''):
 """
->>> floatValue('abc23.4')
+>>> float_value('abc23.4')
 u'23.4'

->>> floatValue(' abc23.4')
+>>> float_value(' abc23.4')
 u'23.4'

->>> floatValue('ab')
+>>> float_value('ab')
 u''
 """
 try:
@@ -251,46 +251,46 @@ def floatValue(strValue, default=u''):
 val = default
 return val

-def formatNumber(number, longName, shortName):
+def format_number(number, longName, shortName):
 """
 Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)

->>> formatNumber(123, 'Byte', 'B')
+>>> format_number(123, 'Byte', 'B')
 '123 Bytes'

->>> formatNumber(1234, 'Byte', 'B')
+>>> format_number(1234, 'Byte', 'B')
 '1 KB'

->>> formatNumber(1234567, 'Byte', 'B')
+>>> format_number(1234567, 'Byte', 'B')
 '1.2 MB'

->>> formatNumber(1234567890, 'Byte', 'B')
+>>> format_number(1234567890, 'Byte', 'B')
 '1.15 GB'

->>> formatNumber(1234567890123456789, 'Byte', 'B')
+>>> format_number(1234567890123456789, 'Byte', 'B')
 '1,096.5166 PB'

->>> formatNumber(-1234567890123456789, 'Byte', 'B')
+>>> format_number(-1234567890123456789, 'Byte', 'B')
 '-1,096.5166 PB'

 """
 if abs(number) < 1024:
-return '%s %s%s' % (formatThousands(number), longName, number != 1 and 's' or '')
+return '%s %s%s' % (format_thousands(number), longName, number != 1 and 's' or '')
 prefix = ['K', 'M', 'G', 'T', 'P']
 for i in range(5):
 if abs(number) < math.pow(1024, i + 2) or i == 4:
 n = number / math.pow(1024, i + 1)
-return '%s %s%s' % (formatThousands('%.*f' % (i, n)), prefix[i], shortName)
+return '%s %s%s' % (format_thousands('%.*f' % (i, n)), prefix[i], shortName)

-def formatThousands(number, separator = ','):
+def format_thousands(number, separator = ','):
 """
 Return the number with separators (1,000,000)

->>> formatThousands(1)
+>>> format_thousands(1)
 '1'
->>> formatThousands(1000)
+>>> format_thousands(1000)
 '1,000'
->>> formatThousands(1000000)
+>>> format_thousands(1000000)
 '1,000,000'
 """
 string = str(number).split('.')
@@ -302,16 +302,16 @@ def formatThousands(number, separator = ','):
 string[0] = ''.join(l)
 return '.'.join(string)

-def formatBits(number):
+def format_bits(number):
-return formatNumber(number, 'bit', 'b')
+return format_number(number, 'bit', 'b')

-def formatBytes(number):
+def format_bytes(number):
-return formatNumber(number, 'byte', 'B')
+return format_number(number, 'byte', 'B')

-def formatPixels(number):
+def format_pixels(number):
-return formatNumber(number, 'pixel', 'px')
+return format_number(number, 'pixel', 'px')

-def formatCurrency(amount, currency="$"):
+def format_currency(amount, currency="$"):
 if amount:
 temp = "%.2f" % amount
 profile=re.compile(r"(\d)(\d\d\d[.,])")
@@ -336,9 +336,9 @@ def plural(amount, unit, plural='s'):
 if plural == 's':
 unit = unit + plural
 else: unit = plural
-return "%s %s" % (formatThousands(amount), unit)
+return "%s %s" % (format_thousands(amount), unit)

-def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
+def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
 '''
 verbosity
 0: D:HH:MM:SS
@@ -353,13 +353,13 @@ def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
 milliseconds
 True: always display milliseconds
 False: never display milliseconds
->>> formatDuration(1000 * 60 * 60 * 24 * 366)
+>>> format_duration(1000 * 60 * 60 * 24 * 366)
 '1:001:00:00:00.000'
->>> formatDuration(1000 * 60 * 60 * 24 * 366, years=False)
+>>> format_duration(1000 * 60 * 60 * 24 * 366, years=False)
 '366:00:00:00.000'
->>> formatDuration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
+>>> format_duration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
 '1 year 2 seconds 3 milliseconds'
->>> formatDuration(1000 * 30, hours=False, milliseconds=False)
+>>> format_duration(1000 * 30, hours=False, milliseconds=False)
 '00:30'
 '''
 if not ms and ms != 0:
@@ -403,7 +403,7 @@ def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
 return duration

 def ms2runtime(ms, shortenLong=False):
-# deprecated - use formatDuration
+# deprecated - use format_duration
 '''
 >>> ms2runtime(5000)
 '5 seconds'
@@ -415,11 +415,11 @@ def ms2runtime(ms, shortenLong=False):
 '13 hours 53 minutes'
 '''
 if shortenLong and ms > 1000 * 60 * 60 * 24 * 464:
-return formatDuration(ms, verbosity=1, milliseconds=False)
+return format_duration(ms, verbosity=1, milliseconds=False)
-return formatDuration(ms, verbosity=2, milliseconds=False)
+return format_duration(ms, verbosity=2, milliseconds=False)

 def ms2playtime(ms, hours=False):
-# deprecated - use formatDuration
+# deprecated - use format_duration
 '''
 >>> ms2playtime(5000)
 '00:05'
@@ -428,15 +428,15 @@ def ms2playtime(ms, hours=False):
 >>> ms2playtime(50000000)
 '13:53:20'
 '''
-return formatDuration(ms, hours=False, years=False, milliseconds=False)
+return format_duration(ms, hours=False, years=False, milliseconds=False)

 def ms2time(ms):
-# deprecated - use formatDuration
+# deprecated - use format_duration
 '''
 >>> ms2time(44592123)
 '12:23:12.123'
 '''
-return formatDuration(ms, years=False)
+return format_duration(ms, years=False)

 def time2ms(timeString):
 '''
@@ -451,7 +451,7 @@ def time2ms(timeString):
 ms = ms * 60 + float(_p)
 return int(ms * 1000)

-def shiftTime(offset, timeString):
+def shift_time(offset, timeString):
 newTime = time2ms(timeString) + offset
 return ms2time(newTime)
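A short, hedged sketch of how callers pick up the ox.format renames (return values taken from the doctests above; assumes these helpers stay importable from the top-level ox package, as the ox.format_duration call in the srt hunk further below suggests):

    import ox

    ox.format_thousands(1000000)             # '1,000,000'  (was formatThousands)
    ox.format_number(1234567, 'Byte', 'B')   # '1.2 MB'     (was formatNumber)
    ox.format_duration(1000 * 30, hours=False, milliseconds=False)  # '00:30'  (was formatDuration)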
ox/net.py (30 changed lines)

@@ -22,7 +22,7 @@ DEFAULT_HEADERS = {

 def status(url, data=None, headers=DEFAULT_HEADERS):
 try:
-f = openUrl(url, data, headers)
+f = open_url(url, data, headers)
 s = f.code
 except urllib2.HTTPError, e:
 s = e.code
@@ -34,9 +34,9 @@ def exists(url, data=None, headers=DEFAULT_HEADERS):
 return True
 return False

-def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
+def headers(url, data=None, headers=DEFAULT_HEADERS):
 try:
-f = openUrl(url, data, headers)
+f = open_url(url, data, headers)
 f.headers['Status'] = "%s" % f.code
 headers = f.headers
 f.close()
@@ -45,30 +45,28 @@ def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
 headers = e.headers
 return dict(headers)

-def openUrl(url, data=None, headers=DEFAULT_HEADERS):
+def open_url(url, data=None, headers=DEFAULT_HEADERS):
 url = url.replace(' ', '%20')
 req = urllib2.Request(url, data, headers)
 return urllib2.urlopen(req)

-def readUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
+def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
-f = openUrl(url, data, headers)
+f = open_url(url, data, headers)
 data = f.read()
 f.close()
 if f.headers.get('content-encoding', None) == 'gzip':
 data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
-if returnHeaders:
+if unicode:
+encoding = detect_encoding(data)
+if not encoding:
+encoding = 'latin-1'
+data = data.decode(encoding)
+if return_headers:
 f.headers['Status'] = "%s" % f.code
 return dict(f.headers), data
 return data

-def readUrlUnicode(url, data=None, headers=DEFAULT_HEADERS):
+def detect_encoding(data):
-data = readUrl(url, data, headers)
-encoding = getEncoding(data)
-if not encoding:
-encoding = 'latin-1'
-return unicode(data, encoding)

-def getEncoding(data):
 if 'content="text/html; charset=utf-8"' in data:
 return 'utf-8'
 elif 'content="text/html; charset=iso-8859-1"' in data:
@@ -81,7 +79,7 @@ def getEncoding(data):
 detector.close()
 return detector.result['encoding']

-def saveUrl(url, filename, overwrite=False):
+def save_url(url, filename, overwrite=False):
 if not os.path.exists(filename) or overwrite:
 dirname = os.path.dirname(filename)
 if not os.path.exists(dirname):
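A hedged usage sketch for the reworked ox.net.read_url above (hypothetical URL; the return_headers and unicode flags behave as shown in the diff):

    import ox.net

    body = ox.net.read_url('http://example.com/')                            # raw bytes
    headers, body = ox.net.read_url('http://example.com/', return_headers=True)
    text = ox.net.read_url('http://example.com/', unicode=True)              # decoded via detect_encoding()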
@@ -97,8 +97,8 @@ def encode(data):
 for s in data:
 srt += '%d\r\n%s --> %s\r\n%s\r\n\r\n' % (
 i,
-ox.formatDuration(s['in']*1000, years=False).replace('.', ','),
+ox.format_duration(s['in']*1000, years=False).replace('.', ','),
-ox.formatDuration(s['out']*1000, years=False).replace('.', ','),
+ox.format_duration(s['out']*1000, years=False).replace('.', ','),
 s['value'].replace('\n', '\r\n').strip()
 )
 i += 1
@@ -3,8 +3,8 @@
 import re
 import time

-from ox import stripTags, findRe
+from ox import strip_tags, findRe
-from ox.cache import readUrlUnicode
+from ox.cache import read_url


 def getId(url):
@@ -26,7 +26,7 @@ def getData(id):
 data = {
 "url": getUrl(id)
 }
-html = readUrlUnicode(data["url"])
+html = read_url(data["url"], unicode=True)
 data['aka'] = parseList(html, 'AKA')
 data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
 data['countries'] = parseList(html, 'countries')
@@ -40,18 +40,18 @@ def getData(id):
 data['releasedate'] = parseList(html, 'release date')
 data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
 data['set'] = parseEntry(html, 'set in')
-data['synopsis'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
+data['synopsis'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
 data['themes'] = parseList(html, 'themes')
 data['types'] = parseList(html, 'types')
 data['year'] = findRe(html, '<span class="year">.*?(\d+)')
 #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
 data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
-#html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
+#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
 #data['cast'] = parseTable(html)
-#html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
+#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
 #data['credits'] = parseTable(html)
-html = readUrlUnicode("http://allmovie.com/work/%s/review" % id)
+html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
-data['review'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
+data['review'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
 return data

 def getUrl(id):
@@ -59,26 +59,26 @@ def getUrl(id):

 def parseEntry(html, title):
 html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
-return stripTags(html).strip()
+return strip_tags(html).strip()

 def parseList(html, title):
 html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
-r = map(lambda x: stripTags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
+r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
 if not r and html:
-r = [stripTags(html)]
+r = [strip_tags(html)]
 return r

 def parseTable(html):
 return map(
 lambda x: map(
-lambda x: stripTags(x).strip().replace(' ', ''),
+lambda x: strip_tags(x).strip().replace(' ', ''),
 x.split('<td width="305">-')
 ),
 findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
 )

 def parseText(html, title):
-return stripTags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
+return strip_tags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()

 if __name__ == '__main__':
 print getData('129689')
@@ -3,14 +3,14 @@
 import re
 from urllib import quote

-from ox import findRe, stripTags, decodeHtml
+from ox import findRe, strip_tags, decodeHtml
-from ox.cache import readUrlUnicode
+from ox.cache import read_url


 def findISBN(title, author):
 q = '%s %s' % (title, author)
 url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
-data = readUrlUnicode(url)
+data = read_url(url, unicode=True)
 links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
 id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
 data = getData(id)
@@ -20,7 +20,7 @@ def findISBN(title, author):

 def getData(id):
 url = "http://www.amazon.com/title/dp/%s/" % id
-data = readUrlUnicode(url)
+data = read_url(url, unicode=True)


 def findData(key):
@@ -44,9 +44,9 @@ def getData(id):
 if not r['pages']:
 r['pages'] = findData('Hardcover')

-r['review'] = stripTags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+r['review'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()

-r['description'] = stripTags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
+r['description'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()

 r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
 if r['cover']:
@@ -1,7 +1,7 @@
 import json
 import re

-from ox.cache import readUrlUnicode
+from ox.cache import read_url

 HEADERS = {
 'User-Agent': 'iTunes/10.4 (Macintosh; Intel Mac OS X 10.7) AppleWebKit/534.48.3',
@@ -26,21 +26,21 @@ def getMovieData(title, director):
 url += '&actorNames=&directorProducerName=' + director
 url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'
 HEADERS['Referer'] = url
-html = readUrlUnicode(url, headers=HEADERS)
+html = read_url(url, headers=HEADERS, unicode=True)
 regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'
 regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'
 results = re.compile(regexp).findall(html)
 if results:
 data['link'] = results[0][0]
 data['poster'] = results[0][1].replace('140x140', '600x600')
-html = readUrlUnicode(data['link'], headers=HEADERS)
+html = read_url(data['link'], headers=HEADERS, unicode=True)
 results = re.compile('video-preview-url="(.*?)"').findall(html)
 if results:
 data['trailer'] = results[0]
 # trailers section (preferred source for poster and trailer)
 host = 'http://trailers.apple.com'
 url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title
-js = json.loads(readUrlUnicode(url)[16:-4])
+js = json.loads(read_url(url, unicode=True)[16:-4])
 results = js['results']
 if results:
 url = host + results[0]['location']
@@ -49,11 +49,11 @@ def getMovieData(title, director):
 headers = {
 'User-Agent': USER_AGENT
 }
-html = readUrlUnicode(url, headers=headers)
+html = read_url(url, headers=headers, unicode=True)
 results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
 if results:
 data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
-html = readUrlUnicode(url + 'includes/playlists/web.inc', headers=headers)
+html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
 results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
 if results:
 data['trailer'] = results[-1]
@@ -12,7 +12,7 @@ def getUrl(id):
 def getData(id):
 data = {}
 url = getUrl(id)
-details = cache.readUrl('%s?output=json' % url)
+details = cache.read_url('%s?output=json' % url)
 details = json.loads(details)
 for key in ('title', 'description', 'runtime'):
 data[key] = details['metadata'][key]
@@ -3,8 +3,8 @@
 import re

 import ox.cache
-from ox.cache import readUrlUnicode
+from ox.cache import read_url
-from ox.html import stripTags
+from ox.html import strip_tags
 from ox.text import findRe, removeSpecialCharacters

 import imdb
@@ -30,19 +30,19 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
 "url": getUrl(id)
 }
 try:
-html = readUrlUnicode(data["url"], timeout=timeout)
+html = read_url(data["url"], timeout=timeout, unicode=True)
 except:
-html = ox.cache.readUrl(data["url"], timeout=timeout)
+html = ox.cache.read_url(data["url"], timeout=timeout)
 data["number"] = findRe(html, "<li>Spine #(\d+)")

 data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
 data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
-data["director"] = stripTags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
+data["director"] = strip_tags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
 results = findRe(html, '<div class="left_column">(.*?)</div>')
 results = re.compile("<li>(.*?)</li>").findall(results)
 data["country"] = results[0]
 data["year"] = results[1]
-data["synopsis"] = stripTags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
+data["synopsis"] = strip_tags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))

 result = findRe(html, "<div class=\"purchase\">(.*?)</div>")
 if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
@@ -53,7 +53,7 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
 if not "/boxsets/" in result:
 data["posters"] = [result]
 else:
-html_ = readUrlUnicode(result)
+html_ = read_url(result, unicode=True)
 result = findRe(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
 result = findRe(result, "src=\"(.*?)\"")
 if result:
@@ -77,7 +77,7 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):

 def getIds():
 ids = []
-html = readUrlUnicode("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine")
+html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
 results = re.compile("\&p=(\d+)\&").findall(html)
 pages = max(map(int, results))
 for page in range(1, pages):
@@ -88,13 +88,13 @@ def getIds():
 def getIdsByPage(page):
 ids = []
 url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
-html = readUrlUnicode(url)
+html = read_url(url, unicode=True)
 results = re.compile("films/(\d+)").findall(html)
 for result in results:
 ids.append(result)
 results = re.compile("boxsets/(.*?)\"").findall(html)
 for result in results:
-html = readUrlUnicode("http://www.criterion.com/boxsets/" + result)
+html = read_url("http://www.criterion.com/boxsets/" + result, unicode=True)
 results = re.compile("films/(\d+)").findall(html)
 for result in results:
 ids.append(result)
@@ -2,7 +2,7 @@
 # vi:si:et:sw=4:sts=4:ts=4
 import re
 from urllib import unquote
-from ox.cache import readUrl
+from ox.cache import read_url


 def getVideoUrl(url):
@@ -13,7 +13,7 @@ def getVideoUrl(url):
 >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
 'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
 '''
-data = readUrl(url)
+data = read_url(url)
 video = re.compile('''video", "(.*?)"''').findall(data)
 for v in video:
 v = unquote(v).split('@@')[0]
@@ -3,9 +3,9 @@
 import re
 import urllib
 import ox
-from ox import stripTags, decodeHtml
+from ox import strip_tags, decodeHtml
 from ox.utils import json
-from ox.cache import readUrlUnicode
+from ox.cache import read_url


 def find(query, timeout=ox.cache.cache_timeout):
@@ -13,10 +13,10 @@ def find(query, timeout=ox.cache.cache_timeout):
 query = query.encode('utf-8')
 params = urllib.urlencode({'q': query})
 url = 'http://duckduckgo.com/html/?' + params
-data = readUrlUnicode(url, timeout=timeout)
+data = read_url(url, timeout=timeout, unicode=True)
 results = []
 regex = '<a .*?class="l le" href="(.+?)">(.*?)</a>.*?<div class="cra">(.*?)</div>'
 for r in re.compile(regex, re.DOTALL).findall(data):
-results.append((stripTags(decodeHtml(r[1])), r[0], stripTags(decodeHtml(r[2]))))
+results.append((strip_tags(decodeHtml(r[1])), r[0], strip_tags(decodeHtml(r[2]))))
 return results

@@ -3,8 +3,8 @@
 import re
 import time

-from ox import stripTags, findRe
+from ox import strip_tags, findRe
-from ox.cache import readUrlUnicode
+from ox.cache import read_url

 import google

@@ -21,9 +21,9 @@ def getShowUrl(title):
 return None

 def getShowData(url):
-data = readUrlUnicode(url)
+data = read_url(url, unicode=True)
 r = {}
-r['title'] = stripTags(findRe(data, '<h1>(.*?)</h1>'))
+r['title'] = strip_tags(findRe(data, '<h1>(.*?)</h1>'))
 r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
 r['episodes'] = {}
 #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
@@ -4,8 +4,8 @@
 import re
 from lxml.html import document_fromstring

-from ox.cache import readUrlUnicode
+from ox.cache import read_url
-from ox import findRe, stripTags
+from ox import findRe, strip_tags
 from ox.web.imdb import ImdbCombined


@@ -32,7 +32,7 @@ def getData(id, timeout=-1):
 data = {
 "url": getUrl(id),
 }
-html = readUrlUnicode(data['url'], timeout=timeout)
+html = read_url(data['url'], timeout=timeout, timeout=True)
 doc = document_fromstring(html)

 props = {
@@ -2,7 +2,7 @@
 # vi:si:et:sw=4:sts=4:ts=4
 import json

-from ox.cache import readUrlUnicode
+from ox.cache import read_url
 from ox import findRe

 class Imdb(dict):
@@ -12,7 +12,7 @@ class Imdb(dict):
 "http://graph.freebase.com/imdb.title.tt%s" % id
 might also be of interest at some point, right now not much info
 '''
-data = readUrlUnicode(url)
+data = read_url(url, unicode=True)
 try:
 data = json.loads(data)
 except ValueError:
@@ -4,13 +4,13 @@ import re
 import urllib

 import ox
-from ox import stripTags, decodeHtml
+from ox import strip_tags, decodeHtml

 DEFAULT_MAX_RESULTS = 10
 DEFAULT_TIMEOUT = 24*60*60

-def readUrlUnicode(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
+def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
-return ox.cache.readUrlUnicode(url, data, headers, timeout)
+return ox.cache.read_url(url, data, headers, timeout, unicode=True)

 def quote_plus(s):
 if not isinstance(s, str):
@@ -28,13 +28,13 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
 u'http://www.imdb.com/title/tt0133093/'
 """
 url = 'http://google.com/search?q=%s' % quote_plus(query)
-data = readUrlUnicode(url, timeout=timeout)
+data = read_url(url, timeout=timeout)
 results = []
 data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
 for a in re.compile(
 '<a href="(\S+?)" class=l .*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>'
 ).findall(data):
-results.append((stripTags(decodeHtml(a[1])), a[0], stripTags(decodeHtml(a[2]))))
+results.append((strip_tags(decodeHtml(a[1])), a[0], strip_tags(decodeHtml(a[2]))))
 if len(results) >= max_results:
 break
 return results
@@ -8,7 +8,7 @@ import time
 import unicodedata

 import ox
-from ox import findRe, stripTags
+from ox import findRe, strip_tags
 from ox.normalize import normalizeTitle, normalizeImdbId
 import ox.cache

@@ -16,12 +16,9 @@ from siteparser import SiteParser
 import google


-def readUrl(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None):
+def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
 headers = headers.copy()
-return ox.cache.readUrl(url, data, headers, timeout)
+return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)

-def readUrlUnicode(url, timeout=ox.cache.cache_timeout):
-return ox.cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)

 def getUrl(id):
 return "http://www.imdb.com/title/tt%s/" % id
@@ -61,7 +58,7 @@ class Imdb(SiteParser):
 'page': 'combined',
 're': [
 '<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
-lambda ll: [stripTags(l) for l in ll]
+lambda ll: [strip_tags(l) for l in ll]
 ],
 'type': 'list'
 },
@@ -266,8 +263,8 @@ class Imdb(SiteParser):
 }
 }

-def readUrlUnicode(self, url, timeout):
+def read_url(self, url, timeout):
-return readUrlUnicode(url, timeout)
+return read_url(url, timeout, unicode=True)

 def __init__(self, id, timeout=-1):
 #use akas.imdb.com to always get original title:
@@ -276,7 +273,7 @@ class Imdb(SiteParser):
 super(Imdb, self).__init__(timeout)

 url = self.baseUrl + 'combined'
-page = self.readUrlUnicode(url, timeout=-1)
+page = self.read_url(url, timeout=-1)
 if '<title>IMDb: Page not found</title>' in page \
 or 'The requested URL was not found on our server.' in page:
 return
@@ -460,7 +457,7 @@ def getMovieIdByTitle(title, timeout=-1):
 params['q'] = params['q'].encode('utf-8')
 params = urllib.urlencode(params)
 url = "http://akas.imdb.com/find?" + params
-data = readUrlUnicode(url, timeout=timeout)
+data = read_url(url, timeout=timeout, unicode=True)
 #if search results in redirect, get id of current page
 r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
 results = re.compile(r).findall(data)
@@ -538,7 +535,7 @@ def getMovieId(title, director='', year='', timeout=-1):
 url = "http://akas.imdb.com/find?" + params
 #print url

-data = readUrlUnicode(url, timeout=timeout)
+data = read_url(url, timeout=timeout, unicode=True)
 #if search results in redirect, get id of current page
 r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
 results = re.compile(r).findall(data)
@@ -569,7 +566,7 @@ def getMoviePoster(imdbId):
 info = ImdbCombined(imdbId)
 if 'posterId' in info:
 url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId)
-data = readUrl(url)
+data = read_url(url)
 poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')
 return poster
 elif 'series' in info:
@@ -578,7 +575,7 @@ def getMoviePoster(imdbId):

 def maxVotes():
 url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
-data = ox.cache.readUrl(url)
+data = ox.cache.read_url(url)
 votes = max([int(v.replace(',', ''))
 for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
 return votes
@@ -2,8 +2,8 @@
 # encoding: utf-8
 import re

-from ox.cache import readUrlUnicode
+from ox.cache import read_url
-from ox.html import stripTags
+from ox.html import strip_tags
 from ox.text import findRe


@@ -21,11 +21,11 @@ def getData(id):
 data = {
 'url': getUrl(id)
 }
-html = readUrlUnicode(data['url'])
+html = read_url(data['url'], unicode=True)
 data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
 if not data['imdbId']:
 data['imdbId'] = _id_map.get(id, '')
-data['title'] = stripTags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
+data['title'] = strip_tags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
 data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
 data['posters'] = []
 poster = findRe(html, '<img src="(posters.*?)"')
@@ -36,11 +36,11 @@ def getData(id):
 for result in results:
 result = result.replace('_xlg.html', '.html')
 url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
-html = readUrlUnicode(url)
+html = read_url(url, unicode=True)
 result = findRe(html, '<a href = (\w*?_xlg.html)')
 if result:
 url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
-html = readUrlUnicode(url)
+html = read_url(url, unicode=True)
 poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
 else:
 poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)"'))
@@ -61,7 +61,7 @@ def getId(url):

 def getIds():
 ids = []
-html = readUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 60*60)
+html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
 pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
 for page in range(pages, 0, -1):
 for id in getIdsByPage(page):
@@ -71,7 +71,7 @@ def getIds():

 def getIdsByPage(page):
 ids = []
-html = readUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1)
+html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
 results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
 for result in results:
 url = 'http://impawards.com/%s' % result
@@ -80,7 +80,7 @@ def getIdsByPage(page):

 def getUrl(id):
 url = u"http://www.impawards.com/%s.html" % id
-html = readUrlUnicode(url)
+html = read_url(url, unicode=True)
 if findRe(html, "No Movie Posters on This Page"):
 url = u"http://www.impawards.com/%s_ver1.html" % id
 return url
@@ -3,8 +3,8 @@
 import re
 import urllib

-from ox.cache import readUrl
+from ox.cache import read_url
-from ox.html import decodeHtml, stripTags
+from ox.html import decodeHtml, strip_tags
 from ox.text import findRe
 from ox.text import findString

@@ -113,20 +113,20 @@ class ItunesAlbum:

 def getId(self):
 url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
-xml = readUrl(url, headers = ITUNES_HEADERS)
+xml = read_url(url, headers = ITUNES_HEADERS)
 id = findRe(xml, 'viewAlbum\?id=(.*?)&')
 return id

 def getData(self):
 data = {'id': self.id}
 url = composeUrl('viewAlbum', {'id': self.id})
-xml = readUrl(url, None, ITUNES_HEADERS)
+xml = read_url(url, None, ITUNES_HEADERS)
 data['albumName'] = findRe(xml, '<B>(.*?)</B>')
 data['artistName'] = findRe(xml, '<b>(.*?)</b>')
 data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
 data['genre'] = findRe(xml, 'Genre:(.*?)<')
 data['releaseDate'] = findRe(xml, 'Released(.*?)<')
-data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
+data['review'] = strip_tags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
 data['tracks'] = []
 strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
 for string in strings:
@@ -144,14 +144,14 @@ class ItunesMovie:

 def getId(self):
 url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
-xml = readUrl(url, headers = ITUNES_HEADERS)
+xml = read_url(url, headers = ITUNES_HEADERS)
 id = findRe(xml, 'viewMovie\?id=(.*?)&')
 return id

 def getData(self):
 data = {'id': self.id}
 url = composeUrl('viewMovie', {'id': self.id})
-xml = readUrl(url, None, ITUNES_HEADERS)
+xml = read_url(url, None, ITUNES_HEADERS)
 f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
 f.write(xml)
 f.close()
@@ -1,15 +1,15 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
-from ox.cache import readUrl
+from ox.cache import read_url
 from ox.html import decodeHtml
 from ox.text import findRe


 def getLyrics(title, artist):
-html = readUrl('http://lyricsfly.com/api/')
+html = read_url('http://lyricsfly.com/api/')
 key = findRe(html, '<font color=green><b>(.*?)</b></font>')
 url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
-xml = readUrl(url)
+xml = read_url(url)
 lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
 lyrics = lyrics.replace('\n', '').replace('\r', '')
 lyrics = lyrics.replace('[br]', '\n').strip()
@@ -4,8 +4,8 @@ import re
 from urllib import quote
 from lxml.html import document_fromstring

-from ox.cache import readUrl, readUrlUnicode
-from ox import findRe, stripTags
+from ox.cache import read_url
+from ox import findRe, strip_tags

 def getUrl(id):
     return 'http://www.metacritic.com/movie/%s' % id
@@ -15,18 +15,18 @@ def getId(url):

 def getUrlByImdb(imdb):
     url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
-    data = readUrl(url)
+    data = read_url(url)
     metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"')
     return metacritic_url or None

 def getMetacriticShowUrl(title):
     title = quote(title)
     url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
-    data = readUrl(url)
+    data = read_url(url)
     return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')

 def getData(url):
-    data = readUrlUnicode(url)
+    data = read_url(url, unicode=True)
     doc = document_fromstring(data)
     score = filter(lambda s: s.attrib.get('property') == 'v:average',
         doc.xpath('//span[@class="score_value"]'))
@@ -51,7 +51,7 @@ def getData(url):
             'critic': authors[i],
             'url': urls[i],
             'source': sources[i],
-            'quote': stripTags(reviews[i]).strip(),
+            'quote': strip_tags(reviews[i]).strip(),
             'score': scores[i],
         })

@@ -5,8 +5,8 @@ import re
 import socket
 from urllib import quote

-from ox.cache import readUrl, readUrlUnicode
-from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
+from ox.cache import read_url
+from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, int_value, normalizeNewlines
 from ox.normalize import normalizeImdbId
 import ox

@@ -31,7 +31,7 @@ def findMovie(query, max_results=10):
     '''search for torrents on mininova
     '''
     url = "http://www.mininova.org/search/%s/seeds" % quote(query)
-    data = readUrlUnicode(url)
+    data = read_url(url, unicode=True)
     return _parseResultsPage(data, max_results)

 def findMovieByImdb(imdbId):
@@ -39,7 +39,7 @@ def findMovieByImdb(imdbId):
     '''
     results = []
     imdbId = normalizeImdbId(imdbId)
-    data = readUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
+    data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
     return _parseResultsPage(data)

 def getId(mininovaId):
@@ -55,7 +55,7 @@ def getId(mininovaId):

 def exists(mininovaId):
     mininovaId = getId(mininovaId)
-    data = ox.net.readUrl("http://www.mininova.org/tor/%s" % mininovaId)
+    data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
     if not data or 'Torrent not found...' in data:
         return False
     if 'tracker</a> of this torrent requires registration.' in data:
@@ -74,22 +74,22 @@ def getData(mininovaId):
     torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
     torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId

-    data = readUrlUnicode(torrent['comment_link']) + readUrlUnicode(torrent['details_link'])
+    data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
     if '<h1>Torrent not found...</h1>' in data:
         return None

     for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
         key = d[0].lower().strip()
         key = _key_map.get(key, key)
-        value = decodeHtml(stripTags(d[1].strip()))
+        value = decodeHtml(strip_tags(d[1].strip()))
         torrent[key] = value

     torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
     torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
     torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
     if torrent['description']:
-        torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
-    t = readUrl(torrent[u'torrent_link'])
+        torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
+    t = read_url(torrent[u'torrent_link'])
     torrent[u'torrent_info'] = getTorrentInfo(t)
     return torrent

@@ -109,13 +109,13 @@ class Mininova(Torrent):
         self['seeder'] = -1
         self['leecher'] = -1
         if len(ratio) == 2:
-            val = intValue(ratio[0].replace(',','').strip())
+            val = int_value(ratio[0].replace(',','').strip())
             if val:
                 self['seeder'] = int(val)
-            val = intValue(ratio[1].replace(',','').strip())
+            val = int_value(ratio[1].replace(',','').strip())
             if val:
                 self['leecher'] = int(val)
-        val = intValue(self.data['downloads'].replace(',','').strip())
+        val = int_value(self.data['downloads'].replace(',','').strip())
         if val:
             self['downloaded'] = int(val)
         else:
@@ -3,7 +3,7 @@

 import re

-from ox.cache import readUrlUnicode
+from ox.cache import read_url
 from ox import findRe

 def getData(id):
@@ -24,7 +24,7 @@ def getId(url):

 def getPostersByUrl(url, group=True, timeout=-1):
     posters = []
-    html = readUrlUnicode(url, timeout=timeout)
+    html = read_url(url, timeout=timeout, unicode=True)
     if url in html:
         if group:
             results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
@@ -32,7 +32,7 @@ def getPostersByUrl(url, group=True, timeout=-1):
                 posters += getPostersByUrl(result, False)
         results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
         for result in results:
-            html = readUrlUnicode(result, timeout=timeout)
+            html = read_url(result, timeout=timeout, unicode=True)
             posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
     return posters

@@ -3,8 +3,8 @@
 import re

 import feedparser
-from ox.cache import readUrl, readUrlUnicode
-from ox import findRe, stripTags
+from ox.cache import read_url
+from ox import findRe, strip_tags
 from ox import langCode2To3, langTo3Code

 def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
@@ -16,7 +16,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
     if language:
         url += "sublanguageid-%s/" % language
     url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
-    data = readUrl(url)
+    data = read_url(url)
     if "title>opensubtitles.com - search results</title" in data:
         fd = feedparser.parse(data)
         opensubtitleId = None
@@ -31,11 +31,11 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):

 def downloadSubtitleById(opensubtitle_id):
     srts = {}
-    data = readUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
+    data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
     reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
     for f in re.compile(reg_exp, re.DOTALL).findall(data):
-        name = stripTags(f[1]).split('\n')[0]
+        name = strip_tags(f[1]).split('\n')[0]
         url = "http://www.opensubtitles.com%s" % f[0]
-        srts[name] = readUrlUnicode(url)
+        srts[name] = read_url(url, unicode=True)
     return srts

@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
 import re
-from ox.net import readUrlUnicode
+from ox.net import read_url

 def getPosterUrl(id):
     url = 'http://piratecinema.org/posters/'
-    html = readUrlUnicode(url)
+    html = read_url(url, unicode=True)
     results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
     for result in results:
         if result[1] == id:
@@ -2,8 +2,8 @@
 # vi:si:et:sw=4:sts=4:ts=4
 import re

-from ox.cache import getHeaders, readUrl, readUrlUnicode
-from ox import findRe, stripTags
+from ox.cache import getHeaders, read_url
+from ox import findRe, strip_tags


 def getUrlByImdb(imdb):
@@ -14,7 +14,7 @@ def getUrlByImdb(imdb):
         return u.url
     '''
     url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
-    data = readUrl(url)
+    data = read_url(url)
     if "movie_title" in data:
         movies = re.compile('(/m/.*?/)').findall(data)
         if movies:
@@ -25,13 +25,13 @@ def get_og(data, key):
     return findRe(data, '<meta property="og:%s".*?content="(.*?)"' % key)

 def getData(url):
-    data = readUrl(url)
+    data = read_url(url)
     r = {}
     r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
     if '(' in r['title']:
         r['year'] = findRe(r['title'], '\((\d*?)\)')
-        r['title'] = stripTags(re.sub('\((\d*?)\)', '', r['title'])).strip()
-    r['summary'] = stripTags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
+        r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
+    r['summary'] = strip_tags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
     r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
     if not r['summary']:
         r['summary'] = get_og(data, 'description')
@@ -2,16 +2,16 @@
 # vi:si:et:sw=4:sts=4:ts=4
 import re

-from ..cache import readUrlUnicode
-from .. import stripTags, decodeHtml
+from ..cache import read_url
+from .. import strip_tags, decodeHtml
 from ..utils import datetime


 def cleanup(key, data, data_type):
     if data:
         if isinstance(data[0], basestring):
-            #FIXME: some types need stripTags
-            #data = [stripTags(decodeHtml(p)).strip() for p in data]
+            #FIXME: some types need strip_tags
+            #data = [strip_tags(decodeHtml(p)).strip() for p in data]
             data = [decodeHtml(p).strip() for p in data]
         elif isinstance(data[0], list) or isinstance(data[0], tuple):
             data = [cleanup(key, p, data_type) for p in data]
@@ -30,13 +30,13 @@ class SiteParser(dict):
     def getUrl(self, page):
         return "%s%s" % (self.baseUrl, page)

-    def readUrlUnicode(self, url, timeout):
-        return readUrlUnicode(url, timeout=timeout)
+    def read_url(self, url, timeout):
+        return read_url(url, timeout=timeout, unicode=True)

     def __init__(self, timeout=-1):
         for key in self.regex:
             url = self.getUrl(self.regex[key]['page'])
-            data = self.read_url(url, timeout)
+            data = self.read_url(url, timeout)
             if isinstance(self.regex[key]['re'], basestring):
                 data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
                 data = cleanup(key, data, self.regex[key]['type'])
@@ -5,7 +5,7 @@ import re
 import time

 import ox.cache
-from ox.html import decodeHtml, stripTags
+from ox.html import decodeHtml, strip_tags
 import ox.net


@@ -21,11 +21,11 @@ def getNews(year, month, day):
     for section in sections:
         url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
         if date == time.strftime('%d.%m.%Y', time.localtime()):
-            html = ox.net.readUrl(url)
+            html = ox.net.read_url(url)
         else:
-            html = ox.cache.readUrl(url)
+            html = ox.cache.read_url(url)
         for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
-            dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
+            dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
             try:
                 description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
             except:
@@ -104,12 +104,12 @@ def getIssue(year, week):
         return None
     url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
     contents = []
-    data = ox.cache.readUrl(url)
+    data = ox.cache.read_url(url)
     items = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
     for item in items:
         item = item[1]
         page = int(re.compile('&SE=(.*?)"').findall(item)[0])
-        title = stripTags(item).strip()
+        title = strip_tags(item).strip()
         contents.append({'title': title, 'page': page})
     pageUrl = {}
     pages = page + 2
@@ -163,7 +163,7 @@ def archiveIssues():
             f.close()
         filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
         if not os.path.exists(filename):
-            data = ox.cache.readUrl(issue['coverUrl'])
+            data = ox.cache.read_url(issue['coverUrl'])
             f = open(filename, 'w')
             f.write(data)
             f.close()
@@ -172,7 +172,7 @@ def archiveIssues():
             if url:
                 filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
                 if not os.path.exists(filename):
-                    data = ox.cache.readUrl(url)
+                    data = ox.cache.read_url(url)
                     f = open(filename, 'w')
                     f.write(data)
                     f.close()
@@ -243,7 +243,7 @@ def archiveNews():
             f.close()
         filename = dirname + '/' + new['imageUrl'].split('/')[-1]
         if not os.path.exists(filename):
-            data = ox.cache.readUrl(new['imageUrl'])
+            data = ox.cache.read_url(new['imageUrl'])
             f = open(filename, 'w')
             f.write(data)
             f.close()
@@ -6,8 +6,7 @@ import socket
 from urllib import quote, urlencode
 from urllib2 import URLError

-from ox.cache import readUrl, readUrlUnicode
-from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
+from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, normalizeNewlines
 from ox.normalize import normalizeImdbId
 import ox

@@ -18,13 +17,10 @@ cache_timeout = 24*60*60 # cache search only for 24 hours
 season_episode = re.compile("S..E..", re.IGNORECASE)


-def _readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
+def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
     headers = headers.copy()
     headers['Cookie'] = 'language=en_EN'
-    return cache.readUrl(url, data, headers, timeout)
+    return cache.read_url(url, data, headers, timeout, unicode=unicode)

-def _readUrlUnicode(url, timeout=cache.cache_timeout):
-    return cache.readUrlUnicode(url, _readUrl=_readUrl, timeout=timeout)

 def findMovies(query, max_results=10):
     results = []
@@ -37,7 +33,7 @@ def findMovies(query, max_results=10):
         if not url.startswith('/'):
             url = "/" + url
         url = "http://thepiratebay.org" + url
-        data = _readUrlUnicode(url, timeout=cache_timeout)
+        data = read_url(url, timeout=cache_timeout, unicode=True)
         regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
         for row in re.compile(regexp, re.DOTALL).findall(data):
             torrentType = row[0]
@@ -83,7 +79,7 @@ def getData(piratebayId):
     torrent[u'domain'] = 'thepiratebay.org'
     torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId

-    data = _readUrlUnicode(torrent['comment_link'])
+    data = read_url(torrent['comment_link'], unicode=True)
     torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
     if not torrent[u'title']:
         return None
@@ -94,12 +90,12 @@ def getData(piratebayId):
     for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
         key = d[0].lower().strip()
         key = _key_map.get(key, key)
-        value = decodeHtml(stripTags(d[1].strip()))
+        value = decodeHtml(strip_tags(d[1].strip()))
         torrent[key] = value
     torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
     if torrent[u'description']:
-        torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
-    t = _readUrl(torrent[u'torrent_link'])
+        torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
+    t = _read_url(torrent[u'torrent_link'])
     torrent[u'torrent_info'] = getTorrentInfo(t)
     return torrent

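The hunk above folds the two site-specific helpers into a single read_url wrapper that injects the language cookie and forwards the new unicode flag to ox.cache.read_url. A minimal sketch of the resulting pattern, assuming the signatures shown in the hunk; the torrent URL is illustrative only:

    from ox import cache

    def read_url(url, data=None, headers=cache.DEFAULT_HEADERS,
                 timeout=cache.cache_timeout, valid=None, unicode=False):
        # same shape as the wrapper in the hunk: copy the defaults,
        # force English pages, then delegate to the shared cache
        headers = headers.copy()
        headers['Cookie'] = 'language=en_EN'
        return cache.read_url(url, data, headers, timeout, unicode=unicode)

    # raw bytes by default, decoded text with unicode=True
    raw = read_url('http://thepiratebay.org/torrent/1234567')
    page = read_url('http://thepiratebay.org/torrent/1234567', unicode=True)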
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 # vi:si:et:sw=4:sts=4:ts=4
-from ox import intValue
+from ox import int_value


 class Torrent(dict):
@@ -25,7 +25,7 @@ class Torrent(dict):
         for key in self._int_keys:
             value = self.data.get(key, -1)
             if not isinstance(value, int):
-                value = int(intValue(value))
+                value = int(int_value(value))
             self[key] = value
         self['infohash'] = self.data['torrent_info'].get('hash', '')
         self['size'] = self.data['torrent_info'].get('size', -1)
@@ -3,8 +3,8 @@
 import re
 import time

-from ox import stripTags, findRe
-from ox.cache import readUrlUnicode
+from ox import strip_tags, findRe
+from ox.cache import read_url


 def getEpisodeData(url):
@@ -14,9 +14,9 @@ def getEpisodeData(url):
     example:
         getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
     '''
-    data = readUrlUnicode(url)
+    data = read_url(url, unicode=True)
     r = {}
-    r['description'] = stripTags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
+    r['description'] = strip_tags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
     r['show'] = findRe(data, '<h1>(.*?)</h1>')
     r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
     #episode score
@@ -4,13 +4,13 @@ import re
 from StringIO import StringIO
 import xml.etree.ElementTree as ET

-from ox.cache import readUrl, readUrlUnicode
+from ox.cache import read_url
 from ox import findString, findRe


 def getData(id):
     url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
-    xml = readUrl(url)
+    xml = read_url(url)
     tree = ET.parse(StringIO(xml))
     request_signature = tree.find('request_signature').text
     request_signature_expires = tree.find('request_signature_expires').text
@@ -4,7 +4,7 @@ import re
 from urllib import urlencode

 from ox.utils import json
-from ox.cache import readUrl, readUrlUnicode
+from ox.cache import read_url
 from ox import findRe, decodeHtml


@@ -47,7 +47,7 @@ def getUrlByAllmovieId(allmovieId):
 def getWikiData(wikipediaUrl):
     url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
     url = "%s&action=raw" % url
-    data = readUrl(url).decode('utf-8')
+    data = read_url(url).decode('utf-8')
     return data

 def getMovieData(wikipediaUrl):
@@ -106,7 +106,7 @@ def getMovieData(wikipediaUrl):

 def getImageUrl(name):
     url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
-    data = readUrlUnicode(url)
+    data = read_url(url, unicode=True)
     url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
     if not url:
         url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"')
@@ -133,9 +133,9 @@ def find(query, max_results=10):
     query = {'action': 'query', 'list':'search', 'format': 'json',
         'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
     url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
-    data = readUrl(url)
+    data = read_url(url)
     if not data:
-        data = readUrl(url, timeout=0)
+        data = read_url(url, timeout=0)
     result = json.loads(data)
     results = []
     if result and 'query' in result:
@@ -5,7 +5,7 @@ import re
 from xml.dom.minidom import parseString

 import feedparser
-from ox.cache import readUrl, cache_timeout
+from ox.cache import read_url, cache_timeout


 def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout):
@@ -33,7 +33,7 @@ def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout):
 def find(query, max_results=10, offset=1, orderBy='relevance'):
     query = quote(query)
     url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
-    data = readUrl(url)
+    data = read_url(url)
     fd = feedparser.parse(data)
     videos = []
     for item in fd.entries:
@@ -48,7 +48,7 @@ def find(query, max_results=10, offset=1, orderBy='relevance'):
 def info(id):
     info = {}
     url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
-    data = readUrl(url)
+    data = read_url(url)
     xml = parseString(data)
     info['url'] = 'http://www.youtube.com/watch?v=%s' % id
     info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
@@ -62,21 +62,21 @@ def info(id):

     info['keywords'] = xml.getElementsByTagName('media:keywords')[0].firstChild.data.split(', ')
     url = "http://www.youtube.com/watch?v=%s" % id
-    data = readUrl(url)
+    data = read_url(url)
     match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
     if match:
         info['license'] = match[0].strip()
         info['license'] = re.sub('<.+?>', '', info['license']).strip()

     url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1"%id
-    data = readUrl(url)
+    data = read_url(url)
     xml = parseString(data)
     languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
     if languages:
         info['subtitles'] = {}
         for language in languages:
             url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
-            data = readUrl(url)
+            data = read_url(url)
             xml = parseString(data)
             subs = []
             for t in xml.getElementsByTagName('text'):
@@ -101,7 +101,7 @@ def videos(id, format=''):
         'mp4': 'video/mp4'
     }.get(format)
     url = "http://www.youtube.com/watch?v=%s" % id
-    data = readUrl(url)
+    data = read_url(url)
     match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data)
     streams = {}
     for x in match[0].split(','):
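Every call-site change in the hunks above follows the same substitution, so a minimal before/after sketch sums up the migration; the example URL is illustrative only:

    from ox.cache import read_url

    # before: readUrl(url) returned raw bytes, readUrlUnicode(url) returned decoded text
    # after:  one function, with decoding selected per call
    data = read_url('http://example.com/page')                 # bytes, as readUrl did
    text = read_url('http://example.com/page', unicode=True)   # decoded, as readUrlUnicode did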