net/cache: readUrl -> read_url / readUrlUnicode -> read_url(..., unicode=True)
format: replace all CamelCase names with under_score
parent c1d0fc6242
commit 2de989e188
33 changed files with 243 additions and 254 deletions
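
At a glance, the API change for callers: the camelCase readers and their separate *Unicode variants are merged into snake_case functions that take a unicode flag. A minimal before/after sketch (the URL is only a placeholder):

    # before this commit
    import ox.cache
    raw  = ox.cache.readUrl('http://example.com/')          # raw bytes
    text = ox.cache.readUrlUnicode('http://example.com/')   # decoded unicode

    # after this commit
    raw  = ox.cache.read_url('http://example.com/')                 # raw bytes
    text = ox.cache.read_url('http://example.com/', unicode=True)   # decoded unicode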
ox/cache.py (42 lines changed)

@@ -18,7 +18,7 @@ from utils import json
 from .file import makedirs
 
 import net
-from net import DEFAULT_HEADERS, getEncoding
+from net import DEFAULT_HEADERS, detect_encoding
 
 cache_timeout = 30*24*60*60 # default is 30 days
 
@@ -40,7 +40,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
     >>> status('http://google.com/mysearch')
     404
     '''
-    headers = getHeaders(url, data, headers)
+    headers = get_headers(url, data, headers)
     return int(headers['status'])
 
 def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
@@ -55,10 +55,10 @@ def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
         return True
     return False
 
-def getHeaders(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
+def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
     url_headers = store.get(url, data, headers, timeout, "headers")
     if not url_headers:
-        url_headers = net.getHeaders(url, data, headers)
+        url_headers = net.get_headers(url, data, headers)
         store.set(url, data, -1, url_headers)
     return url_headers
 
@@ -68,7 +68,7 @@ class InvalidResult(Exception):
         self.result = result
         self.headers = headers
 
-def readUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None):
+def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
     '''
         url - url to load
         data - possible post data
@@ -80,31 +80,29 @@ def readUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None):
     #FIXME: send last-modified / etag from cache and only update if needed
     if isinstance(url, unicode):
         url = url.encode('utf-8')
-    result = store.get(url, data, headers, timeout)
-    if not result:
+    data = store.get(url, data, headers, timeout)
+    if not data:
         #print "get data", url
         try:
-            url_headers, result = net.readUrl(url, data, headers, returnHeaders=True)
+            url_headers, data = net.read_url(url, data, headers, return_headers=True)
         except urllib2.HTTPError, e:
             e.headers['Status'] = "%s" % e.code
             url_headers = dict(e.headers)
-            result = e.read()
+            data = e.read()
             if url_headers.get('content-encoding', None) == 'gzip':
-                result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
-        if not valid or valid(result, url_headers):
-            store.set(url, data, result, url_headers)
+                data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
+        if not valid or valid(data, url_headers):
+            store.set(url, data, data, url_headers)
         else:
-            raise InvalidResult(result, url_headers)
-    return result
+            raise InvalidResult(data, url_headers)
+    if unicode:
+        encoding = detect_encoding(data)
+        if not encoding:
+            encoding = 'latin-1'
+        data = data.decode(encoding)
+    return data
 
-def readUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _readUrl=readUrl, valid=None):
-    data = _readUrl(url, data, headers, timeout, valid)
-    encoding = getEncoding(data)
-    if not encoding:
-        encoding = 'latin-1'
-    return unicode(data, encoding)
-
-def saveUrl(url, filename, overwrite=False):
+def save_url(url, filename, overwrite=False):
     if not os.path.exists(filename) or overwrite:
         dirname = os.path.dirname(filename)
         if not os.path.exists(dirname):
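
The valid hook and InvalidResult keep their behaviour through the rename: a callback decides whether a response is worth caching, otherwise the exception carries the body and headers. A sketch of the new signature in use, assuming the lowercased 'status' header key that cache.status() itself reads:

    from ox.cache import read_url, InvalidResult

    def ok(data, headers):
        # only cache responses that came back with HTTP 200
        return headers.get('status') == '200'

    try:
        html = read_url('http://example.com/page', valid=ok, unicode=True)
    except InvalidResult as e:
        print e.headers.get('status')   # failed response, not cached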
ox/format.py (84 lines changed)

@@ -217,15 +217,15 @@ def to36(q):
 def from36(q):
     return int(q, 36)
 
-def intValue(strValue, default=u''):
+def int_value(strValue, default=u''):
     """
-    >>> intValue('abc23')
+    >>> int_value('abc23')
     u'23'
 
-    >>> intValue(' abc23')
+    >>> int_value(' abc23')
     u'23'
 
-    >>> intValue('ab')
+    >>> int_value('ab')
     u''
     """
     try:
@@ -234,15 +234,15 @@ def intValue(strValue, default=u''):
         val = default
     return val
 
-def floatValue(strValue, default=u''):
+def float_value(strValue, default=u''):
     """
-    >>> floatValue('abc23.4')
+    >>> float_value('abc23.4')
     u'23.4'
 
-    >>> floatValue(' abc23.4')
+    >>> float_value(' abc23.4')
     u'23.4'
 
-    >>> floatValue('ab')
+    >>> float_value('ab')
     u''
     """
     try:
@@ -251,46 +251,46 @@ def floatValue(strValue, default=u''):
         val = default
     return val
 
-def formatNumber(number, longName, shortName):
+def format_number(number, longName, shortName):
     """
     Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)
 
-    >>> formatNumber(123, 'Byte', 'B')
+    >>> format_number(123, 'Byte', 'B')
     '123 Bytes'
 
-    >>> formatNumber(1234, 'Byte', 'B')
+    >>> format_number(1234, 'Byte', 'B')
     '1 KB'
 
-    >>> formatNumber(1234567, 'Byte', 'B')
+    >>> format_number(1234567, 'Byte', 'B')
     '1.2 MB'
 
-    >>> formatNumber(1234567890, 'Byte', 'B')
+    >>> format_number(1234567890, 'Byte', 'B')
     '1.15 GB'
 
-    >>> formatNumber(1234567890123456789, 'Byte', 'B')
+    >>> format_number(1234567890123456789, 'Byte', 'B')
     '1,096.5166 PB'
 
-    >>> formatNumber(-1234567890123456789, 'Byte', 'B')
+    >>> format_number(-1234567890123456789, 'Byte', 'B')
     '-1,096.5166 PB'
 
     """
     if abs(number) < 1024:
-        return '%s %s%s' % (formatThousands(number), longName, number != 1 and 's' or '')
+        return '%s %s%s' % (format_thousands(number), longName, number != 1 and 's' or '')
     prefix = ['K', 'M', 'G', 'T', 'P']
     for i in range(5):
         if abs(number) < math.pow(1024, i + 2) or i == 4:
             n = number / math.pow(1024, i + 1)
-            return '%s %s%s' % (formatThousands('%.*f' % (i, n)), prefix[i], shortName)
+            return '%s %s%s' % (format_thousands('%.*f' % (i, n)), prefix[i], shortName)
 
-def formatThousands(number, separator = ','):
+def format_thousands(number, separator = ','):
     """
     Return the number with separators (1,000,000)
 
-    >>> formatThousands(1)
+    >>> format_thousands(1)
     '1'
-    >>> formatThousands(1000)
+    >>> format_thousands(1000)
     '1,000'
-    >>> formatThousands(1000000)
+    >>> format_thousands(1000000)
     '1,000,000'
     """
     string = str(number).split('.')
@@ -302,16 +302,16 @@ def formatThousands(number, separator = ','):
     string[0] = ''.join(l)
     return '.'.join(string)
 
-def formatBits(number):
-    return formatNumber(number, 'bit', 'b')
+def format_bits(number):
+    return format_number(number, 'bit', 'b')
 
-def formatBytes(number):
-    return formatNumber(number, 'byte', 'B')
+def format_bytes(number):
+    return format_number(number, 'byte', 'B')
 
-def formatPixels(number):
-    return formatNumber(number, 'pixel', 'px')
+def format_pixels(number):
+    return format_number(number, 'pixel', 'px')
 
-def formatCurrency(amount, currency="$"):
+def format_currency(amount, currency="$"):
     if amount:
         temp = "%.2f" % amount
         profile=re.compile(r"(\d)(\d\d\d[.,])")
@@ -336,9 +336,9 @@ def plural(amount, unit, plural='s'):
         if plural == 's':
             unit = unit + plural
         else: unit = plural
-    return "%s %s" % (formatThousands(amount), unit)
+    return "%s %s" % (format_thousands(amount), unit)
 
-def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
+def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
     '''
     verbosity
         0: D:HH:MM:SS
@@ -353,13 +353,13 @@ def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
     milliseconds
         True: always display milliseconds
         False: never display milliseconds
-    >>> formatDuration(1000 * 60 * 60 * 24 * 366)
+    >>> format_duration(1000 * 60 * 60 * 24 * 366)
    '1:001:00:00:00.000'
-    >>> formatDuration(1000 * 60 * 60 * 24 * 366, years=False)
+    >>> format_duration(1000 * 60 * 60 * 24 * 366, years=False)
     '366:00:00:00.000'
-    >>> formatDuration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
+    >>> format_duration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
     '1 year 2 seconds 3 milliseconds'
-    >>> formatDuration(1000 * 30, hours=False, milliseconds=False)
+    >>> format_duration(1000 * 30, hours=False, milliseconds=False)
     '00:30'
     '''
     if not ms and ms != 0:
@@ -403,7 +403,7 @@ def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
     return duration
 
 def ms2runtime(ms, shortenLong=False):
-    # deprecated - use formatDuration
+    # deprecated - use format_duration
     '''
     >>> ms2runtime(5000)
     '5 seconds'
@@ -415,11 +415,11 @@ def ms2runtime(ms, shortenLong=False):
     '13 hours 53 minutes'
     '''
     if shortenLong and ms > 1000 * 60 * 60 * 24 * 464:
-        return formatDuration(ms, verbosity=1, milliseconds=False)
-    return formatDuration(ms, verbosity=2, milliseconds=False)
+        return format_duration(ms, verbosity=1, milliseconds=False)
+    return format_duration(ms, verbosity=2, milliseconds=False)
 
 def ms2playtime(ms, hours=False):
-    # deprecated - use formatDuration
+    # deprecated - use format_duration
     '''
     >>> ms2playtime(5000)
     '00:05'
@@ -428,15 +428,15 @@ def ms2playtime(ms, hours=False):
     >>> ms2playtime(50000000)
     '13:53:20'
     '''
-    return formatDuration(ms, hours=False, years=False, milliseconds=False)
+    return format_duration(ms, hours=False, years=False, milliseconds=False)
 
 def ms2time(ms):
-    # deprecated - use formatDuration
+    # deprecated - use format_duration
     '''
     >>> ms2time(44592123)
     '12:23:12.123'
     '''
-    return formatDuration(ms, years=False)
+    return format_duration(ms, years=False)
 
 def time2ms(timeString):
     '''
@@ -451,7 +451,7 @@ def time2ms(timeString):
         ms = ms * 60 + float(_p)
     return int(ms * 1000)
 
-def shiftTime(offset, timeString):
+def shift_time(offset, timeString):
     newTime = time2ms(timeString) + offset
     return ms2time(newTime)
 
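
The doctests above double as a migration map for the renamed format helpers; a few of them in use, with return values taken from those doctests (assuming, as elsewhere in this commit, that the helpers are re-exported from the top-level ox package):

    import ox

    ox.format_number(1234567, 'Byte', 'B')    # '1.2 MB'
    ox.format_thousands(1000000)              # '1,000,000'
    ox.int_value('abc23')                     # u'23'
    ox.format_duration(1000 * 30, hours=False, milliseconds=False)   # '00:30'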
ox/net.py (30 lines changed)

@@ -22,7 +22,7 @@ DEFAULT_HEADERS = {
 
 def status(url, data=None, headers=DEFAULT_HEADERS):
     try:
-        f = openUrl(url, data, headers)
+        f = open_url(url, data, headers)
         s = f.code
     except urllib2.HTTPError, e:
         s = e.code
@@ -34,9 +34,9 @@ def exists(url, data=None, headers=DEFAULT_HEADERS):
         return True
     return False
 
-def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
+def headers(url, data=None, headers=DEFAULT_HEADERS):
     try:
-        f = openUrl(url, data, headers)
+        f = open_url(url, data, headers)
         f.headers['Status'] = "%s" % f.code
         headers = f.headers
         f.close()
@@ -45,30 +45,28 @@ def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
         headers = e.headers
     return dict(headers)
 
-def openUrl(url, data=None, headers=DEFAULT_HEADERS):
+def open_url(url, data=None, headers=DEFAULT_HEADERS):
     url = url.replace(' ', '%20')
     req = urllib2.Request(url, data, headers)
     return urllib2.urlopen(req)
 
-def readUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
-    f = openUrl(url, data, headers)
+def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
+    f = open_url(url, data, headers)
     data = f.read()
     f.close()
     if f.headers.get('content-encoding', None) == 'gzip':
         data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
-    if returnHeaders:
+    if unicode:
+        encoding = detect_encoding(data)
+        if not encoding:
+            encoding = 'latin-1'
+        data = data.decode(encoding)
+    if return_headers:
         f.headers['Status'] = "%s" % f.code
         return dict(f.headers), data
     return data
 
-def readUrlUnicode(url, data=None, headers=DEFAULT_HEADERS):
-    data = readUrl(url, data, headers)
-    encoding = getEncoding(data)
-    if not encoding:
-        encoding = 'latin-1'
-    return unicode(data, encoding)
-
-def getEncoding(data):
+def detect_encoding(data):
     if 'content="text/html; charset=utf-8"' in data:
         return 'utf-8'
     elif 'content="text/html; charset=iso-8859-1"' in data:
@@ -81,7 +79,7 @@ def getEncoding(data):
     detector.close()
     return detector.result['encoding']
 
-def saveUrl(url, filename, overwrite=False):
+def save_url(url, filename, overwrite=False):
     if not os.path.exists(filename) or overwrite:
         dirname = os.path.dirname(filename)
         if not os.path.exists(dirname):
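
ox.net.read_url now covers both of the old entry points: return_headers=True replaces returnHeaders, and unicode=True replaces readUrlUnicode by decoding with detect_encoding and falling back to latin-1. A minimal sketch against a placeholder URL:

    import ox.net

    # body plus response headers in one call
    headers, data = ox.net.read_url('http://example.com/', return_headers=True)

    # decoded text, equivalent to what read_url does internally with unicode=True
    encoding = ox.net.detect_encoding(data) or 'latin-1'
    text = data.decode(encoding)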
@@ -97,8 +97,8 @@ def encode(data):
     for s in data:
         srt += '%d\r\n%s --> %s\r\n%s\r\n\r\n' % (
             i,
-            ox.formatDuration(s['in']*1000, years=False).replace('.', ','),
-            ox.formatDuration(s['out']*1000, years=False).replace('.', ','),
+            ox.format_duration(s['in']*1000, years=False).replace('.', ','),
+            ox.format_duration(s['out']*1000, years=False).replace('.', ','),
             s['value'].replace('\n', '\r\n').strip()
         )
         i += 1
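
The SRT writer above only swaps the decimal separator, so format_duration already has to produce dotted timestamps; reusing the value from the ms2time doctest:

    import ox

    ox.format_duration(44592123, years=False)                    # '12:23:12.123'
    ox.format_duration(44592123, years=False).replace('.', ',')  # '12:23:12,123', SRT-style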
@ -3,8 +3,8 @@
|
|||
import re
|
||||
import time
|
||||
|
||||
from ox import stripTags, findRe
|
||||
from ox.cache import readUrlUnicode
|
||||
from ox import strip_tags, findRe
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def getId(url):
|
||||
|
@ -26,7 +26,7 @@ def getData(id):
|
|||
data = {
|
||||
"url": getUrl(id)
|
||||
}
|
||||
html = readUrlUnicode(data["url"])
|
||||
html = read_url(data["url"], unicode=True)
|
||||
data['aka'] = parseList(html, 'AKA')
|
||||
data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
|
||||
data['countries'] = parseList(html, 'countries')
|
||||
|
@ -40,18 +40,18 @@ def getData(id):
|
|||
data['releasedate'] = parseList(html, 'release date')
|
||||
data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
|
||||
data['set'] = parseEntry(html, 'set in')
|
||||
data['synopsis'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||
data['synopsis'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||
data['themes'] = parseList(html, 'themes')
|
||||
data['types'] = parseList(html, 'types')
|
||||
data['year'] = findRe(html, '<span class="year">.*?(\d+)')
|
||||
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
|
||||
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
|
||||
#html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
|
||||
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
|
||||
#data['cast'] = parseTable(html)
|
||||
#html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
|
||||
#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
|
||||
#data['credits'] = parseTable(html)
|
||||
html = readUrlUnicode("http://allmovie.com/work/%s/review" % id)
|
||||
data['review'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||
html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
|
||||
data['review'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||
return data
|
||||
|
||||
def getUrl(id):
|
||||
|
@ -59,26 +59,26 @@ def getUrl(id):
|
|||
|
||||
def parseEntry(html, title):
|
||||
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
|
||||
return stripTags(html).strip()
|
||||
return strip_tags(html).strip()
|
||||
|
||||
def parseList(html, title):
|
||||
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
|
||||
r = map(lambda x: stripTags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
|
||||
r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
|
||||
if not r and html:
|
||||
r = [stripTags(html)]
|
||||
r = [strip_tags(html)]
|
||||
return r
|
||||
|
||||
def parseTable(html):
|
||||
return map(
|
||||
lambda x: map(
|
||||
lambda x: stripTags(x).strip().replace(' ', ''),
|
||||
lambda x: strip_tags(x).strip().replace(' ', ''),
|
||||
x.split('<td width="305">-')
|
||||
),
|
||||
findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
|
||||
)
|
||||
|
||||
def parseText(html, title):
|
||||
return stripTags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
|
||||
return strip_tags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
|
||||
|
||||
if __name__ == '__main__':
|
||||
print getData('129689')
|
||||
|
|
|
@ -3,14 +3,14 @@
|
|||
import re
|
||||
from urllib import quote
|
||||
|
||||
from ox import findRe, stripTags, decodeHtml
|
||||
from ox.cache import readUrlUnicode
|
||||
from ox import findRe, strip_tags, decodeHtml
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def findISBN(title, author):
|
||||
q = '%s %s' % (title, author)
|
||||
url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
|
||||
data = readUrlUnicode(url)
|
||||
data = read_url(url, unicode=True)
|
||||
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
|
||||
id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
|
||||
data = getData(id)
|
||||
|
@ -20,7 +20,7 @@ def findISBN(title, author):
|
|||
|
||||
def getData(id):
|
||||
url = "http://www.amazon.com/title/dp/%s/" % id
|
||||
data = readUrlUnicode(url)
|
||||
data = read_url(url, unicode=True)
|
||||
|
||||
|
||||
def findData(key):
|
||||
|
@ -44,9 +44,9 @@ def getData(id):
|
|||
if not r['pages']:
|
||||
r['pages'] = findData('Hardcover')
|
||||
|
||||
r['review'] = stripTags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
||||
r['review'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
||||
|
||||
r['description'] = stripTags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
||||
r['description'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
||||
|
||||
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
|
||||
if r['cover']:
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import json
|
||||
import re
|
||||
|
||||
from ox.cache import readUrlUnicode
|
||||
from ox.cache import read_url
|
||||
|
||||
HEADERS = {
|
||||
'User-Agent': 'iTunes/10.4 (Macintosh; Intel Mac OS X 10.7) AppleWebKit/534.48.3',
|
||||
|
@ -26,21 +26,21 @@ def getMovieData(title, director):
|
|||
url += '&actorNames=&directorProducerName=' + director
|
||||
url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'
|
||||
HEADERS['Referer'] = url
|
||||
html = readUrlUnicode(url, headers=HEADERS)
|
||||
html = read_url(url, headers=HEADERS, unicode=True)
|
||||
regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'
|
||||
regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'
|
||||
results = re.compile(regexp).findall(html)
|
||||
if results:
|
||||
data['link'] = results[0][0]
|
||||
data['poster'] = results[0][1].replace('140x140', '600x600')
|
||||
html = readUrlUnicode(data['link'], headers=HEADERS)
|
||||
html = read_url(data['link'], headers=HEADERS, unicode=True)
|
||||
results = re.compile('video-preview-url="(.*?)"').findall(html)
|
||||
if results:
|
||||
data['trailer'] = results[0]
|
||||
# trailers section (preferred source for poster and trailer)
|
||||
host = 'http://trailers.apple.com'
|
||||
url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title
|
||||
js = json.loads(readUrlUnicode(url)[16:-4])
|
||||
js = json.loads(read_url(url, unicode=True)[16:-4])
|
||||
results = js['results']
|
||||
if results:
|
||||
url = host + results[0]['location']
|
||||
|
@ -49,11 +49,11 @@ def getMovieData(title, director):
|
|||
headers = {
|
||||
'User-Agent': USER_AGENT
|
||||
}
|
||||
html = readUrlUnicode(url, headers=headers)
|
||||
html = read_url(url, headers=headers, unicode=True)
|
||||
results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
|
||||
if results:
|
||||
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
|
||||
html = readUrlUnicode(url + 'includes/playlists/web.inc', headers=headers)
|
||||
html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
|
||||
results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
|
||||
if results:
|
||||
data['trailer'] = results[-1]
|
||||
|
|
|
@ -12,7 +12,7 @@ def getUrl(id):
|
|||
def getData(id):
|
||||
data = {}
|
||||
url = getUrl(id)
|
||||
details = cache.readUrl('%s?output=json' % url)
|
||||
details = cache.read_url('%s?output=json' % url)
|
||||
details = json.loads(details)
|
||||
for key in ('title', 'description', 'runtime'):
|
||||
data[key] = details['metadata'][key]
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
import re
|
||||
|
||||
import ox.cache
|
||||
from ox.cache import readUrlUnicode
|
||||
from ox.html import stripTags
|
||||
from ox.cache import read_url
|
||||
from ox.html import strip_tags
|
||||
from ox.text import findRe, removeSpecialCharacters
|
||||
|
||||
import imdb
|
||||
|
@ -30,19 +30,19 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
|||
"url": getUrl(id)
|
||||
}
|
||||
try:
|
||||
html = readUrlUnicode(data["url"], timeout=timeout)
|
||||
html = read_url(data["url"], timeout=timeout, unicode=True)
|
||||
except:
|
||||
html = ox.cache.readUrl(data["url"], timeout=timeout)
|
||||
html = ox.cache.read_url(data["url"], timeout=timeout)
|
||||
data["number"] = findRe(html, "<li>Spine #(\d+)")
|
||||
|
||||
data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
|
||||
data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
|
||||
data["director"] = stripTags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
|
||||
data["director"] = strip_tags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
|
||||
results = findRe(html, '<div class="left_column">(.*?)</div>')
|
||||
results = re.compile("<li>(.*?)</li>").findall(results)
|
||||
data["country"] = results[0]
|
||||
data["year"] = results[1]
|
||||
data["synopsis"] = stripTags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
|
||||
data["synopsis"] = strip_tags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
|
||||
|
||||
result = findRe(html, "<div class=\"purchase\">(.*?)</div>")
|
||||
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
|
||||
|
@ -53,7 +53,7 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
|||
if not "/boxsets/" in result:
|
||||
data["posters"] = [result]
|
||||
else:
|
||||
html_ = readUrlUnicode(result)
|
||||
html_ = read_url(result, unicode=True)
|
||||
result = findRe(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
|
||||
result = findRe(result, "src=\"(.*?)\"")
|
||||
if result:
|
||||
|
@ -77,7 +77,7 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
|||
|
||||
def getIds():
|
||||
ids = []
|
||||
html = readUrlUnicode("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine")
|
||||
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
|
||||
results = re.compile("\&p=(\d+)\&").findall(html)
|
||||
pages = max(map(int, results))
|
||||
for page in range(1, pages):
|
||||
|
@ -88,13 +88,13 @@ def getIds():
|
|||
def getIdsByPage(page):
|
||||
ids = []
|
||||
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
|
||||
html = readUrlUnicode(url)
|
||||
html = read_url(url, unicode=True)
|
||||
results = re.compile("films/(\d+)").findall(html)
|
||||
for result in results:
|
||||
ids.append(result)
|
||||
results = re.compile("boxsets/(.*?)\"").findall(html)
|
||||
for result in results:
|
||||
html = readUrlUnicode("http://www.criterion.com/boxsets/" + result)
|
||||
html = read_url("http://www.criterion.com/boxsets/" + result, unicode=True)
|
||||
results = re.compile("films/(\d+)").findall(html)
|
||||
for result in results:
|
||||
ids.append(result)
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from urllib import unquote
|
||||
from ox.cache import readUrl
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def getVideoUrl(url):
|
||||
|
@ -13,7 +13,7 @@ def getVideoUrl(url):
|
|||
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
|
||||
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
|
||||
'''
|
||||
data = readUrl(url)
|
||||
data = read_url(url)
|
||||
video = re.compile('''video", "(.*?)"''').findall(data)
|
||||
for v in video:
|
||||
v = unquote(v).split('@@')[0]
|
||||
|
|
|
@ -3,9 +3,9 @@
|
|||
import re
|
||||
import urllib
|
||||
import ox
|
||||
from ox import stripTags, decodeHtml
|
||||
from ox import strip_tags, decodeHtml
|
||||
from ox.utils import json
|
||||
from ox.cache import readUrlUnicode
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def find(query, timeout=ox.cache.cache_timeout):
|
||||
|
@ -13,10 +13,10 @@ def find(query, timeout=ox.cache.cache_timeout):
|
|||
query = query.encode('utf-8')
|
||||
params = urllib.urlencode({'q': query})
|
||||
url = 'http://duckduckgo.com/html/?' + params
|
||||
data = readUrlUnicode(url, timeout=timeout)
|
||||
data = read_url(url, timeout=timeout, unicode=True)
|
||||
results = []
|
||||
regex = '<a .*?class="l le" href="(.+?)">(.*?)</a>.*?<div class="cra">(.*?)</div>'
|
||||
for r in re.compile(regex, re.DOTALL).findall(data):
|
||||
results.append((stripTags(decodeHtml(r[1])), r[0], stripTags(decodeHtml(r[2]))))
|
||||
results.append((strip_tags(decodeHtml(r[1])), r[0], strip_tags(decodeHtml(r[2]))))
|
||||
return results
|
||||
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
import re
|
||||
import time
|
||||
|
||||
from ox import stripTags, findRe
|
||||
from ox.cache import readUrlUnicode
|
||||
from ox import strip_tags, findRe
|
||||
from ox.cache import read_url
|
||||
|
||||
import google
|
||||
|
||||
|
@ -21,9 +21,9 @@ def getShowUrl(title):
|
|||
return None
|
||||
|
||||
def getShowData(url):
|
||||
data = readUrlUnicode(url)
|
||||
data = read_url(url, unicode=True)
|
||||
r = {}
|
||||
r['title'] = stripTags(findRe(data, '<h1>(.*?)</h1>'))
|
||||
r['title'] = strip_tags(findRe(data, '<h1>(.*?)</h1>'))
|
||||
r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
|
||||
r['episodes'] = {}
|
||||
#1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
|
||||
|
|
|
@ -4,8 +4,8 @@
|
|||
import re
|
||||
from lxml.html import document_fromstring
|
||||
|
||||
from ox.cache import readUrlUnicode
|
||||
from ox import findRe, stripTags
|
||||
from ox.cache import read_url
|
||||
from ox import findRe, strip_tags
|
||||
from ox.web.imdb import ImdbCombined
|
||||
|
||||
|
||||
|
@ -32,7 +32,7 @@ def getData(id, timeout=-1):
|
|||
data = {
|
||||
"url": getUrl(id),
|
||||
}
|
||||
html = readUrlUnicode(data['url'], timeout=timeout)
|
||||
html = read_url(data['url'], timeout=timeout, timeout=True)
|
||||
doc = document_fromstring(html)
|
||||
|
||||
props = {
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import json
|
||||
|
||||
from ox.cache import readUrlUnicode
|
||||
from ox.cache import read_url
|
||||
from ox import findRe
|
||||
|
||||
class Imdb(dict):
|
||||
|
@ -12,7 +12,7 @@ class Imdb(dict):
|
|||
"http://graph.freebase.com/imdb.title.tt%s" % id
|
||||
might also be of interest at some point, right now not much info
|
||||
'''
|
||||
data = readUrlUnicode(url)
|
||||
data = read_url(url, unicode=True)
|
||||
try:
|
||||
data = json.loads(data)
|
||||
except ValueError:
|
||||
|
|
|
@ -4,13 +4,13 @@ import re
|
|||
import urllib
|
||||
|
||||
import ox
|
||||
from ox import stripTags, decodeHtml
|
||||
from ox import strip_tags, decodeHtml
|
||||
|
||||
DEFAULT_MAX_RESULTS = 10
|
||||
DEFAULT_TIMEOUT = 24*60*60
|
||||
|
||||
def readUrlUnicode(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
|
||||
return ox.cache.readUrlUnicode(url, data, headers, timeout)
|
||||
def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
|
||||
return ox.cache.read_url(url, data, headers, timeout, unicode=True)
|
||||
|
||||
def quote_plus(s):
|
||||
if not isinstance(s, str):
|
||||
|
@ -28,13 +28,13 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
|
|||
u'http://www.imdb.com/title/tt0133093/'
|
||||
"""
|
||||
url = 'http://google.com/search?q=%s' % quote_plus(query)
|
||||
data = readUrlUnicode(url, timeout=timeout)
|
||||
data = read_url(url, timeout=timeout)
|
||||
results = []
|
||||
data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
|
||||
for a in re.compile(
|
||||
'<a href="(\S+?)" class=l .*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>'
|
||||
).findall(data):
|
||||
results.append((stripTags(decodeHtml(a[1])), a[0], stripTags(decodeHtml(a[2]))))
|
||||
results.append((strip_tags(decodeHtml(a[1])), a[0], strip_tags(decodeHtml(a[2]))))
|
||||
if len(results) >= max_results:
|
||||
break
|
||||
return results
|
||||
|
|
|
@ -8,7 +8,7 @@ import time
|
|||
import unicodedata
|
||||
|
||||
import ox
|
||||
from ox import findRe, stripTags
|
||||
from ox import findRe, strip_tags
|
||||
from ox.normalize import normalizeTitle, normalizeImdbId
|
||||
import ox.cache
|
||||
|
||||
|
@ -16,12 +16,9 @@ from siteparser import SiteParser
|
|||
import google
|
||||
|
||||
|
||||
def readUrl(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None):
|
||||
def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
|
||||
headers = headers.copy()
|
||||
return ox.cache.readUrl(url, data, headers, timeout)
|
||||
|
||||
def readUrlUnicode(url, timeout=ox.cache.cache_timeout):
|
||||
return ox.cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
|
||||
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||
|
||||
def getUrl(id):
|
||||
return "http://www.imdb.com/title/tt%s/" % id
|
||||
|
@ -61,7 +58,7 @@ class Imdb(SiteParser):
|
|||
'page': 'combined',
|
||||
're': [
|
||||
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
|
||||
lambda ll: [stripTags(l) for l in ll]
|
||||
lambda ll: [strip_tags(l) for l in ll]
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
|
@ -266,8 +263,8 @@ class Imdb(SiteParser):
|
|||
}
|
||||
}
|
||||
|
||||
def readUrlUnicode(self, url, timeout):
|
||||
return readUrlUnicode(url, timeout)
|
||||
def read_url(self, url, timeout):
|
||||
return read_url(url, timeout, unicode=True)
|
||||
|
||||
def __init__(self, id, timeout=-1):
|
||||
#use akas.imdb.com to always get original title:
|
||||
|
@ -276,7 +273,7 @@ class Imdb(SiteParser):
|
|||
super(Imdb, self).__init__(timeout)
|
||||
|
||||
url = self.baseUrl + 'combined'
|
||||
page = self.readUrlUnicode(url, timeout=-1)
|
||||
page = self.read_url(url, timeout=-1)
|
||||
if '<title>IMDb: Page not found</title>' in page \
|
||||
or 'The requested URL was not found on our server.' in page:
|
||||
return
|
||||
|
@ -460,7 +457,7 @@ def getMovieIdByTitle(title, timeout=-1):
|
|||
params['q'] = params['q'].encode('utf-8')
|
||||
params = urllib.urlencode(params)
|
||||
url = "http://akas.imdb.com/find?" + params
|
||||
data = readUrlUnicode(url, timeout=timeout)
|
||||
data = read_url(url, timeout=timeout, unicode=True)
|
||||
#if search results in redirect, get id of current page
|
||||
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
|
||||
results = re.compile(r).findall(data)
|
||||
|
@ -538,7 +535,7 @@ def getMovieId(title, director='', year='', timeout=-1):
|
|||
url = "http://akas.imdb.com/find?" + params
|
||||
#print url
|
||||
|
||||
data = readUrlUnicode(url, timeout=timeout)
|
||||
data = read_url(url, timeout=timeout, unicode=True)
|
||||
#if search results in redirect, get id of current page
|
||||
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
|
||||
results = re.compile(r).findall(data)
|
||||
|
@ -569,7 +566,7 @@ def getMoviePoster(imdbId):
|
|||
info = ImdbCombined(imdbId)
|
||||
if 'posterId' in info:
|
||||
url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId)
|
||||
data = readUrl(url)
|
||||
data = read_url(url)
|
||||
poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')
|
||||
return poster
|
||||
elif 'series' in info:
|
||||
|
@ -578,7 +575,7 @@ def getMoviePoster(imdbId):
|
|||
|
||||
def maxVotes():
|
||||
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
|
||||
data = ox.cache.readUrl(url)
|
||||
data = ox.cache.read_url(url)
|
||||
votes = max([int(v.replace(',', ''))
|
||||
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
|
||||
return votes
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
# encoding: utf-8
|
||||
import re
|
||||
|
||||
from ox.cache import readUrlUnicode
|
||||
from ox.html import stripTags
|
||||
from ox.cache import read_url
|
||||
from ox.html import strip_tags
|
||||
from ox.text import findRe
|
||||
|
||||
|
||||
|
@ -21,11 +21,11 @@ def getData(id):
|
|||
data = {
|
||||
'url': getUrl(id)
|
||||
}
|
||||
html = readUrlUnicode(data['url'])
|
||||
html = read_url(data['url'], unicode=True)
|
||||
data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
|
||||
if not data['imdbId']:
|
||||
data['imdbId'] = _id_map.get(id, '')
|
||||
data['title'] = stripTags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
|
||||
data['title'] = strip_tags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
|
||||
data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
|
||||
data['posters'] = []
|
||||
poster = findRe(html, '<img src="(posters.*?)"')
|
||||
|
@ -36,11 +36,11 @@ def getData(id):
|
|||
for result in results:
|
||||
result = result.replace('_xlg.html', '.html')
|
||||
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
|
||||
html = readUrlUnicode(url)
|
||||
html = read_url(url, unicode=True)
|
||||
result = findRe(html, '<a href = (\w*?_xlg.html)')
|
||||
if result:
|
||||
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
|
||||
html = readUrlUnicode(url)
|
||||
html = read_url(url, unicode=True)
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
|
||||
else:
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)"'))
|
||||
|
@ -61,7 +61,7 @@ def getId(url):
|
|||
|
||||
def getIds():
|
||||
ids = []
|
||||
html = readUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 60*60)
|
||||
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
|
||||
pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
|
||||
for page in range(pages, 0, -1):
|
||||
for id in getIdsByPage(page):
|
||||
|
@ -71,7 +71,7 @@ def getIds():
|
|||
|
||||
def getIdsByPage(page):
|
||||
ids = []
|
||||
html = readUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1)
|
||||
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
|
||||
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
url = 'http://impawards.com/%s' % result
|
||||
|
@ -80,7 +80,7 @@ def getIdsByPage(page):
|
|||
|
||||
def getUrl(id):
|
||||
url = u"http://www.impawards.com/%s.html" % id
|
||||
html = readUrlUnicode(url)
|
||||
html = read_url(url, unicode=True)
|
||||
if findRe(html, "No Movie Posters on This Page"):
|
||||
url = u"http://www.impawards.com/%s_ver1.html" % id
|
||||
return url
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
import re
|
||||
import urllib
|
||||
|
||||
from ox.cache import readUrl
|
||||
from ox.html import decodeHtml, stripTags
|
||||
from ox.cache import read_url
|
||||
from ox.html import decodeHtml, strip_tags
|
||||
from ox.text import findRe
|
||||
from ox.text import findString
|
||||
|
||||
|
@ -113,20 +113,20 @@ class ItunesAlbum:
|
|||
|
||||
def getId(self):
|
||||
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
|
||||
xml = readUrl(url, headers = ITUNES_HEADERS)
|
||||
xml = read_url(url, headers = ITUNES_HEADERS)
|
||||
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
|
||||
return id
|
||||
|
||||
def getData(self):
|
||||
data = {'id': self.id}
|
||||
url = composeUrl('viewAlbum', {'id': self.id})
|
||||
xml = readUrl(url, None, ITUNES_HEADERS)
|
||||
xml = read_url(url, None, ITUNES_HEADERS)
|
||||
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
|
||||
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
|
||||
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
|
||||
data['genre'] = findRe(xml, 'Genre:(.*?)<')
|
||||
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
|
||||
data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
data['review'] = strip_tags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
data['tracks'] = []
|
||||
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
|
||||
for string in strings:
|
||||
|
@ -144,14 +144,14 @@ class ItunesMovie:
|
|||
|
||||
def getId(self):
|
||||
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
|
||||
xml = readUrl(url, headers = ITUNES_HEADERS)
|
||||
xml = read_url(url, headers = ITUNES_HEADERS)
|
||||
id = findRe(xml, 'viewMovie\?id=(.*?)&')
|
||||
return id
|
||||
|
||||
def getData(self):
|
||||
data = {'id': self.id}
|
||||
url = composeUrl('viewMovie', {'id': self.id})
|
||||
xml = readUrl(url, None, ITUNES_HEADERS)
|
||||
xml = read_url(url, None, ITUNES_HEADERS)
|
||||
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
|
||||
f.write(xml)
|
||||
f.close()
|
||||
|
|
|
@ -1,15 +1,15 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from ox.cache import readUrl
|
||||
from ox.cache import read_url
|
||||
from ox.html import decodeHtml
|
||||
from ox.text import findRe
|
||||
|
||||
|
||||
def getLyrics(title, artist):
|
||||
html = readUrl('http://lyricsfly.com/api/')
|
||||
html = read_url('http://lyricsfly.com/api/')
|
||||
key = findRe(html, '<font color=green><b>(.*?)</b></font>')
|
||||
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
|
||||
xml = readUrl(url)
|
||||
xml = read_url(url)
|
||||
lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
|
||||
lyrics = lyrics.replace('\n', '').replace('\r', '')
|
||||
lyrics = lyrics.replace('[br]', '\n').strip()
|
||||
|
|
|
@ -4,8 +4,8 @@ import re
|
|||
from urllib import quote
|
||||
from lxml.html import document_fromstring
|
||||
|
||||
from ox.cache import readUrl, readUrlUnicode
|
||||
from ox import findRe, stripTags
|
||||
from ox.cache import read_url
|
||||
from ox import findRe, strip_tags
|
||||
|
||||
def getUrl(id):
|
||||
return 'http://www.metacritic.com/movie/%s' % id
|
||||
|
@ -15,18 +15,18 @@ def getId(url):
|
|||
|
||||
def getUrlByImdb(imdb):
|
||||
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
|
||||
data = readUrl(url)
|
||||
data = read_url(url)
|
||||
metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"')
|
||||
return metacritic_url or None
|
||||
|
||||
def getMetacriticShowUrl(title):
|
||||
title = quote(title)
|
||||
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
|
||||
data = readUrl(url)
|
||||
data = read_url(url)
|
||||
return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
|
||||
|
||||
def getData(url):
|
||||
data = readUrlUnicode(url)
|
||||
data = read_url(url, unicode=True)
|
||||
doc = document_fromstring(data)
|
||||
score = filter(lambda s: s.attrib.get('property') == 'v:average',
|
||||
doc.xpath('//span[@class="score_value"]'))
|
||||
|
@ -51,7 +51,7 @@ def getData(url):
|
|||
'critic': authors[i],
|
||||
'url': urls[i],
|
||||
'source': sources[i],
|
||||
'quote': stripTags(reviews[i]).strip(),
|
||||
'quote': strip_tags(reviews[i]).strip(),
|
||||
'score': scores[i],
|
||||
})
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@ import re
|
|||
import socket
|
||||
from urllib import quote
|
||||
|
||||
from ox.cache import readUrl, readUrlUnicode
|
||||
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
|
||||
from ox.cache import read_url
|
||||
from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, int_value, normalizeNewlines
|
||||
from ox.normalize import normalizeImdbId
|
||||
import ox
|
||||
|
||||
|
@ -31,7 +31,7 @@ def findMovie(query, max_results=10):
|
|||
'''search for torrents on mininova
|
||||
'''
|
||||
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
|
||||
data = readUrlUnicode(url)
|
||||
data = read_url(url, unicode=True)
|
||||
return _parseResultsPage(data, max_results)
|
||||
|
||||
def findMovieByImdb(imdbId):
|
||||
|
@ -39,7 +39,7 @@ def findMovieByImdb(imdbId):
|
|||
'''
|
||||
results = []
|
||||
imdbId = normalizeImdbId(imdbId)
|
||||
data = readUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
|
||||
data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
|
||||
return _parseResultsPage(data)
|
||||
|
||||
def getId(mininovaId):
|
||||
|
@ -55,7 +55,7 @@ def getId(mininovaId):
|
|||
|
||||
def exists(mininovaId):
|
||||
mininovaId = getId(mininovaId)
|
||||
data = ox.net.readUrl("http://www.mininova.org/tor/%s" % mininovaId)
|
||||
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
|
||||
if not data or 'Torrent not found...' in data:
|
||||
return False
|
||||
if 'tracker</a> of this torrent requires registration.' in data:
|
||||
|
@ -74,22 +74,22 @@ def getData(mininovaId):
|
|||
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
|
||||
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
|
||||
|
||||
data = readUrlUnicode(torrent['comment_link']) + readUrlUnicode(torrent['details_link'])
|
||||
data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
|
||||
if '<h1>Torrent not found...</h1>' in data:
|
||||
return None
|
||||
|
||||
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
|
||||
key = d[0].lower().strip()
|
||||
key = _key_map.get(key, key)
|
||||
value = decodeHtml(stripTags(d[1].strip()))
|
||||
value = decodeHtml(strip_tags(d[1].strip()))
|
||||
torrent[key] = value
|
||||
|
||||
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
|
||||
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
|
||||
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
|
||||
if torrent['description']:
|
||||
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
|
||||
t = readUrl(torrent[u'torrent_link'])
|
||||
torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
|
||||
t = read_url(torrent[u'torrent_link'])
|
||||
torrent[u'torrent_info'] = getTorrentInfo(t)
|
||||
return torrent
|
||||
|
||||
|
@ -109,13 +109,13 @@ class Mininova(Torrent):
|
|||
self['seeder'] = -1
|
||||
self['leecher'] = -1
|
||||
if len(ratio) == 2:
|
||||
val = intValue(ratio[0].replace(',','').strip())
|
||||
val = int_value(ratio[0].replace(',','').strip())
|
||||
if val:
|
||||
self['seeder'] = int(val)
|
||||
val = intValue(ratio[1].replace(',','').strip())
|
||||
val = int_value(ratio[1].replace(',','').strip())
|
||||
if val:
|
||||
self['leecher'] = int(val)
|
||||
val = intValue(self.data['downloads'].replace(',','').strip())
|
||||
val = int_value(self.data['downloads'].replace(',','').strip())
|
||||
if val:
|
||||
self['downloaded'] = int(val)
|
||||
else:
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
import re
|
||||
|
||||
from ox.cache import readUrlUnicode
|
||||
from ox.cache import read_url
|
||||
from ox import findRe
|
||||
|
||||
def getData(id):
|
||||
|
@ -24,7 +24,7 @@ def getId(url):
|
|||
|
||||
def getPostersByUrl(url, group=True, timeout=-1):
|
||||
posters = []
|
||||
html = readUrlUnicode(url, timeout=timeout)
|
||||
html = read_url(url, timeout=timeout, unicode=True)
|
||||
if url in html:
|
||||
if group:
|
||||
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
|
||||
|
@ -32,7 +32,7 @@ def getPostersByUrl(url, group=True, timeout=-1):
|
|||
posters += getPostersByUrl(result, False)
|
||||
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
html = readUrlUnicode(result, timeout=timeout)
|
||||
html = read_url(result, timeout=timeout, unicode=True)
|
||||
posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
|
||||
return posters
|
||||
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
import re
|
||||
|
||||
import feedparser
|
||||
from ox.cache import readUrl, readUrlUnicode
|
||||
from ox import findRe, stripTags
|
||||
from ox.cache import read_url
|
||||
from ox import findRe, strip_tags
|
||||
from ox import langCode2To3, langTo3Code
|
||||
|
||||
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
|
||||
|
@ -16,7 +16,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
|
|||
if language:
|
||||
url += "sublanguageid-%s/" % language
|
||||
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
|
||||
data = readUrl(url)
|
||||
data = read_url(url)
|
||||
if "title>opensubtitles.com - search results</title" in data:
|
||||
fd = feedparser.parse(data)
|
||||
opensubtitleId = None
|
||||
|
@ -31,11 +31,11 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
|
|||
|
||||
def downloadSubtitleById(opensubtitle_id):
|
||||
srts = {}
|
||||
data = readUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
|
||||
data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
|
||||
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
|
||||
for f in re.compile(reg_exp, re.DOTALL).findall(data):
|
||||
name = stripTags(f[1]).split('\n')[0]
|
||||
name = strip_tags(f[1]).split('\n')[0]
|
||||
url = "http://www.opensubtitles.com%s" % f[0]
|
||||
srts[name] = readUrlUnicode(url)
|
||||
srts[name] = read_url(url, unicode=True)
|
||||
return srts
|
||||
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from ox.net import readUrlUnicode
|
||||
from ox.net import read_url
|
||||
|
||||
def getPosterUrl(id):
|
||||
url = 'http://piratecinema.org/posters/'
|
||||
html = readUrlUnicode(url)
|
||||
html = read_url(url, unicode=True)
|
||||
results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
|
||||
for result in results:
|
||||
if result[1] == id:
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from ox.cache import getHeaders, readUrl, readUrlUnicode
|
||||
from ox import findRe, stripTags
|
||||
from ox.cache import getHeaders, read_url
|
||||
from ox import findRe, strip_tags
|
||||
|
||||
|
||||
def getUrlByImdb(imdb):
|
||||
|
@ -14,7 +14,7 @@ def getUrlByImdb(imdb):
|
|||
return u.url
|
||||
'''
|
||||
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
|
||||
data = readUrl(url)
|
||||
data = read_url(url)
|
||||
if "movie_title" in data:
|
||||
movies = re.compile('(/m/.*?/)').findall(data)
|
||||
if movies:
|
||||
|
@ -25,13 +25,13 @@ def get_og(data, key):
|
|||
return findRe(data, '<meta property="og:%s".*?content="(.*?)"' % key)
|
||||
|
||||
def getData(url):
|
||||
data = readUrl(url)
|
||||
data = read_url(url)
|
||||
r = {}
|
||||
r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
|
||||
if '(' in r['title']:
|
||||
r['year'] = findRe(r['title'], '\((\d*?)\)')
|
||||
r['title'] = stripTags(re.sub('\((\d*?)\)', '', r['title'])).strip()
|
||||
r['summary'] = stripTags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
|
||||
r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
|
||||
r['summary'] = strip_tags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
|
||||
r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')
|
||||
if not r['summary']:
|
||||
r['summary'] = get_og(data, 'description')
|
||||
|
|
|
@ -2,16 +2,16 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from ..cache import readUrlUnicode
|
||||
from .. import stripTags, decodeHtml
|
||||
from ..cache import read_url
|
||||
from .. import strip_tags, decodeHtml
|
||||
from ..utils import datetime
|
||||
|
||||
|
||||
def cleanup(key, data, data_type):
|
||||
if data:
|
||||
if isinstance(data[0], basestring):
|
||||
#FIXME: some types need stripTags
|
||||
#data = [stripTags(decodeHtml(p)).strip() for p in data]
|
||||
#FIXME: some types need strip_tags
|
||||
#data = [strip_tags(decodeHtml(p)).strip() for p in data]
|
||||
data = [decodeHtml(p).strip() for p in data]
|
||||
elif isinstance(data[0], list) or isinstance(data[0], tuple):
|
||||
data = [cleanup(key, p, data_type) for p in data]
|
||||
|
@ -30,13 +30,13 @@ class SiteParser(dict):
|
|||
def getUrl(self, page):
|
||||
return "%s%s" % (self.baseUrl, page)
|
||||
|
||||
def readUrlUnicode(self, url, timeout):
|
||||
return readUrlUnicode(url, timeout=timeout)
|
||||
def read_url(self, url, timeout):
|
||||
return read_url(url, timeout=timeout, unicode=True)
|
||||
|
||||
def __init__(self, timeout=-1):
|
||||
for key in self.regex:
|
||||
url = self.getUrl(self.regex[key]['page'])
|
||||
data = self.readUrlUnicode(url, timeout)
|
||||
data = self.read_url(url, timeout)
|
||||
if isinstance(self.regex[key]['re'], basestring):
|
||||
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
|
||||
data = cleanup(key, data, self.regex[key]['type'])
|
||||
|
|
|
@ -5,7 +5,7 @@ import re
|
|||
import time
|
||||
|
||||
import ox.cache
|
||||
from ox.html import decodeHtml, stripTags
|
||||
from ox.html import decodeHtml, strip_tags
|
||||
import ox.net
|
||||
|
||||
|
||||
|
@ -21,11 +21,11 @@ def getNews(year, month, day):
|
|||
for section in sections:
|
||||
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
|
||||
if date == time.strftime('%d.%m.%Y', time.localtime()):
|
||||
html = ox.net.readUrl(url)
|
||||
html = ox.net.read_url(url)
|
||||
else:
|
||||
html = ox.cache.readUrl(url)
|
||||
html = ox.cache.read_url(url)
|
||||
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
|
||||
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
|
||||
dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
|
||||
try:
|
||||
description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
|
||||
except:
|
||||
|
@ -104,12 +104,12 @@ def getIssue(year, week):
|
|||
return None
|
||||
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
|
||||
contents = []
|
||||
data = ox.cache.readUrl(url)
|
||||
data = ox.cache.read_url(url)
|
||||
items = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
|
||||
for item in items:
|
||||
item = item[1]
|
||||
page = int(re.compile('&SE=(.*?)"').findall(item)[0])
|
||||
title = stripTags(item).strip()
|
||||
title = strip_tags(item).strip()
|
||||
contents.append({'title': title, 'page': page})
|
||||
pageUrl = {}
|
||||
pages = page + 2
|
||||
|
@ -163,7 +163,7 @@ def archiveIssues():
|
|||
f.close()
|
||||
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
|
||||
if not os.path.exists(filename):
|
||||
data = ox.cache.readUrl(issue['coverUrl'])
|
||||
data = ox.cache.read_url(issue['coverUrl'])
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
|
@ -172,7 +172,7 @@ def archiveIssues():
|
|||
if url:
|
||||
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
|
||||
if not os.path.exists(filename):
|
||||
data = ox.cache.readUrl(url)
|
||||
data = ox.cache.read_url(url)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
|
@ -243,7 +243,7 @@ def archiveNews():
|
|||
f.close()
|
||||
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
|
||||
if not os.path.exists(filename):
|
||||
data = ox.cache.readUrl(new['imageUrl'])
|
||||
data = ox.cache.read_url(new['imageUrl'])
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
|
|
|
@@ -6,8 +6,7 @@ import socket
from urllib import quote, urlencode
from urllib2 import URLError

from ox.cache import readUrl, readUrlUnicode
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, normalizeNewlines
from ox.normalize import normalizeImdbId
import ox

@@ -18,13 +17,10 @@ cache_timeout = 24*60*60 # cache search only for 24 hours
season_episode = re.compile("S..E..", re.IGNORECASE)

def _readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
headers['Cookie'] = 'language=en_EN'
return cache.readUrl(url, data, headers, timeout)

def _readUrlUnicode(url, timeout=cache.cache_timeout):
return cache.readUrlUnicode(url, _readUrl=_readUrl, timeout=timeout)
return cache.read_url(url, data, headers, timeout, unicode=unicode)

def findMovies(query, max_results=10):
results = []

@@ -37,7 +33,7 @@ def findMovies(query, max_results=10):
if not url.startswith('/'):
url = "/" + url
url = "http://thepiratebay.org" + url
data = _readUrlUnicode(url, timeout=cache_timeout)
data = read_url(url, timeout=cache_timeout, unicode=True)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]

@@ -83,7 +79,7 @@ def getData(piratebayId):
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId

data = _readUrlUnicode(torrent['comment_link'])
data = read_url(torrent['comment_link'], unicode=True)
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None

@@ -94,12 +90,12 @@ def getData(piratebayId):
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
value = decodeHtml(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _readUrl(torrent[u'torrent_link'])
torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent
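After this hunk the module keeps a single site-specific wrapper: the old _readUrl/_readUrlUnicode pair collapses into read_url, and callers ask for decoding with unicode=True. A rough usage sketch, assuming the wrapper defined above; the URLs are purely illustrative:

    # HTML pages are decoded to unicode, .torrent payloads stay raw bytes
    page = read_url('http://thepiratebay.org/torrent/12345', unicode=True)
    torrent_data = read_url('http://torrents.thepiratebay.org/12345/example.torrent')
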
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox import intValue
from ox import int_value

class Torrent(dict):

@@ -25,7 +25,7 @@ class Torrent(dict):
for key in self._int_keys:
value = self.data.get(key, -1)
if not isinstance(value, int):
value = int(intValue(value))
value = int(int_value(value))
self[key] = value
self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1)
@@ -3,8 +3,8 @@
import re
import time

from ox import stripTags, findRe
from ox.cache import readUrlUnicode
from ox import strip_tags, findRe
from ox.cache import read_url

def getEpisodeData(url):

@@ -14,9 +14,9 @@ def getEpisodeData(url):
example:
getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
'''
data = readUrlUnicode(url)
data = read_url(url, unicode=True)
r = {}
r['description'] = stripTags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['description'] = strip_tags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = findRe(data, '<h1>(.*?)</h1>')
r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
#episode score
@@ -4,13 +4,13 @@ import re
from StringIO import StringIO
import xml.etree.ElementTree as ET

from ox.cache import readUrl, readUrlUnicode
from ox.cache import read_url
from ox import findString, findRe

def getData(id):
url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
xml = readUrl(url)
xml = read_url(url)
tree = ET.parse(StringIO(xml))
request_signature = tree.find('request_signature').text
request_signature_expires = tree.find('request_signature_expires').text
@@ -4,7 +4,7 @@ import re
from urllib import urlencode

from ox.utils import json
from ox.cache import readUrl, readUrlUnicode
from ox.cache import read_url
from ox import findRe, decodeHtml

@@ -47,7 +47,7 @@ def getUrlByAllmovieId(allmovieId):
def getWikiData(wikipediaUrl):
url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
url = "%s&action=raw" % url
data = readUrl(url).decode('utf-8')
data = read_url(url).decode('utf-8')
return data

def getMovieData(wikipediaUrl):

@@ -106,7 +106,7 @@ def getMovieData(wikipediaUrl):

def getImageUrl(name):
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
data = readUrlUnicode(url)
data = read_url(url, unicode=True)
url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
if not url:
url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"')

@@ -133,9 +133,9 @@ def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
data = readUrl(url)
data = read_url(url)
if not data:
data = readUrl(url, timeout=0)
data = read_url(url, timeout=0)
result = json.loads(data)
results = []
if result and 'query' in result:
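The find() hunk keeps the fallback where an empty cached response triggers a second read_url call with timeout=0, which presumably treats every cached copy as stale and forces a fresh download. A minimal sketch of that retry pattern, assuming that interpretation of timeout=0:

    import json
    from ox.cache import read_url

    def cached_json(url):
        data = read_url(url)                  # may return an empty cached body
        if not data:
            data = read_url(url, timeout=0)   # assumption: timeout=0 bypasses the cache
        return json.loads(data)
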
@@ -5,7 +5,7 @@ import re
from xml.dom.minidom import parseString

import feedparser
from ox.cache import readUrl, cache_timeout
from ox.cache import read_url, cache_timeout

def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout):

@@ -33,7 +33,7 @@ def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout):
def find(query, max_results=10, offset=1, orderBy='relevance'):
query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
data = readUrl(url)
data = read_url(url)
fd = feedparser.parse(data)
videos = []
for item in fd.entries:

@@ -48,7 +48,7 @@ def find(query, max_results=10, offset=1, orderBy='relevance'):
def info(id):
info = {}
url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
data = readUrl(url)
data = read_url(url)
xml = parseString(data)
info['url'] = 'http://www.youtube.com/watch?v=%s' % id
info['title'] = xml.getElementsByTagName('title')[0].firstChild.data

@@ -62,21 +62,21 @@ def info(id):

info['keywords'] = xml.getElementsByTagName('media:keywords')[0].firstChild.data.split(', ')
url = "http://www.youtube.com/watch?v=%s" % id
data = readUrl(url)
data = read_url(url)
match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
if match:
info['license'] = match[0].strip()
info['license'] = re.sub('<.+?>', '', info['license']).strip()

url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1"%id
data = readUrl(url)
data = read_url(url)
xml = parseString(data)
languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
if languages:
info['subtitles'] = {}
for language in languages:
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
data = readUrl(url)
data = read_url(url)
xml = parseString(data)
subs = []
for t in xml.getElementsByTagName('text'):

@@ -101,7 +101,7 @@ def videos(id, format=''):
'mp4': 'video/mp4'
}.get(format)
url = "http://www.youtube.com/watch?v=%s" % id
data = readUrl(url)
data = read_url(url)
match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data)
streams = {}
for x in match[0].split(','):