net/cache readUrl->read_url / Unicode -> unicode=True

format: replace all CamelCase with under_score
This commit is contained in:
j 2012-08-14 15:58:05 +02:00
parent c1d0fc6242
commit 2de989e188
33 changed files with 243 additions and 254 deletions
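
For orientation, a minimal before/after sketch of the renamed fetch API (a usage illustration, not part of the diff; the URL is a placeholder):

    # before: CamelCase helpers with a separate unicode variant
    #   from ox.cache import readUrl, readUrlUnicode
    #   raw  = readUrl(url)
    #   text = readUrlUnicode(url)
    # after: snake_case name, unicode handled by a keyword flag
    from ox.cache import read_url

    url = 'http://www.example.com/'
    raw = read_url(url)                  # cached bytes (default cache_timeout is 30 days)
    text = read_url(url, unicode=True)   # decoded via detect_encoding(), latin-1 fallback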


@ -18,7 +18,7 @@ from utils import json
from .file import makedirs
import net
from net import DEFAULT_HEADERS, getEncoding
from net import DEFAULT_HEADERS, detect_encoding
cache_timeout = 30*24*60*60 # default is 30 days
@ -40,7 +40,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
>>> status('http://google.com/mysearch')
404
'''
headers = getHeaders(url, data, headers)
headers = get_headers(url, data, headers)
return int(headers['status'])
def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
@ -55,10 +55,10 @@ def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
return True
return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
url_headers = store.get(url, data, headers, timeout, "headers")
if not url_headers:
url_headers = net.getHeaders(url, data, headers)
url_headers = net.headers(url, data, headers)
store.set(url, data, -1, url_headers)
return url_headers
@ -68,7 +68,7 @@ class InvalidResult(Exception):
self.result = result
self.headers = headers
def readUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None):
def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
'''
url - url to load
data - possible post data
@ -80,31 +80,29 @@ def readUrl(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, vali
#FIXME: send last-modified / etag from cache and only update if needed
if isinstance(url, unicode):
url = url.encode('utf-8')
result = store.get(url, data, headers, timeout)
if not result:
data = store.get(url, data, headers, timeout)
if not data:
#print "get data", url
try:
url_headers, result = net.readUrl(url, data, headers, returnHeaders=True)
url_headers, data = net.read_url(url, data, headers, return_headers=True)
except urllib2.HTTPError, e:
e.headers['Status'] = "%s" % e.code
url_headers = dict(e.headers)
result = e.read()
data = e.read()
if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
if not valid or valid(result, url_headers):
store.set(url, data, result, url_headers)
data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
if not valid or valid(data, url_headers):
store.set(url, data, data, url_headers)
else:
raise InvalidResult(result, url_headers)
return result
raise InvalidResult(data, url_headers)
if unicode:
encoding = detect_encoding(data)
if not encoding:
encoding = 'latin-1'
data = data.decode(encoding)
return data
def readUrlUnicode(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, _readUrl=readUrl, valid=None):
data = _readUrl(url, data, headers, timeout, valid)
encoding = getEncoding(data)
if not encoding:
encoding = 'latin-1'
return unicode(data, encoding)
def saveUrl(url, filename, overwrite=False):
def save_url(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):

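A hedged sketch of the valid/unicode hooks kept by the new ox.cache.read_url above (the URL and callback are placeholders): a falsy return from the callback keeps the response out of the cache and raises InvalidResult, which carries the body and headers.

    from ox.cache import read_url, InvalidResult

    def looks_complete(body, headers):
        # only cache responses that contain a closing html tag
        return '</html>' in body.lower()

    try:
        page = read_url('http://www.example.com/', valid=looks_complete, unicode=True)
    except InvalidResult, e:          # Python 2 syntax, matching the module
        page = e.result               # rejected body; e.headers holds the response headers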

@ -217,15 +217,15 @@ def to36(q):
def from36(q):
return int(q, 36)
def intValue(strValue, default=u''):
def int_value(strValue, default=u''):
"""
>>> intValue('abc23')
>>> int_value('abc23')
u'23'
>>> intValue(' abc23')
>>> int_value(' abc23')
u'23'
>>> intValue('ab')
>>> int_value('ab')
u''
"""
try:
@ -234,15 +234,15 @@ def intValue(strValue, default=u''):
val = default
return val
def floatValue(strValue, default=u''):
def float_value(strValue, default=u''):
"""
>>> floatValue('abc23.4')
>>> float_value('abc23.4')
u'23.4'
>>> floatValue(' abc23.4')
>>> float_value(' abc23.4')
u'23.4'
>>> floatValue('ab')
>>> float_value('ab')
u''
"""
try:
@ -251,46 +251,46 @@ def floatValue(strValue, default=u''):
val = default
return val
def formatNumber(number, longName, shortName):
def format_number(number, longName, shortName):
"""
Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)
>>> formatNumber(123, 'Byte', 'B')
>>> format_number(123, 'Byte', 'B')
'123 Bytes'
>>> formatNumber(1234, 'Byte', 'B')
>>> format_number(1234, 'Byte', 'B')
'1 KB'
>>> formatNumber(1234567, 'Byte', 'B')
>>> format_number(1234567, 'Byte', 'B')
'1.2 MB'
>>> formatNumber(1234567890, 'Byte', 'B')
>>> format_number(1234567890, 'Byte', 'B')
'1.15 GB'
>>> formatNumber(1234567890123456789, 'Byte', 'B')
>>> format_number(1234567890123456789, 'Byte', 'B')
'1,096.5166 PB'
>>> formatNumber(-1234567890123456789, 'Byte', 'B')
>>> format_number(-1234567890123456789, 'Byte', 'B')
'-1,096.5166 PB'
"""
if abs(number) < 1024:
return '%s %s%s' % (formatThousands(number), longName, number != 1 and 's' or '')
return '%s %s%s' % (format_thousands(number), longName, number != 1 and 's' or '')
prefix = ['K', 'M', 'G', 'T', 'P']
for i in range(5):
if abs(number) < math.pow(1024, i + 2) or i == 4:
n = number / math.pow(1024, i + 1)
return '%s %s%s' % (formatThousands('%.*f' % (i, n)), prefix[i], shortName)
return '%s %s%s' % (format_thousands('%.*f' % (i, n)), prefix[i], shortName)
def formatThousands(number, separator = ','):
def format_thousands(number, separator = ','):
"""
Return the number with separators (1,000,000)
>>> formatThousands(1)
>>> format_thousands(1)
'1'
>>> formatThousands(1000)
>>> format_thousands(1000)
'1,000'
>>> formatThousands(1000000)
>>> format_thousands(1000000)
'1,000,000'
"""
string = str(number).split('.')
@ -302,16 +302,16 @@ def formatThousands(number, separator = ','):
string[0] = ''.join(l)
return '.'.join(string)
def formatBits(number):
return formatNumber(number, 'bit', 'b')
def format_bits(number):
return format_number(number, 'bit', 'b')
def formatBytes(number):
return formatNumber(number, 'byte', 'B')
def format_bytes(number):
return format_number(number, 'byte', 'B')
def formatPixels(number):
return formatNumber(number, 'pixel', 'px')
def format_pixels(number):
return format_number(number, 'pixel', 'px')
def formatCurrency(amount, currency="$"):
def format_currency(amount, currency="$"):
if amount:
temp = "%.2f" % amount
profile=re.compile(r"(\d)(\d\d\d[.,])")
@ -336,9 +336,9 @@ def plural(amount, unit, plural='s'):
if plural == 's':
unit = unit + plural
else: unit = plural
return "%s %s" % (formatThousands(amount), unit)
return "%s %s" % (format_thousands(amount), unit)
def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
'''
verbosity
0: D:HH:MM:SS
@ -353,13 +353,13 @@ def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
milliseconds
True: always display milliseconds
False: never display milliseconds
>>> formatDuration(1000 * 60 * 60 * 24 * 366)
>>> format_duration(1000 * 60 * 60 * 24 * 366)
'1:001:00:00:00.000'
>>> formatDuration(1000 * 60 * 60 * 24 * 366, years=False)
>>> format_duration(1000 * 60 * 60 * 24 * 366, years=False)
'366:00:00:00.000'
>>> formatDuration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
>>> format_duration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
'1 year 2 seconds 3 milliseconds'
>>> formatDuration(1000 * 30, hours=False, milliseconds=False)
>>> format_duration(1000 * 30, hours=False, milliseconds=False)
'00:30'
'''
if not ms and ms != 0:
@ -403,7 +403,7 @@ def formatDuration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
return duration
def ms2runtime(ms, shortenLong=False):
# deprecated - use formatDuration
# deprecated - use format_duration
'''
>>> ms2runtime(5000)
'5 seconds'
@ -415,11 +415,11 @@ def ms2runtime(ms, shortenLong=False):
'13 hours 53 minutes'
'''
if shortenLong and ms > 1000 * 60 * 60 * 24 * 464:
return formatDuration(ms, verbosity=1, milliseconds=False)
return formatDuration(ms, verbosity=2, milliseconds=False)
return format_duration(ms, verbosity=1, milliseconds=False)
return format_duration(ms, verbosity=2, milliseconds=False)
def ms2playtime(ms, hours=False):
# deprecated - use formatDuration
# deprecated - use format_duration
'''
>>> ms2playtime(5000)
'00:05'
@ -428,15 +428,15 @@ def ms2playtime(ms, hours=False):
>>> ms2playtime(50000000)
'13:53:20'
'''
return formatDuration(ms, hours=False, years=False, milliseconds=False)
return format_duration(ms, hours=False, years=False, milliseconds=False)
def ms2time(ms):
# deprecated - use formatDuration
# deprecated - use format_duration
'''
>>> ms2time(44592123)
'12:23:12.123'
'''
return formatDuration(ms, years=False)
return format_duration(ms, years=False)
def time2ms(timeString):
'''
@ -451,7 +451,7 @@ def time2ms(timeString):
ms = ms * 60 + float(_p)
return int(ms * 1000)
def shiftTime(offset, timeString):
def shift_time(offset, timeString):
newTime = time2ms(timeString) + offset
return ms2time(newTime)


@ -22,7 +22,7 @@ DEFAULT_HEADERS = {
def status(url, data=None, headers=DEFAULT_HEADERS):
try:
f = openUrl(url, data, headers)
f = open_url(url, data, headers)
s = f.code
except urllib2.HTTPError, e:
s = e.code
@ -34,9 +34,9 @@ def exists(url, data=None, headers=DEFAULT_HEADERS):
return True
return False
def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
def headers(url, data=None, headers=DEFAULT_HEADERS):
try:
f = openUrl(url, data, headers)
f = open_url(url, data, headers)
f.headers['Status'] = "%s" % f.code
headers = f.headers
f.close()
@ -45,30 +45,28 @@ def getHeaders(url, data=None, headers=DEFAULT_HEADERS):
headers = e.headers
return dict(headers)
def openUrl(url, data=None, headers=DEFAULT_HEADERS):
def open_url(url, data=None, headers=DEFAULT_HEADERS):
url = url.replace(' ', '%20')
req = urllib2.Request(url, data, headers)
return urllib2.urlopen(req)
def readUrl(url, data=None, headers=DEFAULT_HEADERS, returnHeaders=False):
f = openUrl(url, data, headers)
def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
f = open_url(url, data, headers)
data = f.read()
f.close()
if f.headers.get('content-encoding', None) == 'gzip':
data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
if returnHeaders:
if unicode:
encoding = detect_encoding(data)
if not encoding:
encoding = 'latin-1'
data = data.decode(encoding)
if return_headers:
f.headers['Status'] = "%s" % f.code
return dict(f.headers), data
return data
def readUrlUnicode(url, data=None, headers=DEFAULT_HEADERS):
data = readUrl(url, data, headers)
encoding = getEncoding(data)
if not encoding:
encoding = 'latin-1'
return unicode(data, encoding)
def getEncoding(data):
def detect_encoding(data):
if 'content="text/html; charset=utf-8"' in data:
return 'utf-8'
elif 'content="text/html; charset=iso-8859-1"' in data:
@ -81,7 +79,7 @@ def getEncoding(data):
detector.close()
return detector.result['encoding']
def saveUrl(url, filename, overwrite=False):
def save_url(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):

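The lower-level ox.net.read_url shown above gains the same keyword flags; a brief sketch (the URL is a placeholder), with the manual decode mirroring what unicode=True does internally:

    from ox.net import read_url, detect_encoding

    headers, body = read_url('http://www.example.com/', return_headers=True)
    text = body.decode(detect_encoding(body) or 'latin-1')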

@ -97,8 +97,8 @@ def encode(data):
for s in data:
srt += '%d\r\n%s --> %s\r\n%s\r\n\r\n' % (
i,
ox.formatDuration(s['in']*1000, years=False).replace('.', ','),
ox.formatDuration(s['out']*1000, years=False).replace('.', ','),
ox.format_duration(s['in']*1000, years=False).replace('.', ','),
ox.format_duration(s['out']*1000, years=False).replace('.', ','),
s['value'].replace('\n', '\r\n').strip()
)
i += 1


@ -3,8 +3,8 @@
import re
import time
from ox import stripTags, findRe
from ox.cache import readUrlUnicode
from ox import strip_tags, findRe
from ox.cache import read_url
def getId(url):
@ -26,7 +26,7 @@ def getData(id):
data = {
"url": getUrl(id)
}
html = readUrlUnicode(data["url"])
html = read_url(data["url"], unicode=True)
data['aka'] = parseList(html, 'AKA')
data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
data['countries'] = parseList(html, 'countries')
@ -40,18 +40,18 @@ def getData(id):
data['releasedate'] = parseList(html, 'release date')
data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
data['set'] = parseEntry(html, 'set in')
data['synopsis'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['synopsis'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['themes'] = parseList(html, 'themes')
data['types'] = parseList(html, 'types')
data['year'] = findRe(html, '<span class="year">.*?(\d+)')
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
#html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
#data['cast'] = parseTable(html)
#html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
#data['credits'] = parseTable(html)
html = readUrlUnicode("http://allmovie.com/work/%s/review" % id)
data['review'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
data['review'] = strip_tags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
return data
def getUrl(id):
@ -59,26 +59,26 @@ def getUrl(id):
def parseEntry(html, title):
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
return stripTags(html).strip()
return strip_tags(html).strip()
def parseList(html, title):
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
r = map(lambda x: stripTags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
if not r and html:
r = [stripTags(html)]
r = [strip_tags(html)]
return r
def parseTable(html):
return map(
lambda x: map(
lambda x: stripTags(x).strip().replace('&nbsp;', ''),
lambda x: strip_tags(x).strip().replace('&nbsp;', ''),
x.split('<td width="305">-')
),
findRe(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
)
def parseText(html, title):
return stripTags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
return strip_tags(findRe(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
if __name__ == '__main__':
print getData('129689')


@ -3,14 +3,14 @@
import re
from urllib import quote
from ox import findRe, stripTags, decodeHtml
from ox.cache import readUrlUnicode
from ox import findRe, strip_tags, decodeHtml
from ox.cache import read_url
def findISBN(title, author):
q = '%s %s' % (title, author)
url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
data = readUrlUnicode(url)
data = read_url(url, unicode=True)
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
id = findRe(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
data = getData(id)
@ -20,7 +20,7 @@ def findISBN(title, author):
def getData(id):
url = "http://www.amazon.com/title/dp/%s/" % id
data = readUrlUnicode(url)
data = read_url(url, unicode=True)
def findData(key):
@ -44,9 +44,9 @@ def getData(id):
if not r['pages']:
r['pages'] = findData('Hardcover')
r['review'] = stripTags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['review'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['description'] = stripTags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['description'] = strip_tags(findRe(data, '<h3 class="productDescriptionSource">Product Description</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
if r['cover']:


@ -1,7 +1,7 @@
import json
import re
from ox.cache import readUrlUnicode
from ox.cache import read_url
HEADERS = {
'User-Agent': 'iTunes/10.4 (Macintosh; Intel Mac OS X 10.7) AppleWebKit/534.48.3',
@ -26,21 +26,21 @@ def getMovieData(title, director):
url += '&actorNames=&directorProducerName=' + director
url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'
HEADERS['Referer'] = url
html = readUrlUnicode(url, headers=HEADERS)
html = read_url(url, headers=HEADERS, unicode=True)
regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'
regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'
results = re.compile(regexp).findall(html)
if results:
data['link'] = results[0][0]
data['poster'] = results[0][1].replace('140x140', '600x600')
html = readUrlUnicode(data['link'], headers=HEADERS)
html = read_url(data['link'], headers=HEADERS, unicode=True)
results = re.compile('video-preview-url="(.*?)"').findall(html)
if results:
data['trailer'] = results[0]
# trailers section (preferred source for poster and trailer)
host = 'http://trailers.apple.com'
url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title
js = json.loads(readUrlUnicode(url)[16:-4])
js = json.loads(read_url(url, unicode=True)[16:-4])
results = js['results']
if results:
url = host + results[0]['location']
@ -49,11 +49,11 @@ def getMovieData(title, director):
headers = {
'User-Agent': USER_AGENT
}
html = readUrlUnicode(url, headers=headers)
html = read_url(url, headers=headers, unicode=True)
results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
if results:
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
html = readUrlUnicode(url + 'includes/playlists/web.inc', headers=headers)
html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
if results:
data['trailer'] = results[-1]


@ -12,7 +12,7 @@ def getUrl(id):
def getData(id):
data = {}
url = getUrl(id)
details = cache.readUrl('%s?output=json' % url)
details = cache.read_url('%s?output=json' % url)
details = json.loads(details)
for key in ('title', 'description', 'runtime'):
data[key] = details['metadata'][key]


@ -3,8 +3,8 @@
import re
import ox.cache
from ox.cache import readUrlUnicode
from ox.html import stripTags
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import findRe, removeSpecialCharacters
import imdb
@ -30,19 +30,19 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
"url": getUrl(id)
}
try:
html = readUrlUnicode(data["url"], timeout=timeout)
html = read_url(data["url"], timeout=timeout, unicode=True)
except:
html = ox.cache.readUrl(data["url"], timeout=timeout)
html = ox.cache.read_url(data["url"], timeout=timeout)
data["number"] = findRe(html, "<li>Spine #(\d+)")
data["title"] = findRe(html, "<meta property=['\"]og:title['\"] content=['\"](.*?)['\"]")
data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
data["director"] = stripTags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
data["director"] = strip_tags(findRe(html, "<h2 class=\"director\">(.*?)</h2>"))
results = findRe(html, '<div class="left_column">(.*?)</div>')
results = re.compile("<li>(.*?)</li>").findall(results)
data["country"] = results[0]
data["year"] = results[1]
data["synopsis"] = stripTags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
data["synopsis"] = strip_tags(findRe(html, "<p><strong>SYNOPSIS:</strong> (.*?)</p>"))
result = findRe(html, "<div class=\"purchase\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
@ -53,7 +53,7 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
if not "/boxsets/" in result:
data["posters"] = [result]
else:
html_ = readUrlUnicode(result)
html_ = read_url(result, unicode=True)
result = findRe(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = findRe(result, "src=\"(.*?)\"")
if result:
@ -77,7 +77,7 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
def getIds():
ids = []
html = readUrlUnicode("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine")
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
results = re.compile("\&amp;p=(\d+)\&").findall(html)
pages = max(map(int, results))
for page in range(1, pages):
@ -88,13 +88,13 @@ def getIds():
def getIdsByPage(page):
ids = []
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
html = readUrlUnicode(url)
html = read_url(url, unicode=True)
results = re.compile("films/(\d+)").findall(html)
for result in results:
ids.append(result)
results = re.compile("boxsets/(.*?)\"").findall(html)
for result in results:
html = readUrlUnicode("http://www.criterion.com/boxsets/" + result)
html = read_url("http://www.criterion.com/boxsets/" + result, unicode=True)
results = re.compile("films/(\d+)").findall(html)
for result in results:
ids.append(result)


@ -2,7 +2,7 @@
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import unquote
from ox.cache import readUrl
from ox.cache import read_url
def getVideoUrl(url):
@ -13,7 +13,7 @@ def getVideoUrl(url):
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
'''
data = readUrl(url)
data = read_url(url)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
v = unquote(v).split('@@')[0]


@ -3,9 +3,9 @@
import re
import urllib
import ox
from ox import stripTags, decodeHtml
from ox import strip_tags, decodeHtml
from ox.utils import json
from ox.cache import readUrlUnicode
from ox.cache import read_url
def find(query, timeout=ox.cache.cache_timeout):
@ -13,10 +13,10 @@ def find(query, timeout=ox.cache.cache_timeout):
query = query.encode('utf-8')
params = urllib.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params
data = readUrlUnicode(url, timeout=timeout)
data = read_url(url, timeout=timeout, unicode=True)
results = []
regex = '<a .*?class="l le" href="(.+?)">(.*?)</a>.*?<div class="cra">(.*?)</div>'
for r in re.compile(regex, re.DOTALL).findall(data):
results.append((stripTags(decodeHtml(r[1])), r[0], stripTags(decodeHtml(r[2]))))
results.append((strip_tags(decodeHtml(r[1])), r[0], strip_tags(decodeHtml(r[2]))))
return results


@ -3,8 +3,8 @@
import re
import time
from ox import stripTags, findRe
from ox.cache import readUrlUnicode
from ox import strip_tags, findRe
from ox.cache import read_url
import google
@ -21,9 +21,9 @@ def getShowUrl(title):
return None
def getShowData(url):
data = readUrlUnicode(url)
data = read_url(url, unicode=True)
r = {}
r['title'] = stripTags(findRe(data, '<h1>(.*?)</h1>'))
r['title'] = strip_tags(findRe(data, '<h1>(.*?)</h1>'))
r['imdb'] = findRe(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
r['episodes'] = {}
#1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear


@ -4,8 +4,8 @@
import re
from lxml.html import document_fromstring
from ox.cache import readUrlUnicode
from ox import findRe, stripTags
from ox.cache import read_url
from ox import findRe, strip_tags
from ox.web.imdb import ImdbCombined
@ -32,7 +32,7 @@ def getData(id, timeout=-1):
data = {
"url": getUrl(id),
}
html = readUrlUnicode(data['url'], timeout=timeout)
html = read_url(data['url'], timeout=timeout, unicode=True)
doc = document_fromstring(html)
props = {


@ -2,7 +2,7 @@
# vi:si:et:sw=4:sts=4:ts=4
import json
from ox.cache import readUrlUnicode
from ox.cache import read_url
from ox import findRe
class Imdb(dict):
@ -12,7 +12,7 @@ class Imdb(dict):
"http://graph.freebase.com/imdb.title.tt%s" % id
might also be of interest at some point, right now not much info
'''
data = readUrlUnicode(url)
data = read_url(url, unicode=True)
try:
data = json.loads(data)
except ValueError:


@ -4,13 +4,13 @@ import re
import urllib
import ox
from ox import stripTags, decodeHtml
from ox import strip_tags, decodeHtml
DEFAULT_MAX_RESULTS = 10
DEFAULT_TIMEOUT = 24*60*60
def readUrlUnicode(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
return ox.cache.readUrlUnicode(url, data, headers, timeout)
def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
return ox.cache.read_url(url, data, headers, timeout, unicode=True)
def quote_plus(s):
if not isinstance(s, str):
@ -28,13 +28,13 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
u'http://www.imdb.com/title/tt0133093/'
"""
url = 'http://google.com/search?q=%s' % quote_plus(query)
data = readUrlUnicode(url, timeout=timeout)
data = read_url(url, timeout=timeout)
results = []
data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
for a in re.compile(
'<a href="(\S+?)" class=l .*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>'
).findall(data):
results.append((stripTags(decodeHtml(a[1])), a[0], stripTags(decodeHtml(a[2]))))
results.append((strip_tags(decodeHtml(a[1])), a[0], strip_tags(decodeHtml(a[2]))))
if len(results) >= max_results:
break
return results


@ -8,7 +8,7 @@ import time
import unicodedata
import ox
from ox import findRe, stripTags
from ox import findRe, strip_tags
from ox.normalize import normalizeTitle, normalizeImdbId
import ox.cache
@ -16,12 +16,9 @@ from siteparser import SiteParser
import google
def readUrl(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None):
def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
return ox.cache.readUrl(url, data, headers, timeout)
def readUrlUnicode(url, timeout=ox.cache.cache_timeout):
return ox.cache.readUrlUnicode(url, _readUrl=readUrl, timeout=timeout)
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
def getUrl(id):
return "http://www.imdb.com/title/tt%s/" % id
@ -61,7 +58,7 @@ class Imdb(SiteParser):
'page': 'combined',
're': [
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
lambda ll: [stripTags(l) for l in ll]
lambda ll: [strip_tags(l) for l in ll]
],
'type': 'list'
},
@ -266,8 +263,8 @@ class Imdb(SiteParser):
}
}
def readUrlUnicode(self, url, timeout):
return readUrlUnicode(url, timeout)
def read_url(self, url, timeout):
return read_url(url, timeout=timeout, unicode=True)
def __init__(self, id, timeout=-1):
#use akas.imdb.com to always get original title:
@ -276,7 +273,7 @@ class Imdb(SiteParser):
super(Imdb, self).__init__(timeout)
url = self.baseUrl + 'combined'
page = self.readUrlUnicode(url, timeout=-1)
page = self.read_url(url, timeout=-1)
if '<title>IMDb: Page not found</title>' in page \
or 'The requested URL was not found on our server.' in page:
return
@ -460,7 +457,7 @@ def getMovieIdByTitle(title, timeout=-1):
params['q'] = params['q'].encode('utf-8')
params = urllib.urlencode(params)
url = "http://akas.imdb.com/find?" + params
data = readUrlUnicode(url, timeout=timeout)
data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
results = re.compile(r).findall(data)
@ -538,7 +535,7 @@ def getMovieId(title, director='', year='', timeout=-1):
url = "http://akas.imdb.com/find?" + params
#print url
data = readUrlUnicode(url, timeout=timeout)
data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
results = re.compile(r).findall(data)
@ -569,7 +566,7 @@ def getMoviePoster(imdbId):
info = ImdbCombined(imdbId)
if 'posterId' in info:
url = "http://www.imdb.com/rg/action-box-title/primary-photo/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = readUrl(url)
data = read_url(url)
poster = findRe(data, 'img id="primary-img".*?src="(.*?)"')
return poster
elif 'series' in info:
@ -578,7 +575,7 @@ def getMoviePoster(imdbId):
def maxVotes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = ox.cache.readUrl(url)
data = ox.cache.read_url(url)
votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
return votes


@ -2,8 +2,8 @@
# encoding: utf-8
import re
from ox.cache import readUrlUnicode
from ox.html import stripTags
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import findRe
@ -21,11 +21,11 @@ def getData(id):
data = {
'url': getUrl(id)
}
html = readUrlUnicode(data['url'])
html = read_url(data['url'], unicode=True)
data['imdbId'] = findRe(html, 'imdb.com/title/tt(\d{7})')
if not data['imdbId']:
data['imdbId'] = _id_map.get(id, '')
data['title'] = stripTags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['title'] = strip_tags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')
data['posters'] = []
poster = findRe(html, '<img src="(posters.*?)"')
@ -36,11 +36,11 @@ def getData(id):
for result in results:
result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = readUrlUnicode(url)
html = read_url(url, unicode=True)
result = findRe(html, '<a href = (\w*?_xlg.html)')
if result:
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = readUrlUnicode(url)
html = read_url(url, unicode=True)
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))
else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)"'))
@ -61,7 +61,7 @@ def getId(url):
def getIds():
ids = []
html = readUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 60*60)
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
for page in range(pages, 0, -1):
for id in getIdsByPage(page):
@ -71,7 +71,7 @@ def getIds():
def getIdsByPage(page):
ids = []
html = readUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1)
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
for result in results:
url = 'http://impawards.com/%s' % result
@ -80,7 +80,7 @@ def getIdsByPage(page):
def getUrl(id):
url = u"http://www.impawards.com/%s.html" % id
html = readUrlUnicode(url)
html = read_url(url, unicode=True)
if findRe(html, "No Movie Posters on This Page"):
url = u"http://www.impawards.com/%s_ver1.html" % id
return url


@ -3,8 +3,8 @@
import re
import urllib
from ox.cache import readUrl
from ox.html import decodeHtml, stripTags
from ox.cache import read_url
from ox.html import decodeHtml, strip_tags
from ox.text import findRe
from ox.text import findString
@ -113,20 +113,20 @@ class ItunesAlbum:
def getId(self):
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
xml = readUrl(url, headers = ITUNES_HEADERS)
xml = read_url(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewAlbum\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewAlbum', {'id': self.id})
xml = readUrl(url, None, ITUNES_HEADERS)
xml = read_url(url, None, ITUNES_HEADERS)
data['albumName'] = findRe(xml, '<B>(.*?)</B>')
data['artistName'] = findRe(xml, '<b>(.*?)</b>')
data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
data['genre'] = findRe(xml, 'Genre:(.*?)<')
data['releaseDate'] = findRe(xml, 'Released(.*?)<')
data['review'] = stripTags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['review'] = strip_tags(findRe(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['tracks'] = []
strings = findRe(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
for string in strings:
@ -144,14 +144,14 @@ class ItunesMovie:
def getId(self):
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
xml = readUrl(url, headers = ITUNES_HEADERS)
xml = read_url(url, headers = ITUNES_HEADERS)
id = findRe(xml, 'viewMovie\?id=(.*?)&')
return id
def getData(self):
data = {'id': self.id}
url = composeUrl('viewMovie', {'id': self.id})
xml = readUrl(url, None, ITUNES_HEADERS)
xml = read_url(url, None, ITUNES_HEADERS)
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
f.write(xml)
f.close()


@ -1,15 +1,15 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox.cache import readUrl
from ox.cache import read_url
from ox.html import decodeHtml
from ox.text import findRe
def getLyrics(title, artist):
html = readUrl('http://lyricsfly.com/api/')
html = read_url('http://lyricsfly.com/api/')
key = findRe(html, '<font color=green><b>(.*?)</b></font>')
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
xml = readUrl(url)
xml = read_url(url)
lyrics = findRe(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip()


@ -4,8 +4,8 @@ import re
from urllib import quote
from lxml.html import document_fromstring
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, stripTags
from ox.cache import read_url
from ox import findRe, strip_tags
def getUrl(id):
return 'http://www.metacritic.com/movie/%s' % id
@ -15,18 +15,18 @@ def getId(url):
def getUrlByImdb(imdb):
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
data = readUrl(url)
data = read_url(url)
metacritic_url = findRe(data, '"(http://www.metacritic.com/movie/.*?)"')
return metacritic_url or None
def getMetacriticShowUrl(title):
title = quote(title)
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
data = readUrl(url)
data = read_url(url)
return findRe(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
def getData(url):
data = readUrlUnicode(url)
data = read_url(url, unicode=True)
doc = document_fromstring(data)
score = filter(lambda s: s.attrib.get('property') == 'v:average',
doc.xpath('//span[@class="score_value"]'))
@ -51,7 +51,7 @@ def getData(url):
'critic': authors[i],
'url': urls[i],
'source': sources[i],
'quote': stripTags(reviews[i]).strip(),
'quote': strip_tags(reviews[i]).strip(),
'score': scores[i],
})


@ -5,8 +5,8 @@ import re
import socket
from urllib import quote
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, intValue, normalizeNewlines
from ox.cache import read_url
from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, int_value, normalizeNewlines
from ox.normalize import normalizeImdbId
import ox
@ -31,7 +31,7 @@ def findMovie(query, max_results=10):
'''search for torrents on mininova
'''
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
data = readUrlUnicode(url)
data = read_url(url, unicode=True)
return _parseResultsPage(data, max_results)
def findMovieByImdb(imdbId):
@ -39,7 +39,7 @@ def findMovieByImdb(imdbId):
'''
results = []
imdbId = normalizeImdbId(imdbId)
data = readUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
return _parseResultsPage(data)
def getId(mininovaId):
@ -55,7 +55,7 @@ def getId(mininovaId):
def exists(mininovaId):
mininovaId = getId(mininovaId)
data = ox.net.readUrl("http://www.mininova.org/tor/%s" % mininovaId)
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
if not data or 'Torrent not found...' in data:
return False
if 'tracker</a> of this torrent requires registration.' in data:
@ -74,22 +74,22 @@ def getData(mininovaId):
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
data = readUrlUnicode(torrent['comment_link']) + readUrlUnicode(torrent['details_link'])
data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
if '<h1>Torrent not found...</h1>' in data:
return None
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
value = decodeHtml(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = findRe(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
torrent[u'description'] = findRe(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = readUrl(torrent[u'torrent_link'])
torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent
@ -109,13 +109,13 @@ class Mininova(Torrent):
self['seeder'] = -1
self['leecher'] = -1
if len(ratio) == 2:
val = intValue(ratio[0].replace(',','').strip())
val = int_value(ratio[0].replace(',','').strip())
if val:
self['seeder'] = int(val)
val = intValue(ratio[1].replace(',','').strip())
val = int_value(ratio[1].replace(',','').strip())
if val:
self['leecher'] = int(val)
val = intValue(self.data['downloads'].replace(',','').strip())
val = int_value(self.data['downloads'].replace(',','').strip())
if val:
self['downloaded'] = int(val)
else:


@ -3,7 +3,7 @@
import re
from ox.cache import readUrlUnicode
from ox.cache import read_url
from ox import findRe
def getData(id):
@ -24,7 +24,7 @@ def getId(url):
def getPostersByUrl(url, group=True, timeout=-1):
posters = []
html = readUrlUnicode(url, timeout=timeout)
html = read_url(url, timeout=timeout, unicode=True)
if url in html:
if group:
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
@ -32,7 +32,7 @@ def getPostersByUrl(url, group=True, timeout=-1):
posters += getPostersByUrl(result, False)
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
for result in results:
html = readUrlUnicode(result, timeout=timeout)
html = read_url(result, timeout=timeout, unicode=True)
posters.append(findRe(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
return posters


@ -3,8 +3,8 @@
import re
import feedparser
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, stripTags
from ox.cache import read_url
from ox import findRe, strip_tags
from ox import langCode2To3, langTo3Code
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
@ -16,7 +16,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
if language:
url += "sublanguageid-%s/" % language
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
data = readUrl(url)
data = read_url(url)
if "title>opensubtitles.com - search results</title" in data:
fd = feedparser.parse(data)
opensubtitleId = None
@ -31,11 +31,11 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
def downloadSubtitleById(opensubtitle_id):
srts = {}
data = readUrl('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
for f in re.compile(reg_exp, re.DOTALL).findall(data):
name = stripTags(f[1]).split('\n')[0]
name = strip_tags(f[1]).split('\n')[0]
url = "http://www.opensubtitles.com%s" % f[0]
srts[name] = readUrlUnicode(url)
srts[name] = read_url(url, unicode=True)
return srts


@ -1,11 +1,11 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox.net import readUrlUnicode
from ox.net import read_url
def getPosterUrl(id):
url = 'http://piratecinema.org/posters/'
html = readUrlUnicode(url)
html = read_url(url, unicode=True)
results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
for result in results:
if result[1] == id:


@ -2,8 +2,8 @@
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox.cache import getHeaders, readUrl, readUrlUnicode
from ox import findRe, stripTags
from ox.cache import get_headers, read_url
from ox import findRe, strip_tags
def getUrlByImdb(imdb):
@ -14,7 +14,7 @@ def getUrlByImdb(imdb):
return u.url
'''
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
data = readUrl(url)
data = read_url(url)
if "movie_title" in data:
movies = re.compile('(/m/.*?/)').findall(data)
if movies:
@ -25,13 +25,13 @@ def get_og(data, key):
return findRe(data, '<meta property="og:%s".*?content="(.*?)"' % key)
def getData(url):
data = readUrl(url)
data = read_url(url)
r = {}
r['title'] = findRe(data, '<h1 class="movie_title">(.*?)</h1>')
if '(' in r['title']:
r['year'] = findRe(r['title'], '\((\d*?)\)')
r['title'] = stripTags(re.sub('\((\d*?)\)', '', r['title'])).strip()
r['summary'] = stripTags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
r['summary'] = strip_tags(findRe(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
if not r['summary']:
r['summary'] = get_og(data, 'description')


@ -2,16 +2,16 @@
# vi:si:et:sw=4:sts=4:ts=4
import re
from ..cache import readUrlUnicode
from .. import stripTags, decodeHtml
from ..cache import read_url
from .. import strip_tags, decodeHtml
from ..utils import datetime
def cleanup(key, data, data_type):
if data:
if isinstance(data[0], basestring):
#FIXME: some types need stripTags
#data = [stripTags(decodeHtml(p)).strip() for p in data]
#FIXME: some types need strip_tags
#data = [strip_tags(decodeHtml(p)).strip() for p in data]
data = [decodeHtml(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data]
@ -30,13 +30,13 @@ class SiteParser(dict):
def getUrl(self, page):
return "%s%s" % (self.baseUrl, page)
def readUrlUnicode(self, url, timeout):
return readUrlUnicode(url, timeout=timeout)
def read_url(self, url, timeout):
return read_url(url, timeout=timeout, unicode=True)
def __init__(self, timeout=-1):
for key in self.regex:
url = self.getUrl(self.regex[key]['page'])
data = self.readUrlUnicode(url, timeout)
data = self.read_url(url, timeout)
if isinstance(self.regex[key]['re'], basestring):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type'])


@ -5,7 +5,7 @@ import re
import time
import ox.cache
from ox.html import decodeHtml, stripTags
from ox.html import decodeHtml, strip_tags
import ox.net
@ -21,11 +21,11 @@ def getNews(year, month, day):
for section in sections:
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
if date == time.strftime('%d.%m.%Y', time.localtime()):
html = ox.net.readUrl(url)
html = ox.net.read_url(url)
else:
html = ox.cache.readUrl(url)
html = ox.cache.read_url(url)
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
dateString = stripTags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
try:
description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
except:
@ -104,12 +104,12 @@ def getIssue(year, week):
return None
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
contents = []
data = ox.cache.readUrl(url)
data = ox.cache.read_url(url)
items = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
for item in items:
item = item[1]
page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
title = stripTags(item).strip()
title = strip_tags(item).strip()
contents.append({'title': title, 'page': page})
pageUrl = {}
pages = page + 2
@ -163,7 +163,7 @@ def archiveIssues():
f.close()
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
if not os.path.exists(filename):
data = ox.cache.readUrl(issue['coverUrl'])
data = ox.cache.read_url(issue['coverUrl'])
f = open(filename, 'w')
f.write(data)
f.close()
@ -172,7 +172,7 @@ def archiveIssues():
if url:
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
if not os.path.exists(filename):
data = ox.cache.readUrl(url)
data = ox.cache.read_url(url)
f = open(filename, 'w')
f.write(data)
f.close()
@ -243,7 +243,7 @@ def archiveNews():
f.close()
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
if not os.path.exists(filename):
data = ox.cache.readUrl(new['imageUrl'])
data = ox.cache.read_url(new['imageUrl'])
f = open(filename, 'w')
f.write(data)
f.close()


@ -6,8 +6,7 @@ import socket
from urllib import quote, urlencode
from urllib2 import URLError
from ox.cache import readUrl, readUrlUnicode
from ox import findRe, cache, stripTags, decodeHtml, getTorrentInfo, normalizeNewlines
from ox import findRe, cache, strip_tags, decodeHtml, getTorrentInfo, normalizeNewlines
from ox.normalize import normalizeImdbId
import ox
@ -18,13 +17,10 @@ cache_timeout = 24*60*60 # cache search only for 24 hours
season_episode = re.compile("S..E..", re.IGNORECASE)
def _readUrl(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None):
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
headers['Cookie'] = 'language=en_EN'
return cache.readUrl(url, data, headers, timeout)
def _readUrlUnicode(url, timeout=cache.cache_timeout):
return cache.readUrlUnicode(url, _readUrl=_readUrl, timeout=timeout)
return cache.read_url(url, data, headers, timeout, unicode=unicode)
def findMovies(query, max_results=10):
results = []
@ -37,7 +33,7 @@ def findMovies(query, max_results=10):
if not url.startswith('/'):
url = "/" + url
url = "http://thepiratebay.org" + url
data = _readUrlUnicode(url, timeout=cache_timeout)
data = read_url(url, timeout=cache_timeout, unicode=True)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
@ -83,7 +79,7 @@ def getData(piratebayId):
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
data = _readUrlUnicode(torrent['comment_link'])
data = read_url(torrent['comment_link'], unicode=True)
torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
@ -94,12 +90,12 @@ def getData(piratebayId):
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decodeHtml(stripTags(d[1].strip()))
value = decodeHtml(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
t = _readUrl(torrent[u'torrent_link'])
torrent['description'] = normalizeNewlines(decodeHtml(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = getTorrentInfo(t)
return torrent


@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox import intValue
from ox import int_value
class Torrent(dict):
@ -25,7 +25,7 @@ class Torrent(dict):
for key in self._int_keys:
value = self.data.get(key, -1)
if not isinstance(value, int):
value = int(intValue(value))
value = int(int_value(value))
self[key] = value
self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1)


@ -3,8 +3,8 @@
import re
import time
from ox import stripTags, findRe
from ox.cache import readUrlUnicode
from ox import strip_tags, findRe
from ox.cache import read_url
def getEpisodeData(url):
@ -14,9 +14,9 @@ def getEpisodeData(url):
example:
getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
'''
data = readUrlUnicode(url)
data = read_url(url, unicode=True)
r = {}
r['description'] = stripTags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['description'] = strip_tags(findRe(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = findRe(data, '<h1>(.*?)</h1>')
r['title'] = findRe(data, '<title>.*?: (.*?) - TV.com </title>')
#episode score


@ -4,13 +4,13 @@ import re
from StringIO import StringIO
import xml.etree.ElementTree as ET
from ox.cache import readUrl, readUrlUnicode
from ox.cache import read_url
from ox import findString, findRe
def getData(id):
url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
xml = readUrl(url)
xml = read_url(url)
tree = ET.parse(StringIO(xml))
request_signature = tree.find('request_signature').text
request_signature_expires = tree.find('request_signature_expires').text


@ -4,7 +4,7 @@ import re
from urllib import urlencode
from ox.utils import json
from ox.cache import readUrl, readUrlUnicode
from ox.cache import read_url
from ox import findRe, decodeHtml
@ -47,7 +47,7 @@ def getUrlByAllmovieId(allmovieId):
def getWikiData(wikipediaUrl):
url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
url = "%s&action=raw" % url
data = readUrl(url).decode('utf-8')
data = read_url(url).decode('utf-8')
return data
def getMovieData(wikipediaUrl):
@ -106,7 +106,7 @@ def getMovieData(wikipediaUrl):
def getImageUrl(name):
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
data = readUrlUnicode(url)
data = read_url(url, unicode=True)
url = findRe(data, 'href="(http://upload.wikimedia.org/.*?)"')
if not url:
url = findRe(data, 'href="(//upload.wikimedia.org/.*?)"')
@ -133,9 +133,9 @@ def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
data = readUrl(url)
data = read_url(url)
if not data:
data = readUrl(url, timeout=0)
data = read_url(url, timeout=0)
result = json.loads(data)
results = []
if result and 'query' in result:


@ -5,7 +5,7 @@ import re
from xml.dom.minidom import parseString
import feedparser
from ox.cache import readUrl, cache_timeout
from ox.cache import read_url, cache_timeout
def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout):
@ -33,7 +33,7 @@ def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout):
def find(query, max_results=10, offset=1, orderBy='relevance'):
query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
data = readUrl(url)
data = read_url(url)
fd = feedparser.parse(data)
videos = []
for item in fd.entries:
@ -48,7 +48,7 @@ def find(query, max_results=10, offset=1, orderBy='relevance'):
def info(id):
info = {}
url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
data = readUrl(url)
data = read_url(url)
xml = parseString(data)
info['url'] = 'http://www.youtube.com/watch?v=%s' % id
info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
@ -62,21 +62,21 @@ def info(id):
info['keywords'] = xml.getElementsByTagName('media:keywords')[0].firstChild.data.split(', ')
url = "http://www.youtube.com/watch?v=%s" % id
data = readUrl(url)
data = read_url(url)
match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
if match:
info['license'] = match[0].strip()
info['license'] = re.sub('<.+?>', '', info['license']).strip()
url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1"%id
data = readUrl(url)
data = read_url(url)
xml = parseString(data)
languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
if languages:
info['subtitles'] = {}
for language in languages:
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
data = readUrl(url)
data = read_url(url)
xml = parseString(data)
subs = []
for t in xml.getElementsByTagName('text'):
@ -101,7 +101,7 @@ def videos(id, format=''):
'mp4': 'video/mp4'
}.get(format)
url = "http://www.youtube.com/watch?v=%s" % id
data = readUrl(url)
data = read_url(url)
match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data)
streams = {}
for x in match[0].split(','):