ox.web under_score api rewrite
This commit is contained in:
parent
bb35daa95c
commit
a4fd3c930f
29 changed files with 268 additions and 285 deletions
|
@ -307,6 +307,8 @@ def parse_movie_path(path):
|
||||||
title = title.replace('_ ', ': ')
|
title = title.replace('_ ', ': ')
|
||||||
if title.endswith('_'):
|
if title.endswith('_'):
|
||||||
title = title[:-1] + '.'
|
title = title[:-1] + '.'
|
||||||
|
if title.startswith('_'):
|
||||||
|
title = '.' + title[1:]
|
||||||
|
|
||||||
year = find_re(title, '(\(\d{4}\))')
|
year = find_re(title, '(\(\d{4}\))')
|
||||||
if not year:
|
if not year:
|
||||||
|
@ -344,8 +346,9 @@ def parse_movie_path(path):
|
||||||
else:
|
else:
|
||||||
season = None
|
season = None
|
||||||
|
|
||||||
episode = find_re(parts[-1], '\.Episode (\d+)\.')
|
episode = find_re(parts[-1], '\.Episode[s]* ([\d+]+)\.')
|
||||||
if episode:
|
if episode:
|
||||||
|
episode = episode.split('+')[0]
|
||||||
episode = int(episode)
|
episode = int(episode)
|
||||||
else:
|
else:
|
||||||
episode = None
|
episode = None
|
||||||
|
|
|
@ -7,7 +7,7 @@ from utils import json, ET
|
||||||
|
|
||||||
def get_embed_code(url, maxwidth=None, maxheight=None):
|
def get_embed_code(url, maxwidth=None, maxheight=None):
|
||||||
embed = {}
|
embed = {}
|
||||||
header = cache.getHeaders(url)
|
header = cache.get_headers(url)
|
||||||
if header.get('content-type', '').startswith('text/html'):
|
if header.get('content-type', '').startswith('text/html'):
|
||||||
html = cache.readUrl(url)
|
html = cache.readUrl(url)
|
||||||
json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('<link.*?>').findall(html))
|
json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('<link.*?>').findall(html))
|
||||||
|
|
|
@ -7,68 +7,68 @@ from ox import strip_tags, find_re
|
||||||
from ox.cache import read_url
|
from ox.cache import read_url
|
||||||
|
|
||||||
|
|
||||||
def getId(url):
|
def get_id(url):
|
||||||
return url.split("/")[-1]
|
return url.split("/")[-1]
|
||||||
|
|
||||||
def getData(id):
|
def get_data(id):
|
||||||
'''
|
'''
|
||||||
>>> getData('129689')['cast'][1][1]
|
>>> get_data('129689')['cast'][1][1]
|
||||||
u'Marianne'
|
u'Marianne'
|
||||||
>>> getData('129689')['credits'][0][0]
|
>>> get_data('129689')['credits'][0][0]
|
||||||
u'Jean-Luc Godard'
|
u'Jean-Luc Godard'
|
||||||
>>> getData('129689')['posters'][0]
|
>>> get_data('129689')['posters'][0]
|
||||||
u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
|
u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
|
||||||
>>> getData('129689')['rating']
|
>>> get_data('129689')['rating']
|
||||||
u'4.5'
|
u'4.5'
|
||||||
'''
|
'''
|
||||||
if id.startswith('http'):
|
if id.startswith('http'):
|
||||||
id = getId(id)
|
id = get_id(id)
|
||||||
data = {
|
data = {
|
||||||
"url": getUrl(id)
|
"url": get_url(id)
|
||||||
}
|
}
|
||||||
html = read_url(data["url"], unicode=True)
|
html = read_url(data["url"], unicode=True)
|
||||||
data['aka'] = parseList(html, 'AKA')
|
data['aka'] = parse_list(html, 'AKA')
|
||||||
data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
|
data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
|
||||||
data['countries'] = parseList(html, 'countries')
|
data['countries'] = parse_list(html, 'countries')
|
||||||
data['director'] = parseEntry(html, 'directed by')
|
data['director'] = parse_entry(html, 'directed by')
|
||||||
data['genres'] = parseList(html, 'genres')
|
data['genres'] = parse_list(html, 'genres')
|
||||||
data['keywords'] = parseList(html, 'keywords')
|
data['keywords'] = parse_list(html, 'keywords')
|
||||||
data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
|
data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
|
||||||
data['produced'] = parseList(html, 'produced by')
|
data['produced'] = parse_list(html, 'produced by')
|
||||||
data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
|
data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
|
||||||
data['released'] = parseEntry(html, 'released by')
|
data['released'] = parse_entry(html, 'released by')
|
||||||
data['releasedate'] = parseList(html, 'release date')
|
data['releasedate'] = parse_list(html, 'release date')
|
||||||
data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
|
data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
|
||||||
data['set'] = parseEntry(html, 'set in')
|
data['set'] = parse_entry(html, 'set in')
|
||||||
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||||
data['themes'] = parseList(html, 'themes')
|
data['themes'] = parse_list(html, 'themes')
|
||||||
data['types'] = parseList(html, 'types')
|
data['types'] = parse_list(html, 'types')
|
||||||
data['year'] = find_re(html, '<span class="year">.*?(\d+)')
|
data['year'] = find_re(html, '<span class="year">.*?(\d+)')
|
||||||
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
|
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
|
||||||
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
|
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
|
||||||
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
|
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
|
||||||
#data['cast'] = parseTable(html)
|
#data['cast'] = parse_table(html)
|
||||||
#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
|
#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
|
||||||
#data['credits'] = parseTable(html)
|
#data['credits'] = parse_table(html)
|
||||||
html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
|
html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
|
||||||
data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def getUrl(id):
|
def get_url(id):
|
||||||
return "http://allmovie.com/work/%s" % id
|
return "http://allmovie.com/work/%s" % id
|
||||||
|
|
||||||
def parseEntry(html, title):
|
def parse_entry(html, title):
|
||||||
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
|
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
|
||||||
return strip_tags(html).strip()
|
return strip_tags(html).strip()
|
||||||
|
|
||||||
def parseList(html, title):
|
def parse_list(html, title):
|
||||||
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
|
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
|
||||||
r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
|
r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
|
||||||
if not r and html:
|
if not r and html:
|
||||||
r = [strip_tags(html)]
|
r = [strip_tags(html)]
|
||||||
return r
|
return r
|
||||||
|
|
||||||
def parseTable(html):
|
def parse_table(html):
|
||||||
return map(
|
return map(
|
||||||
lambda x: map(
|
lambda x: map(
|
||||||
lambda x: strip_tags(x).strip().replace(' ', ''),
|
lambda x: strip_tags(x).strip().replace(' ', ''),
|
||||||
|
@ -77,10 +77,10 @@ def parseTable(html):
|
||||||
find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
|
find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
|
||||||
)
|
)
|
||||||
|
|
||||||
def parseText(html, title):
|
def parse_text(html, title):
|
||||||
return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
|
return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print getData('129689')
|
print get_data('129689')
|
||||||
# print getData('177524')
|
# print get_data('177524')
|
||||||
|
|
||||||
|
|
|
@ -13,17 +13,17 @@ def findISBN(title, author):
|
||||||
data = read_url(url, unicode=True)
|
data = read_url(url, unicode=True)
|
||||||
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
|
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
|
||||||
id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
|
id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
|
||||||
data = getData(id)
|
data = get_data(id)
|
||||||
if author in data['authors']:
|
if author in data['authors']:
|
||||||
return data
|
return data
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def getData(id):
|
def get_data(id):
|
||||||
url = "http://www.amazon.com/title/dp/%s/" % id
|
url = "http://www.amazon.com/title/dp/%s/" % id
|
||||||
data = read_url(url, unicode=True)
|
data = read_url(url, unicode=True)
|
||||||
|
|
||||||
|
|
||||||
def findData(key):
|
def find_data(key):
|
||||||
return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
|
return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
|
||||||
|
|
||||||
r = {}
|
r = {}
|
||||||
|
@ -34,15 +34,15 @@ def getData(id):
|
||||||
t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
|
t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
|
||||||
if t:
|
if t:
|
||||||
r['translator'] = t
|
r['translator'] = t
|
||||||
r['publisher'] = findData('Publisher')
|
r['publisher'] = find_data('Publisher')
|
||||||
r['language'] = findData('Language')
|
r['language'] = find_data('Language')
|
||||||
r['isbn-10'] = findData('ISBN-10')
|
r['isbn-10'] = find_data('ISBN-10')
|
||||||
r['isbn-13'] = findData('ISBN-13').replace('-', '')
|
r['isbn-13'] = find_data('ISBN-13').replace('-', '')
|
||||||
r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
|
r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
|
||||||
|
|
||||||
r['pages'] = findData('Paperback')
|
r['pages'] = find_data('Paperback')
|
||||||
if not r['pages']:
|
if not r['pages']:
|
||||||
r['pages'] = findData('Hardcover')
|
r['pages'] = find_data('Hardcover')
|
||||||
|
|
||||||
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,7 @@ HEADERS = {
|
||||||
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
|
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
|
||||||
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
|
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
|
||||||
|
|
||||||
def getMovieData(title, director):
|
def get_movie_data(title, director):
|
||||||
if isinstance(title, unicode):
|
if isinstance(title, unicode):
|
||||||
title = title.encode('utf-8')
|
title = title.encode('utf-8')
|
||||||
if isinstance(director, unicode):
|
if isinstance(director, unicode):
|
||||||
|
@ -60,8 +60,8 @@ def getMovieData(title, director):
|
||||||
return data
|
return data
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print getMovieData('Alphaville', 'Jean-Luc Godard')
|
print get_movie_data('Alphaville', 'Jean-Luc Godard')
|
||||||
print getMovieData('Sin City', 'Roberto Rodriguez')
|
print get_movie_data('Sin City', 'Roberto Rodriguez')
|
||||||
print getMovieData('Breathless', 'Jean-Luc Godard')
|
print get_movie_data('Breathless', 'Jean-Luc Godard')
|
||||||
print getMovieData('Capitalism: A Love Story', 'Michael Moore')
|
print get_movie_data('Capitalism: A Love Story', 'Michael Moore')
|
||||||
print getMovieData('Film Socialisme', 'Jean-Luc Godard')
|
print get_movie_data('Film Socialisme', 'Jean-Luc Godard')
|
||||||
|
|
|
@ -3,15 +3,15 @@
|
||||||
from .. import cache
|
from .. import cache
|
||||||
from ..utils import json
|
from ..utils import json
|
||||||
|
|
||||||
def getId(url):
|
def get_id(url):
|
||||||
return url.split("/")[-1]
|
return url.split("/")[-1]
|
||||||
|
|
||||||
def getUrl(id):
|
def get_url(id):
|
||||||
return "http://www.archive.org/details/%s" % id
|
return "http://www.archive.org/details/%s" % id
|
||||||
|
|
||||||
def getData(id):
|
def get_data(id):
|
||||||
data = {}
|
data = {}
|
||||||
url = getUrl(id)
|
url = get_url(id)
|
||||||
details = cache.read_url('%s?output=json' % url)
|
details = cache.read_url('%s?output=json' % url)
|
||||||
details = json.loads(details)
|
details = json.loads(details)
|
||||||
for key in ('title', 'description', 'runtime'):
|
for key in ('title', 'description', 'runtime'):
|
||||||
|
|
|
@ -9,25 +9,25 @@ from ox.text import find_re, remove_special_characters
|
||||||
|
|
||||||
import imdb
|
import imdb
|
||||||
|
|
||||||
def getId(url):
|
def get_id(url):
|
||||||
return url.split("/")[-1]
|
return url.split("/")[-1]
|
||||||
|
|
||||||
def getUrl(id):
|
def get_url(id):
|
||||||
return "http://www.criterion.com/films/%s" % id
|
return "http://www.criterion.com/films/%s" % id
|
||||||
|
|
||||||
def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
||||||
'''
|
'''
|
||||||
>>> getData('1333')['imdbId']
|
>>> get_data('1333')['imdbId']
|
||||||
u'0060304'
|
u'0060304'
|
||||||
|
|
||||||
>>> getData('236')['posters'][0]
|
>>> get_data('236')['posters'][0]
|
||||||
u'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg'
|
u'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg'
|
||||||
|
|
||||||
>>> getData('786')['posters'][0]
|
>>> get_data('786')['posters'][0]
|
||||||
u'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg'
|
u'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg'
|
||||||
'''
|
'''
|
||||||
data = {
|
data = {
|
||||||
"url": getUrl(id)
|
"url": get_url(id)
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
html = read_url(data["url"], timeout=timeout, unicode=True)
|
html = read_url(data["url"], timeout=timeout, unicode=True)
|
||||||
|
@ -71,21 +71,21 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
||||||
if timeout == ox.cache.cache_timeout:
|
if timeout == ox.cache.cache_timeout:
|
||||||
timeout = -1
|
timeout = -1
|
||||||
if get_imdb:
|
if get_imdb:
|
||||||
data['imdbId'] = imdb.getMovieId(data['title'],
|
data['imdbId'] = imdb.get_movie_id(data['title'],
|
||||||
data['director'], data['year'], timeout=timeout)
|
data['director'], data['year'], timeout=timeout)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def getIds():
|
def get_ids():
|
||||||
ids = []
|
ids = []
|
||||||
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
|
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
|
||||||
results = re.compile("\&p=(\d+)\&").findall(html)
|
results = re.compile("\&p=(\d+)\&").findall(html)
|
||||||
pages = max(map(int, results))
|
pages = max(map(int, results))
|
||||||
for page in range(1, pages):
|
for page in range(1, pages):
|
||||||
for id in getIdsByPage(page):
|
for id in get_idsByPage(page):
|
||||||
ids.append(id)
|
ids.append(id)
|
||||||
return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
|
return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
|
||||||
|
|
||||||
def getIdsByPage(page):
|
def get_idsByPage(page):
|
||||||
ids = []
|
ids = []
|
||||||
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
|
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
|
||||||
html = read_url(url, unicode=True)
|
html = read_url(url, unicode=True)
|
||||||
|
@ -101,4 +101,4 @@ def getIdsByPage(page):
|
||||||
return set(ids)
|
return set(ids)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print getIds()
|
print get_ids()
|
||||||
|
|
|
@ -5,7 +5,7 @@ from urllib import unquote
|
||||||
from ox.cache import read_url
|
from ox.cache import read_url
|
||||||
|
|
||||||
|
|
||||||
def getVideoUrl(url):
|
def get_video_url(url):
|
||||||
'''
|
'''
|
||||||
>>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
|
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
|
||||||
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'
|
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'
|
||||||
|
|
|
@ -9,7 +9,7 @@ from ox.cache import read_url
|
||||||
import google
|
import google
|
||||||
|
|
||||||
|
|
||||||
def getShowUrl(title):
|
def get_show_url(title):
|
||||||
'''
|
'''
|
||||||
Search for the Epguide URL of a show by its title.
|
Search for the Epguide URL of a show by its title.
|
||||||
Google is used to search for the URL; this is also how it is done on Epguide.
|
Google is used to search for the URL; this is also how it is done on Epguide.
|
||||||
|
@ -20,7 +20,7 @@ def getShowUrl(title):
|
||||||
return url
|
return url
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def getShowData(url):
|
def get_show_data(url):
|
||||||
data = read_url(url, unicode=True)
|
data = read_url(url, unicode=True)
|
||||||
r = {}
|
r = {}
|
||||||
r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
|
r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
|
||||||
|
|
|
@ -9,28 +9,28 @@ from ox import find_re, strip_tags
|
||||||
from ox.web.imdb import ImdbCombined
|
from ox.web.imdb import ImdbCombined
|
||||||
|
|
||||||
|
|
||||||
def getData(id, timeout=-1):
|
def get_data(id, timeout=-1):
|
||||||
'''
|
'''
|
||||||
>>> getData('the-matrix')['poster']
|
>>> get_data('the-matrix')['poster']
|
||||||
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
|
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
|
||||||
|
|
||||||
>>> getData('0133093')['poster']
|
>>> get_data('0133093')['poster']
|
||||||
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
|
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
|
||||||
|
|
||||||
>>> getData('2-or-3-things-i-know-about-her')['poster']
|
>>> get_data('2-or-3-things-i-know-about-her')['poster']
|
||||||
'http://content6.flixster.com/movie/10/95/43/10954392_gal.jpg'
|
'http://content6.flixster.com/movie/10/95/43/10954392_gal.jpg'
|
||||||
|
|
||||||
>>> getData('0078875')['rottentomatoes_id']
|
>>> get_data('0078875')['rottentomatoes_id']
|
||||||
'http://www.rottentomatoes.com/m/the-tin-drum/'
|
'http://www.rottentomatoes.com/m/the-tin-drum/'
|
||||||
'''
|
'''
|
||||||
if len(id) == 7:
|
if len(id) == 7:
|
||||||
try:
|
try:
|
||||||
int(id)
|
int(id)
|
||||||
id = getIdByImdb(id)
|
id = get_id(imdb=id)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
data = {
|
data = {
|
||||||
"url": getUrl(id),
|
"url": get_url(id),
|
||||||
}
|
}
|
||||||
html = read_url(data['url'], timeout=timeout, timeout=True)
|
html = read_url(data['url'], timeout=timeout, timeout=True)
|
||||||
doc = document_fromstring(html)
|
doc = document_fromstring(html)
|
||||||
|
@ -55,21 +55,20 @@ def getData(id, timeout=-1):
|
||||||
return None
|
return None
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def getIdByImdb(imdbId):
|
def get_id(url=None, imdb=None):
|
||||||
'''
|
'''
|
||||||
>>> getIdByImdb('0133093')
|
>>> get_id(imdb='0133093')
|
||||||
u'the-matrix'
|
u'the-matrix'
|
||||||
|
|
||||||
#>>> getIdByImdb('0060304')
|
#>>> get_id(imdb='0060304')
|
||||||
#u'2-or-3-things-i-know-about-her'
|
#u'2-or-3-things-i-know-about-her'
|
||||||
'''
|
'''
|
||||||
i = ImdbCombined(imdbId)
|
if imdb:
|
||||||
|
i = ImdbCombined(imdb)
|
||||||
title = i['title']
|
title = i['title']
|
||||||
return title.replace(' ', '-').lower().replace("'", '')
|
return title.replace(' ', '-').lower().replace("'", '')
|
||||||
|
|
||||||
def getId(url):
|
|
||||||
return url.split('/')[-1]
|
return url.split('/')[-1]
|
||||||
|
|
||||||
def getUrl(id):
|
def get_url(id):
|
||||||
return "http://www.flixster.com/movie/%s"%id
|
return "http://www.flixster.com/movie/%s"%id
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ import json
|
||||||
from ox.cache import read_url
|
from ox.cache import read_url
|
||||||
from ox import find_re
|
from ox import find_re
|
||||||
|
|
||||||
class Imdb(dict):
|
class Freebase(dict):
|
||||||
def __init__(self, id, timeout=-1):
|
def __init__(self, id, timeout=-1):
|
||||||
url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id
|
url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id
|
||||||
'''
|
'''
|
||||||
|
|
|
@ -20,7 +20,7 @@ def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.
|
||||||
headers = headers.copy()
|
headers = headers.copy()
|
||||||
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
|
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||||
|
|
||||||
def getUrl(id):
|
def get_url(id):
|
||||||
return "http://www.imdb.com/title/tt%s/" % id
|
return "http://www.imdb.com/title/tt%s/" % id
|
||||||
|
|
||||||
class Imdb(SiteParser):
|
class Imdb(SiteParser):
|
||||||
|
@ -420,7 +420,7 @@ class ImdbCombined(Imdb):
|
||||||
self.regex = _regex
|
self.regex = _regex
|
||||||
super(ImdbCombined, self).__init__(id, timeout)
|
super(ImdbCombined, self).__init__(id, timeout)
|
||||||
|
|
||||||
def getMovieIdByTitle(title, timeout=-1):
|
def get_movie_by_title(title, timeout=-1):
|
||||||
'''
|
'''
|
||||||
This only works for exact title matches from the data dump
|
This only works for exact title matches from the data dump
|
||||||
Usually in the format
|
Usually in the format
|
||||||
|
@ -431,22 +431,22 @@ def getMovieIdByTitle(title, timeout=-1):
|
||||||
If there is more than one film with that title for the year
|
If there is more than one film with that title for the year
|
||||||
Title (Year/I)
|
Title (Year/I)
|
||||||
|
|
||||||
>>> getMovieIdByTitle(u'"Father Knows Best" (1954) {(#5.34)}')
|
>>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')
|
||||||
u'1602860'
|
u'1602860'
|
||||||
|
|
||||||
>>> getMovieIdByTitle(u'The Matrix (1999)')
|
>>> get_movie_by_title(u'The Matrix (1999)')
|
||||||
u'0133093'
|
u'0133093'
|
||||||
|
|
||||||
>>> getMovieIdByTitle(u'Little Egypt (1951)')
|
>>> get_movie_by_title(u'Little Egypt (1951)')
|
||||||
u'0043748'
|
u'0043748'
|
||||||
|
|
||||||
>>> getMovieIdByTitle(u'Little Egypt (1897/I)')
|
>>> get_movie_by_title(u'Little Egypt (1897/I)')
|
||||||
u'0214882'
|
u'0214882'
|
||||||
|
|
||||||
>>> getMovieIdByTitle(u'Little Egypt')
|
>>> get_movie_by_title(u'Little Egypt')
|
||||||
None
|
None
|
||||||
|
|
||||||
>>> getMovieIdByTitle(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
|
>>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
|
||||||
u'0866567'
|
u'0866567'
|
||||||
'''
|
'''
|
||||||
params = {'s':'tt','q': title}
|
params = {'s':'tt','q': title}
|
||||||
|
@ -465,21 +465,21 @@ def getMovieIdByTitle(title, timeout=-1):
|
||||||
return results[0]
|
return results[0]
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def getMovieId(title, director='', year='', timeout=-1):
|
def get_movie_id(title, director='', year='', timeout=-1):
|
||||||
'''
|
'''
|
||||||
>>> getMovieId('The Matrix')
|
>>> get_movie_id('The Matrix')
|
||||||
u'0133093'
|
u'0133093'
|
||||||
|
|
||||||
>>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
|
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
|
||||||
u'0060304'
|
u'0060304'
|
||||||
|
|
||||||
>>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
|
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
|
||||||
u'0060304'
|
u'0060304'
|
||||||
|
|
||||||
>>> getMovieId(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
|
>>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
|
||||||
u'0179214'
|
u'0179214'
|
||||||
|
|
||||||
>>> getMovieId(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
|
>>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
|
||||||
u'0179214'
|
u'0179214'
|
||||||
'''
|
'''
|
||||||
imdbId = {
|
imdbId = {
|
||||||
|
@ -555,12 +555,12 @@ def getMovieId(title, director='', year='', timeout=-1):
|
||||||
#or nothing
|
#or nothing
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getMoviePoster(imdbId):
|
def get_movie_poster(imdbId):
|
||||||
'''
|
'''
|
||||||
>>> getMoviePoster('0133093')
|
>>> get_movie_poster('0133093')
|
||||||
'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
|
'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
|
||||||
|
|
||||||
>>> getMoviePoster('0994352')
|
>>> get_movie_poster('0994352')
|
||||||
'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
|
'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
|
||||||
'''
|
'''
|
||||||
info = ImdbCombined(imdbId)
|
info = ImdbCombined(imdbId)
|
||||||
|
@ -570,10 +570,10 @@ def getMoviePoster(imdbId):
|
||||||
poster = find_re(data, 'img id="primary-img".*?src="(.*?)"')
|
poster = find_re(data, 'img id="primary-img".*?src="(.*?)"')
|
||||||
return poster
|
return poster
|
||||||
elif 'series' in info:
|
elif 'series' in info:
|
||||||
return getMoviePoster(info['series'])
|
return get_movie_poster(info['series'])
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def maxVotes():
|
def max_votes():
|
||||||
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
|
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
|
||||||
data = ox.cache.read_url(url)
|
data = ox.cache.read_url(url)
|
||||||
votes = max([int(v.replace(',', ''))
|
votes = max([int(v.replace(',', ''))
|
||||||
|
@ -581,7 +581,7 @@ def maxVotes():
|
||||||
return votes
|
return votes
|
||||||
|
|
||||||
def guess(title, director='', timeout=-1):
|
def guess(title, director='', timeout=-1):
|
||||||
return getMovieId(title, director, timeout=timeout)
|
return get_movie_id(title, director, timeout=timeout)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import json
|
import json
|
||||||
|
|
|
@ -7,19 +7,19 @@ from ox.html import strip_tags
|
||||||
from ox.text import find_re
|
from ox.text import find_re
|
||||||
|
|
||||||
|
|
||||||
def getData(id):
|
def get_data(id):
|
||||||
'''
|
'''
|
||||||
>>> getData('1991/silence_of_the_lambs')['imdbId']
|
>>> get_data('1991/silence_of_the_lambs')['imdbId']
|
||||||
u'0102926'
|
u'0102926'
|
||||||
|
|
||||||
>>> getData('1991/silence_of_the_lambs')['posters'][0]
|
>>> get_data('1991/silence_of_the_lambs')['posters'][0]
|
||||||
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
|
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
|
||||||
|
|
||||||
>>> getData('1991/silence_of_the_lambs')['url']
|
>>> get_data('1991/silence_of_the_lambs')['url']
|
||||||
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
|
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
|
||||||
'''
|
'''
|
||||||
data = {
|
data = {
|
||||||
'url': getUrl(id)
|
'url': get_url(id)
|
||||||
}
|
}
|
||||||
html = read_url(data['url'], unicode=True)
|
html = read_url(data['url'], unicode=True)
|
||||||
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
|
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
|
||||||
|
@ -48,7 +48,7 @@ def getData(id):
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def getId(url):
|
def get_id(url):
|
||||||
split = url.split('/')
|
split = url.split('/')
|
||||||
year = split[3]
|
year = split[3]
|
||||||
split = split[4][:-5].split('_')
|
split = split[4][:-5].split('_')
|
||||||
|
@ -59,26 +59,25 @@ def getId(url):
|
||||||
id = '%s/%s' % (year, '_'.join(split))
|
id = '%s/%s' % (year, '_'.join(split))
|
||||||
return id
|
return id
|
||||||
|
|
||||||
def getIds():
|
def get_ids(page=None):
|
||||||
ids = []
|
|
||||||
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
|
|
||||||
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
|
|
||||||
for page in range(pages, 0, -1):
|
|
||||||
for id in getIdsByPage(page):
|
|
||||||
if not id in ids:
|
|
||||||
ids.append(id)
|
|
||||||
return ids
|
|
||||||
|
|
||||||
def getIdsByPage(page):
|
|
||||||
ids = []
|
ids = []
|
||||||
|
if page:
|
||||||
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
|
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
|
||||||
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
|
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
|
||||||
for result in results:
|
for result in results:
|
||||||
url = 'http://impawards.com/%s' % result
|
url = 'http://impawards.com/%s' % result
|
||||||
ids.append(getId(url))
|
ids.append(get_id(url))
|
||||||
return set(ids)
|
return set(ids)
|
||||||
|
# no page given: walk every archive page and collect all ids
|
||||||
|
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
|
||||||
|
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
|
||||||
|
for page in range(pages, 0, -1):
|
||||||
|
for id in get_ids(page):
|
||||||
|
if not id in ids:
|
||||||
|
ids.append(id)
|
||||||
|
return ids
|
||||||
|
|
||||||
def getUrl(id):
|
def get_url(id):
|
||||||
url = u"http://www.impawards.com/%s.html" % id
|
url = u"http://www.impawards.com/%s.html" % id
|
||||||
html = read_url(url, unicode=True)
|
html = read_url(url, unicode=True)
|
||||||
if find_re(html, "No Movie Posters on This Page"):
|
if find_re(html, "No Movie Posters on This Page"):
|
||||||
|
@ -297,5 +296,5 @@ _id_map = {
|
||||||
}
|
}
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
ids = getIds()
|
ids = get_ids()
|
||||||
print sorted(ids), len(ids)
|
print sorted(ids), len(ids)
|
||||||
|
|
|
@ -24,7 +24,7 @@ ITUNES_HEADERS = {
|
||||||
'Connection': 'close',
|
'Connection': 'close',
|
||||||
}
|
}
|
||||||
|
|
||||||
def composeUrl(request, parameters):
|
def compose_url(request, parameters):
|
||||||
if request == 'advancedSearch':
|
if request == 'advancedSearch':
|
||||||
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
|
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
|
||||||
if parameters['media'] == 'music':
|
if parameters['media'] == 'music':
|
||||||
|
@ -60,7 +60,7 @@ def composeUrl(request, parameters):
|
||||||
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
|
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def parseXmlDict(xml):
|
def parse_xml_dict(xml):
|
||||||
values = {}
|
values = {}
|
||||||
strings = xml.split('<key>')
|
strings = xml.split('<key>')
|
||||||
for string in strings:
|
for string in strings:
|
||||||
|
@ -78,7 +78,7 @@ def parseXmlDict(xml):
|
||||||
values[key] = value
|
values[key] = value
|
||||||
return values
|
return values
|
||||||
|
|
||||||
def parseCast(xml, title):
|
def parse_cast(xml, title):
|
||||||
list = []
|
list = []
|
||||||
try:
|
try:
|
||||||
strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
|
strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
|
||||||
|
@ -89,7 +89,7 @@ def parseCast(xml, title):
|
||||||
except:
|
except:
|
||||||
return list
|
return list
|
||||||
|
|
||||||
def parseMovies(xml, title):
|
def parse_movies(xml, title):
|
||||||
list = []
|
list = []
|
||||||
try:
|
try:
|
||||||
strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
|
strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
|
||||||
|
@ -109,17 +109,17 @@ class ItunesAlbum:
|
||||||
self.title = title
|
self.title = title
|
||||||
self.artist = artist
|
self.artist = artist
|
||||||
if not id:
|
if not id:
|
||||||
self.id = self.getId()
|
self.id = self.get_id()
|
||||||
|
|
||||||
def getId(self):
|
def get_id(self):
|
||||||
url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
|
url = compose_url('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
|
||||||
xml = read_url(url, headers = ITUNES_HEADERS)
|
xml = read_url(url, headers = ITUNES_HEADERS)
|
||||||
id = find_re(xml, 'viewAlbum\?id=(.*?)&')
|
id = find_re(xml, 'viewAlbum\?id=(.*?)&')
|
||||||
return id
|
return id
|
||||||
|
|
||||||
def getData(self):
|
def get_data(self):
|
||||||
data = {'id': self.id}
|
data = {'id': self.id}
|
||||||
url = composeUrl('viewAlbum', {'id': self.id})
|
url = compose_url('viewAlbum', {'id': self.id})
|
||||||
xml = read_url(url, None, ITUNES_HEADERS)
|
xml = read_url(url, None, ITUNES_HEADERS)
|
||||||
data['albumName'] = find_re(xml, '<B>(.*?)</B>')
|
data['albumName'] = find_re(xml, '<B>(.*?)</B>')
|
||||||
data['artistName'] = find_re(xml, '<b>(.*?)</b>')
|
data['artistName'] = find_re(xml, '<b>(.*?)</b>')
|
||||||
|
@ -130,7 +130,7 @@ class ItunesAlbum:
|
||||||
data['tracks'] = []
|
data['tracks'] = []
|
||||||
strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
|
strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
|
||||||
for string in strings:
|
for string in strings:
|
||||||
data['tracks'].append(parseXmlDict(string))
|
data['tracks'].append(parse_xml_dict(string))
|
||||||
data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
|
data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
@ -140,48 +140,48 @@ class ItunesMovie:
|
||||||
self.title = title
|
self.title = title
|
||||||
self.director = director
|
self.director = director
|
||||||
if not id:
|
if not id:
|
||||||
self.id = self.getId()
|
self.id = self.get_id()
|
||||||
|
|
||||||
def getId(self):
|
def get_id(self):
|
||||||
url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
|
url = compose_url('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
|
||||||
xml = read_url(url, headers = ITUNES_HEADERS)
|
xml = read_url(url, headers = ITUNES_HEADERS)
|
||||||
id = find_re(xml, 'viewMovie\?id=(.*?)&')
|
id = find_re(xml, 'viewMovie\?id=(.*?)&')
|
||||||
return id
|
return id
|
||||||
|
|
||||||
def getData(self):
|
def get_data(self):
|
||||||
data = {'id': self.id}
|
data = {'id': self.id}
|
||||||
url = composeUrl('viewMovie', {'id': self.id})
|
url = compose_url('viewMovie', {'id': self.id})
|
||||||
xml = read_url(url, None, ITUNES_HEADERS)
|
xml = read_url(url, None, ITUNES_HEADERS)
|
||||||
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
|
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
|
||||||
f.write(xml)
|
f.write(xml)
|
||||||
f.close()
|
f.close()
|
||||||
data['actors'] = parseCast(xml, 'actors')
|
data['actors'] = parse_cast(xml, 'actors')
|
||||||
string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
|
string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
|
||||||
data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
|
data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
|
||||||
data['directors'] = parseCast(xml, 'directors')
|
data['directors'] = parse_cast(xml, 'directors')
|
||||||
data['format'] = find_re(xml, 'Format:(.*?)<')
|
data['format'] = find_re(xml, 'Format:(.*?)<')
|
||||||
data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
|
data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
|
||||||
data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||||
data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
|
data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
|
||||||
data['producers'] = parseCast(xml, 'producers')
|
data['producers'] = parse_cast(xml, 'producers')
|
||||||
data['rated'] = find_re(xml, 'Rated(.*?)<')
|
data['rated'] = find_re(xml, 'Rated(.*?)<')
|
||||||
data['relatedMovies'] = parseMovies(xml, 'related movies')
|
data['relatedMovies'] = parse_movies(xml, 'related movies')
|
||||||
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
|
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
|
||||||
data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
|
data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
|
||||||
data['screenwriters'] = parseCast(xml, 'screenwriters')
|
data['screenwriters'] = parse_cast(xml, 'screenwriters')
|
||||||
data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
|
data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
|
||||||
data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
|
data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
|
||||||
return data
|
return data
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from ox.utils import json
|
from ox.utils import json
|
||||||
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
|
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').get_data()
|
||||||
print json.dumps(data, sort_keys = True, indent = 4)
|
print json.dumps(data, sort_keys = True, indent = 4)
|
||||||
data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
|
data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').get_data()
|
||||||
print json.dumps(data, sort_keys = True, indent = 4)
|
print json.dumps(data, sort_keys = True, indent = 4)
|
||||||
for v in data['relatedMovies']:
|
for v in data['relatedMovies']:
|
||||||
data = ItunesMovie(id = v['id']).getData()
|
data = ItunesMovie(id = v['id']).get_data()
|
||||||
print json.dumps(data, sort_keys = True, indent = 4)
|
print json.dumps(data, sort_keys = True, indent = 4)
|
||||||
data = ItunesMovie(id='272960052').getData()
|
data = ItunesMovie(id='272960052').get_data()
|
||||||
print json.dumps(data, sort_keys = True, indent = 4)
|
print json.dumps(data, sort_keys = True, indent = 4)
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ from ox.html import decode_html
|
||||||
from ox.text import find_re
|
from ox.text import find_re
|
||||||
|
|
||||||
|
|
||||||
def getLyrics(title, artist):
|
def get_lyrics(title, artist):
|
||||||
html = read_url('http://lyricsfly.com/api/')
|
html = read_url('http://lyricsfly.com/api/')
|
||||||
key = find_re(html, '<font color=green><b>(.*?)</b></font>')
|
key = find_re(html, '<font color=green><b>(.*?)</b></font>')
|
||||||
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
|
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
|
||||||
|
|
|
@ -7,25 +7,24 @@ from lxml.html import document_fromstring
|
||||||
from ox.cache import read_url
|
from ox.cache import read_url
|
||||||
from ox import find_re, strip_tags
|
from ox import find_re, strip_tags
|
||||||
|
|
||||||
def getUrl(id):
|
def get_url(id=None, imdb=None):
|
||||||
return 'http://www.metacritic.com/movie/%s' % id
|
if imdb:
|
||||||
|
|
||||||
def getId(url):
|
|
||||||
return url.split('/')[-1]
|
|
||||||
|
|
||||||
def getUrlByImdb(imdb):
|
|
||||||
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
|
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
|
||||||
data = read_url(url)
|
data = read_url(url)
|
||||||
metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
|
metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
|
||||||
return metacritic_url or None
|
return metacritic_url or None
|
||||||
|
return 'http://www.metacritic.com/movie/%s' % id
|
||||||
|
|
||||||
def getMetacriticShowUrl(title):
|
def get_id(url):
|
||||||
|
return url.split('/')[-1]
|
||||||
|
|
||||||
|
def get_show_url(title):
|
||||||
title = quote(title)
|
title = quote(title)
|
||||||
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
|
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
|
||||||
data = read_url(url)
|
data = read_url(url)
|
||||||
return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
|
return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
|
||||||
|
|
||||||
def getData(url):
|
def get_data(url):
|
||||||
data = read_url(url, unicode=True)
|
data = read_url(url, unicode=True)
|
||||||
doc = document_fromstring(data)
|
doc = document_fromstring(data)
|
||||||
score = filter(lambda s: s.attrib.get('property') == 'v:average',
|
score = filter(lambda s: s.attrib.get('property') == 'v:average',
|
||||||
|
@ -57,7 +56,7 @@ def getData(url):
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'critics': metacritics,
|
'critics': metacritics,
|
||||||
'id': getId(url),
|
'id': get_id(url),
|
||||||
'score': score,
|
'score': score,
|
||||||
'url': url,
|
'url': url,
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,7 @@ import ox
|
||||||
from torrent import Torrent
|
from torrent import Torrent
|
||||||
|
|
||||||
|
|
||||||
def _parseResultsPage(data, max_results=10):
|
def _parse_results_page(data, max_results=10):
|
||||||
results=[]
|
results=[]
|
||||||
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
|
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
|
||||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||||
|
@ -27,22 +27,17 @@ def _parseResultsPage(data, max_results=10):
|
||||||
results.append((torrentTitle, torrentLink, ''))
|
results.append((torrentTitle, torrentLink, ''))
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def findMovie(query, max_results=10):
|
def find_movie(query=None, imdb=None, max_results=10):
|
||||||
'''search for torrents on mininova
|
'''search for torrents on mininova
|
||||||
'''
|
'''
|
||||||
|
if imdb:
|
||||||
|
url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
|
||||||
|
else:
|
||||||
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
|
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
|
||||||
data = read_url(url, unicode=True)
|
data = read_url(url, unicode=True)
|
||||||
return _parseResultsPage(data, max_results)
|
return _parse_results_page(data, max_results)
|
||||||
|
|
||||||
def findMovieByImdb(imdbId):
|
def get_id(mininovaId):
|
||||||
'''find torrents on mininova for a given imdb id
|
|
||||||
'''
|
|
||||||
results = []
|
|
||||||
imdbId = normalize_imdbid(imdbId)
|
|
||||||
data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
|
|
||||||
return _parseResultsPage(data)
|
|
||||||
|
|
||||||
def getId(mininovaId):
|
|
||||||
mininovaId = unicode(mininovaId)
|
mininovaId = unicode(mininovaId)
|
||||||
d = find_re(mininovaId, "/(\d+)")
|
d = find_re(mininovaId, "/(\d+)")
|
||||||
if d:
|
if d:
|
||||||
|
@ -54,7 +49,7 @@ def getId(mininovaId):
|
||||||
return mininovaId[-1]
|
return mininovaId[-1]
|
||||||
|
|
||||||
def exists(mininovaId):
|
def exists(mininovaId):
|
||||||
mininovaId = getId(mininovaId)
|
mininovaId = get_id(mininovaId)
|
||||||
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
|
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
|
||||||
if not data or 'Torrent not found...' in data:
|
if not data or 'Torrent not found...' in data:
|
||||||
return False
|
return False
|
||||||
|
@ -62,11 +57,11 @@ def exists(mininovaId):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def getData(mininovaId):
|
def get_data(mininovaId):
|
||||||
_key_map = {
|
_key_map = {
|
||||||
'by': u'uploader',
|
'by': u'uploader',
|
||||||
}
|
}
|
||||||
mininovaId = getId(mininovaId)
|
mininovaId = get_id(mininovaId)
|
||||||
torrent = dict()
|
torrent = dict()
|
||||||
torrent[u'id'] = mininovaId
|
torrent[u'id'] = mininovaId
|
||||||
torrent[u'domain'] = 'mininova.org'
|
torrent[u'domain'] = 'mininova.org'
|
||||||
|
@ -101,7 +96,7 @@ class Mininova(Torrent):
|
||||||
'72dfa59d2338e4a48c78cec9de25964cddb64104'
|
'72dfa59d2338e4a48c78cec9de25964cddb64104'
|
||||||
'''
|
'''
|
||||||
def __init__(self, mininovaId):
|
def __init__(self, mininovaId):
|
||||||
self.data = getData(mininovaId)
|
self.data = get_data(mininovaId)
|
||||||
if not self.data:
|
if not self.data:
|
||||||
return
|
return
|
||||||
Torrent.__init__(self)
|
Torrent.__init__(self)
|
||||||
|
|
|
@ -6,39 +6,39 @@ import re
|
||||||
from ox.cache import read_url
|
from ox.cache import read_url
|
||||||
from ox import find_re
|
from ox import find_re
|
||||||
|
|
||||||
def getData(id):
|
def get_data(id):
|
||||||
'''
|
'''
|
||||||
>>> getData('0060304')['posters'][0]
|
>>> get_data('0060304')['posters'][0]
|
||||||
u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
|
u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
|
||||||
>>> getData('0123456')['posters']
|
>>> get_data('0123456')['posters']
|
||||||
[]
|
[]
|
||||||
'''
|
'''
|
||||||
data = {
|
data = {
|
||||||
"url": getUrl(id)
|
"url": get_url(id)
|
||||||
}
|
}
|
||||||
data["posters"] = getPostersByUrl(data["url"])
|
data["posters"] = get_posters(data["url"])
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def getId(url):
|
def get_id(url):
|
||||||
return url.split("/")[-2]
|
return url.split("/")[-2]
|
||||||
|
|
||||||
def getPostersByUrl(url, group=True, timeout=-1):
|
def get_posters(url, group=True, timeout=-1):
|
||||||
posters = []
|
posters = []
|
||||||
html = read_url(url, timeout=timeout, unicode=True)
|
html = read_url(url, timeout=timeout, unicode=True)
|
||||||
if url in html:
|
if url in html:
|
||||||
if group:
|
if group:
|
||||||
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
|
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
|
||||||
for result in results:
|
for result in results:
|
||||||
posters += getPostersByUrl(result, False)
|
posters += get_posters(result, False)
|
||||||
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
|
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
|
||||||
for result in results:
|
for result in results:
|
||||||
html = read_url(result, timeout=timeout, unicode=True)
|
html = read_url(result, timeout=timeout, unicode=True)
|
||||||
posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
|
posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
|
||||||
return posters
|
return posters
|
||||||
|
|
||||||
def getUrl(id):
|
def get_url(id):
|
||||||
return "http://www.movieposterdb.com/movie/%s/" % id
|
return "http://www.movieposterdb.com/movie/%s/" % id
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print getData('0060304')
|
print get_data('0060304')
|
||||||
print getData('0133093')
|
print get_data('0133093')
|
||||||
|
|
|
@ -7,7 +7,7 @@ from ox.cache import read_url
|
||||||
from ox import find_re, strip_tags
|
from ox import find_re, strip_tags
|
||||||
from ox import langCode2To3, langTo3Code
|
from ox import langCode2To3, langTo3Code
|
||||||
|
|
||||||
def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
|
def find_subtitles(imdb, parts = 1, language = "eng"):
|
||||||
if len(language) == 2:
|
if len(language) == 2:
|
||||||
language = langCode2To3(language)
|
language = langCode2To3(language)
|
||||||
elif len(language) != 3:
|
elif len(language) != 3:
|
||||||
|
@ -29,7 +29,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
|
||||||
opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
|
opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
|
||||||
return opensubtitleId
|
return opensubtitleId
|
||||||
|
|
||||||
def downloadSubtitleById(opensubtitle_id):
|
def download_subtitle(opensubtitle_id):
|
||||||
srts = {}
|
srts = {}
|
||||||
data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
|
data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
|
||||||
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
|
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
# vi:si:et:sw=4:sts=4:ts=4
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
import ox.cache
|
import ox.cache
|
||||||
|
|
||||||
def getPosterUrl(id):
|
def get_poster_url(id):
|
||||||
url = "http://0xdb.org/%s/poster.0xdb.jpg" % id
|
url = "http://0xdb.org/%s/poster.0xdb.jpg" % id
|
||||||
if ox.cache.exists(url):
|
if ox.cache.exists(url):
|
||||||
return url
|
return url
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
import re
|
import re
|
||||||
from ox.net import read_url
|
from ox.net import read_url
|
||||||
|
|
||||||
def getPosterUrl(id):
|
def get_poster_url(id):
|
||||||
url = 'http://piratecinema.org/posters/'
|
url = 'http://piratecinema.org/posters/'
|
||||||
html = read_url(url, unicode=True)
|
html = read_url(url, unicode=True)
|
||||||
results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
|
results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
|
||||||
|
@ -13,5 +13,5 @@ def getPosterUrl(id):
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print getPosterUrl('0749451')
|
print get_poster_url('0749451')
|
||||||
|
|
||||||
|
|
|
@ -2,17 +2,18 @@
|
||||||
# vi:si:et:sw=4:sts=4:ts=4
|
# vi:si:et:sw=4:sts=4:ts=4
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from ox.cache import getHeaders, read_url
|
from ox.cache import read_url
|
||||||
from ox import find_re, strip_tags
|
from ox import find_re, strip_tags
|
||||||
|
|
||||||
|
|
||||||
def getUrlByImdb(imdb):
|
def get_url(id=None, imdb=None):
|
||||||
#this would also wor but does not cache:
|
#this would also wor but does not cache:
|
||||||
'''
|
'''
|
||||||
from urllib2 import urlopen
|
from urllib2 import urlopen
|
||||||
u = urlopen(url)
|
u = urlopen(url)
|
||||||
return u.url
|
return u.url
|
||||||
'''
|
'''
|
||||||
|
if imdb:
|
||||||
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
|
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
|
||||||
data = read_url(url)
|
data = read_url(url)
|
||||||
if "movie_title" in data:
|
if "movie_title" in data:
|
||||||
|
@ -24,7 +25,7 @@ def getUrlByImdb(imdb):
|
||||||
def get_og(data, key):
|
def get_og(data, key):
|
||||||
return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
|
return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
|
||||||
|
|
||||||
def getData(url):
|
def get_data(url):
|
||||||
data = read_url(url)
|
data = read_url(url)
|
||||||
r = {}
|
r = {}
|
||||||
r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
|
r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
|
||||||
|
|
|
@ -27,7 +27,7 @@ class SiteParser(dict):
|
||||||
baseUrl = ''
|
baseUrl = ''
|
||||||
regex = {}
|
regex = {}
|
||||||
|
|
||||||
def getUrl(self, page):
|
def get_url(self, page):
|
||||||
return "%s%s" % (self.baseUrl, page)
|
return "%s%s" % (self.baseUrl, page)
|
||||||
|
|
||||||
def read_url(self, url, timeout):
|
def read_url(self, url, timeout):
|
||||||
|
@ -35,7 +35,7 @@ class SiteParser(dict):
|
||||||
|
|
||||||
def __init__(self, timeout=-1):
|
def __init__(self, timeout=-1):
|
||||||
for key in self.regex:
|
for key in self.regex:
|
||||||
url = self.getUrl(self.regex[key]['page'])
|
url = self.get_url(self.regex[key]['page'])
|
||||||
data = self.read_url(url, timeout)
|
data = self.read_url(url, timeout)
|
||||||
if isinstance(self.regex[key]['re'], basestring):
|
if isinstance(self.regex[key]['re'], basestring):
|
||||||
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
|
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
|
||||||
|
|
|
@ -9,7 +9,7 @@ from ox.html import decode_html, strip_tags
|
||||||
import ox.net
|
import ox.net
|
||||||
|
|
||||||
|
|
||||||
def getNews(year, month, day):
|
def get_news(year, month, day):
|
||||||
sections = [
|
sections = [
|
||||||
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
|
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
|
||||||
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
|
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
|
||||||
|
@ -27,7 +27,7 @@ def getNews(year, month, day):
|
||||||
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
|
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
|
||||||
dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
|
dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
|
||||||
try:
|
try:
|
||||||
description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
|
description = format_string(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
|
||||||
except:
|
except:
|
||||||
description = ''
|
description = ''
|
||||||
try:
|
try:
|
||||||
|
@ -35,7 +35,7 @@ def getNews(year, month, day):
|
||||||
except:
|
except:
|
||||||
imageUrl = ''
|
imageUrl = ''
|
||||||
try:
|
try:
|
||||||
title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
|
title = format_string(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
|
||||||
except:
|
except:
|
||||||
title = ''
|
title = ''
|
||||||
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
|
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
|
||||||
|
@ -45,12 +45,12 @@ def getNews(year, month, day):
|
||||||
else:
|
else:
|
||||||
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
|
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
|
||||||
# fix decode_html
|
# fix decode_html
|
||||||
# new['description'] = formatString(decode_html(description))
|
# new['description'] = format_string(decode_html(description))
|
||||||
new['description'] = formatString(description)
|
new['description'] = format_string(description)
|
||||||
new['imageUrl'] = imageUrl
|
new['imageUrl'] = imageUrl
|
||||||
new['section'] = formatSection(section)
|
new['section'] = format_section(section)
|
||||||
new['title'] = formatString(title)
|
new['title'] = format_string(title)
|
||||||
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
|
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(format_string(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
|
||||||
if new['title1'][-1:] == ':':
|
if new['title1'][-1:] == ':':
|
||||||
new['title1'] = new['title1'][0:-1]
|
new['title1'] = new['title1'][0:-1]
|
||||||
new['title2'] = new['title'][len(new['title1']) + 2:]
|
new['title2'] = new['title'][len(new['title1']) + 2:]
|
||||||
|
@ -67,21 +67,21 @@ def getNews(year, month, day):
|
||||||
'''
|
'''
|
||||||
return news
|
return news
|
||||||
|
|
||||||
def splitTitle(title):
|
def split_title(title):
|
||||||
title1 = re.compile('(.*?): ').findall(title)[0]
|
title1 = re.compile('(.*?): ').findall(title)[0]
|
||||||
title2 = re.compile(': (.*?)$').findall(title)[0]
|
title2 = re.compile(': (.*?)$').findall(title)[0]
|
||||||
return [title1, title2]
|
return [title1, title2]
|
||||||
|
|
||||||
def formatString(string):
|
def format_string(string):
|
||||||
string = string.replace('<span class="spOptiBreak"> </span>', '')
|
string = string.replace('<span class="spOptiBreak"> </span>', '')
|
||||||
string = string.replace('\n', ' ').replace(' ', ' ').strip()
|
string = string.replace('\n', ' ').replace(' ', ' ').strip()
|
||||||
string = string.replace('&', '&').replace(''', '\'').replace('"', '"')
|
string = string.replace('&', '&').replace(''', '\'').replace('"', '"')
|
||||||
return string
|
return string
|
||||||
|
|
||||||
def formatSection(string):
|
def format_section(string):
|
||||||
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
|
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
|
||||||
|
|
||||||
def formatSubsection(string):
|
def format_subsection(string):
|
||||||
# SPIEGEL, SPIEGEL special
|
# SPIEGEL, SPIEGEL special
|
||||||
subsection = {
|
subsection = {
|
||||||
'abi': 'Abi - und dann?',
|
'abi': 'Abi - und dann?',
|
||||||
|
@ -98,7 +98,7 @@ def formatSubsection(string):
|
||||||
return subsection[string].replace(u'\xc3', 'ae')
|
return subsection[string].replace(u'\xc3', 'ae')
|
||||||
return string[:1].upper() + string[1:]
|
return string[:1].upper() + string[1:]
|
||||||
|
|
||||||
def getIssue(year, week):
|
def get_issue(year, week):
|
||||||
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
|
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
|
||||||
if not ox.net.exists(coverUrl):
|
if not ox.net.exists(coverUrl):
|
||||||
return None
|
return None
|
||||||
|
@ -122,7 +122,7 @@ def getIssue(year, week):
|
||||||
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
|
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
|
||||||
|
|
||||||
|
|
||||||
def archiveIssues():
|
def archive_issues():
|
||||||
'''
|
'''
|
||||||
this is just an example of an archiving application
|
this is just an example of an archiving application
|
||||||
'''
|
'''
|
||||||
|
@ -140,8 +140,8 @@ def archiveIssues():
|
||||||
else:
|
else:
|
||||||
wMax = 53
|
wMax = 53
|
||||||
for w in range(wMax, 0, -1):
|
for w in range(wMax, 0, -1):
|
||||||
print 'getIssue(%d, %d)' % (y, w)
|
print 'get_issue(%d, %d)' % (y, w)
|
||||||
issue = getIssue(y, w)
|
issue = get_issue(y, w)
|
||||||
if issue:
|
if issue:
|
||||||
dirname = '%s/%d/%02d' % (archivePath, y, w)
|
dirname = '%s/%d/%02d' % (archivePath, y, w)
|
||||||
if not os.path.exists(dirname):
|
if not os.path.exists(dirname):
|
||||||
|
@ -188,7 +188,7 @@ def archiveIssues():
|
||||||
print p['min'], p['sum'] / p['num'], p['max']
|
print p['min'], p['sum'] / p['num'], p['max']
|
||||||
|
|
||||||
|
|
||||||
def archiveNews():
|
def archive_news():
|
||||||
'''
|
'''
|
||||||
this is just an example of an archiving application
|
this is just an example of an archiving application
|
||||||
'''
|
'''
|
||||||
|
@ -235,7 +235,7 @@ def archiveNews():
|
||||||
f.close()
|
f.close()
|
||||||
filename = filename[:-5] + '.txt'
|
filename = filename[:-5] + '.txt'
|
||||||
if not os.path.exists(filename) or True:
|
if not os.path.exists(filename) or True:
|
||||||
data = splitTitle(new['title'])
|
data = split_title(new['title'])
|
||||||
data.append(new['description'])
|
data.append(new['description'])
|
||||||
data = '\n'.join(data)
|
data = '\n'.join(data)
|
||||||
f = open(filename, 'w')
|
f = open(filename, 'w')
|
||||||
|
@ -256,19 +256,14 @@ def archiveNews():
|
||||||
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
|
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
|
||||||
else:
|
else:
|
||||||
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
|
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
|
||||||
strings = splitTitle(new['title'])
|
strings = split_title(new['title'])
|
||||||
if strings[0] != new['title1'] or strings[1] != new['title2']:
|
if strings[0] != new['title1'] or strings[1] != new['title2']:
|
||||||
colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
|
colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
|
||||||
for key in sortDictByKey(count):
|
for key in sorted(count):
|
||||||
print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
|
print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
|
||||||
for value in colon:
|
for value in colon:
|
||||||
print value
|
print value
|
||||||
|
|
||||||
def sortDictByKey(d):
|
|
||||||
keys = d.keys()
|
|
||||||
keys.sort()
|
|
||||||
return keys
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# spiegel = Spiegel(2008, 8)
|
# spiegel = Spiegel(2008, 8)
|
||||||
# print spiegel.getContents()
|
# print spiegel.getContents()
|
||||||
|
@ -281,12 +276,12 @@ if __name__ == '__main__':
|
||||||
news = getNews(2008, 2, d)
|
news = getNews(2008, 2, d)
|
||||||
for new in news:
|
for new in news:
|
||||||
strings = new['url'].split('/')
|
strings = new['url'].split('/')
|
||||||
string = formatSection(strings[3])
|
string = format_section(strings[3])
|
||||||
if len(strings) == 6:
|
if len(strings) == 6:
|
||||||
string += '/' + formatSubsection(strings[4])
|
string += '/' + format_subsection(strings[4])
|
||||||
if not string in x:
|
if not string in x:
|
||||||
x.append(string)
|
x.append(string)
|
||||||
print x
|
print x
|
||||||
'''
|
'''
|
||||||
# archiveIssues()
|
# archive_issues()
|
||||||
archiveNews()
|
archive_news()
|
||||||
|
|
|
@ -22,7 +22,9 @@ def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_
|
||||||
headers['Cookie'] = 'language=en_EN'
|
headers['Cookie'] = 'language=en_EN'
|
||||||
return cache.read_url(url, data, headers, timeout, unicode=unicode)
|
return cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||||
|
|
||||||
def findMovies(query, max_results=10):
|
def find_movies(query=None, imdb=None, max_results=10):
|
||||||
|
if imdb:
|
||||||
|
query = "tt" + normalize_imdbid(imdb)
|
||||||
results = []
|
results = []
|
||||||
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
|
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
|
||||||
page_count = 1
|
page_count = 1
|
||||||
|
@ -47,10 +49,7 @@ def findMovies(query, max_results=10):
|
||||||
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
|
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def findMovieByImdb(imdb):
|
def get_id(piratebayId):
|
||||||
return findMovies("tt" + normalize_imdbid(imdb))
|
|
||||||
|
|
||||||
def getId(piratebayId):
|
|
||||||
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
|
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
|
||||||
piratebayId = piratebayId.split('org/')[1]
|
piratebayId = piratebayId.split('org/')[1]
|
||||||
d = find_re(piratebayId, "tor/(\d+)")
|
d = find_re(piratebayId, "tor/(\d+)")
|
||||||
|
@ -62,10 +61,10 @@ def getId(piratebayId):
|
||||||
return piratebayId
|
return piratebayId
|
||||||
|
|
||||||
def exists(piratebayId):
|
def exists(piratebayId):
|
||||||
piratebayId = getId(piratebayId)
|
piratebayId = get_id(piratebayId)
|
||||||
return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
|
return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
|
||||||
|
|
||||||
def getData(piratebayId):
|
def get_data(piratebayId):
|
||||||
_key_map = {
|
_key_map = {
|
||||||
'spoken language(s)': u'language',
|
'spoken language(s)': u'language',
|
||||||
'texted language(s)': u'subtitle language',
|
'texted language(s)': u'subtitle language',
|
||||||
|
@ -73,7 +72,7 @@ def getData(piratebayId):
|
||||||
'leechers': 'leecher',
|
'leechers': 'leecher',
|
||||||
'seeders': 'seeder',
|
'seeders': 'seeder',
|
||||||
}
|
}
|
||||||
piratebayId = getId(piratebayId)
|
piratebayId = get_id(piratebayId)
|
||||||
torrent = dict()
|
torrent = dict()
|
||||||
torrent[u'id'] = piratebayId
|
torrent[u'id'] = piratebayId
|
||||||
torrent[u'domain'] = 'thepiratebay.org'
|
torrent[u'domain'] = 'thepiratebay.org'
|
||||||
|
@ -108,7 +107,7 @@ class Thepiratebay(Torrent):
|
||||||
'4e84415d36ed7b54066160c05a0b0f061898d12b'
|
'4e84415d36ed7b54066160c05a0b0f061898d12b'
|
||||||
'''
|
'''
|
||||||
def __init__(self, piratebayId):
|
def __init__(self, piratebayId):
|
||||||
self.data = getData(piratebayId)
|
self.data = get_data(piratebayId)
|
||||||
if not self.data:
|
if not self.data:
|
||||||
return
|
return
|
||||||
Torrent.__init__(self)
|
Torrent.__init__(self)
|
||||||
|
|
|
@ -7,12 +7,12 @@ from ox import strip_tags, find_re
|
||||||
from ox.cache import read_url
|
from ox.cache import read_url
|
||||||
|
|
||||||
|
|
||||||
def getEpisodeData(url):
|
def get_episode_data(url):
|
||||||
'''
|
'''
|
||||||
parses information on tvcom episode pages
|
parses information on tvcom episode pages
|
||||||
returns dict with title, show, description, score
|
returns dict with title, show, description, score
|
||||||
example:
|
example:
|
||||||
getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
|
get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
|
||||||
'''
|
'''
|
||||||
data = read_url(url, unicode=True)
|
data = read_url(url, unicode=True)
|
||||||
r = {}
|
r = {}
|
||||||
|
|
|
@ -8,7 +8,7 @@ from ox.cache import read_url
|
||||||
from ox import find_string, find_re
|
from ox import find_string, find_re
|
||||||
|
|
||||||
|
|
||||||
def getData(id):
|
def get_data(id):
|
||||||
url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
|
url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
|
||||||
xml = read_url(url)
|
xml = read_url(url)
|
||||||
tree = ET.parse(StringIO(xml))
|
tree = ET.parse(StringIO(xml))
|
||||||
|
|
|
@ -8,52 +8,45 @@ from ox.cache import read_url
|
||||||
from ox import find_re, decode_html
|
from ox import find_re, decode_html
|
||||||
|
|
||||||
|
|
||||||
def getId(url):
|
def get_id(url):
|
||||||
return url.split("/")[-1]
|
return url.split("/")[-1]
|
||||||
|
|
||||||
def getUrl(id):
|
def get_url(id=None, imdb=None, allmovie=None):
    """Return a Wikipedia URL for a page name, or look one up by external id.

    The arguments are checked in this order; the first truthy one wins:

    imdb     - IMDb id; the id is searched for via find() and the first
               hit is returned only if get_movie_data() for that page
               yields an 'imdb_id' entry, otherwise "" is returned.
    allmovie - Allmovie id; searched for as "amg_id = 1:<id>" and the first
               hit is returned, or '' if there is none.
    id       - Wikipedia page name; formatted directly into an
               en.wikipedia.org URL without any lookup.

    NOTE(review): assumes find() returns a sequence whose items carry the
    page URL at index 1 -- confirm against find() further down the module.
    """
    if imdb:
        # Bug fix: this used to reference `imdbId`, the parameter name of the
        # removed getUrlByImdbId(); the parameter is now called `imdb`, so
        # the old name raised NameError on every imdb lookup.
        query = '"%s"' % imdb
        result = find(query)
        if result:
            url = result[0][1]
            # Only trust the hit if the page's film infobox carries an IMDb id.
            data = get_movie_data(url)
            if 'imdb_id' in data:
                return url
        return ""
    if allmovie:
        query = '"amg_id = 1:%s"' % allmovie
        result = find(query)
        if result:
            url = result[0][1]
            return url
        return ''
    return "http://en.wikipedia.org/wiki/%s" % id
|
||||||
|
|
||||||
|
def get_movie_id(title, director='', year=''):
|
||||||
def getMovieId(title, director='', year=''):
|
|
||||||
query = '"%s" film %s %s' % (title, director, year)
|
query = '"%s" film %s %s' % (title, director, year)
|
||||||
result = find(query, 1)
|
result = find(query, 1)
|
||||||
if result:
|
if result:
|
||||||
return result[0][1]
|
return result[0][1]
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getUrlByImdbId(imdbId):
|
def get_wiki_data(wikipedia_url):
|
||||||
query = '"%s"'% imdbId
|
url = wikipedia_url.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
|
||||||
result = find(query)
|
|
||||||
if result:
|
|
||||||
url = result[0][1]
|
|
||||||
data = getMovieData(url)
|
|
||||||
if 'imdb_id' in data:
|
|
||||||
return url
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def getUrlByImdb(imdbId):
|
|
||||||
# deprecated, use getUrlByImdbId()
|
|
||||||
return getUrlByImdbId(imdbId)
|
|
||||||
|
|
||||||
def getUrlByAllmovieId(allmovieId):
|
|
||||||
query = '"amg_id = 1:%s"'% allmovieId
|
|
||||||
result = find(query)
|
|
||||||
if result:
|
|
||||||
url = result[0][1]
|
|
||||||
return url
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def getWikiData(wikipediaUrl):
|
|
||||||
url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
|
|
||||||
url = "%s&action=raw" % url
|
url = "%s&action=raw" % url
|
||||||
data = read_url(url).decode('utf-8')
|
data = read_url(url).decode('utf-8')
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def getMovieData(wikipediaUrl):
|
def get_movie_data(wikipedia_url):
|
||||||
if not wikipediaUrl.startswith('http'):
|
if not wikipedia_url.startswith('http'):
|
||||||
wikipediaUrl = getUrl(wikipediaUrl)
|
wikipedia_url = get_url(wikipedia_url)
|
||||||
data = getWikiData(wikipediaUrl)
|
data = get_wiki_data(wikipedia_url)
|
||||||
filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
|
filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
|
||||||
filmbox = {}
|
filmbox = {}
|
||||||
_box = filmbox_data.strip().split('|')
|
_box = filmbox_data.strip().split('|')
|
||||||
|
@ -104,7 +97,7 @@ def getMovieData(wikipediaUrl):
|
||||||
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
|
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
|
||||||
return filmbox
|
return filmbox
|
||||||
|
|
||||||
def getImageUrl(name):
|
def get_image_url(name):
|
||||||
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
|
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
|
||||||
data = read_url(url, unicode=True)
|
data = read_url(url, unicode=True)
|
||||||
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
|
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
|
||||||
|
@ -114,19 +107,19 @@ def getImageUrl(name):
|
||||||
url = 'http:' + url
|
url = 'http:' + url
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def getPosterUrl(wikipediaUrl):
|
def get_poster_url(wikipedia_url):
|
||||||
if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
|
if not wikipedia_url.startswith('http'): wikipedia_url = get_url(wikipedia_url)
|
||||||
data = getMovieData(wikipediaUrl)
|
data = get_movie_data(wikipedia_url)
|
||||||
if 'image' in data:
|
if 'image' in data:
|
||||||
return getImageUrl(data['image'])
|
return get_image_url(data['image'])
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def getMoviePoster(wikipediaUrl):
|
def get_movie_poster(wikipedia_url):
|
||||||
# deprecated, use getPosterUrl()
|
# deprecated, use get_poster_url()
|
||||||
return getPosterUrl(wikipediaUrl)
|
return get_poster_url(wikipedia_url)
|
||||||
|
|
||||||
def getAllmovieId(wikipediaUrl):
|
def get_allmovie_id(wikipedia_url):
|
||||||
data = getMovieData(wikipediaUrl)
|
data = get_movie_data(wikipedia_url)
|
||||||
return data.get('amg_id', '')
|
return data.get('amg_id', '')
|
||||||
|
|
||||||
def find(query, max_results=10):
|
def find(query, max_results=10):
|
||||||
|
|
|
@ -8,7 +8,7 @@ import feedparser
|
||||||
from ox.cache import read_url, cache_timeout
|
from ox.cache import read_url, cache_timeout
|
||||||
|
|
||||||
|
|
||||||
def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout):
|
def video_url(youtubeId, format='mp4', timeout=cache_timeout):
|
||||||
"""
|
"""
|
||||||
youtubeId - id of video
|
youtubeId - id of video
|
||||||
format - video format, options: webm, 1080p, 720p, mp4, high
|
format - video format, options: webm, 1080p, 720p, mp4, high
|
||||||
|
|
Loading…
Reference in a new issue