ox.web under_score api rewrite

j 2012-08-15 17:15:40 +02:00
parent bb35daa95c
commit a4fd3c930f
29 changed files with 268 additions and 285 deletions
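The rename follows a mostly mechanical rule: every public camelCase name becomes snake_case (getData → get_data, parseXmlDict → parse_xml_dict), with a few names shortened along the way (getMovieIdByTitle → get_movie_by_title) and most *ByImdb/*ById variants folded into keyword arguments. A minimal sketch of the base conversion rule, using only the standard library; the helper name is illustrative and not part of this commit:

    import re

    def underscore(name):
        # insert '_' between a lower-case letter or digit and an upper-case letter, then lower-case
        return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', name).lower()

    assert underscore('getData') == 'get_data'
    assert underscore('parseXmlDict') == 'parse_xml_dict'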

View file

@@ -307,6 +307,8 @@ def parse_movie_path(path):
     title = title.replace('_ ', ': ')
     if title.endswith('_'):
         title = title[:-1] + '.'
+    if title.startswith('_'):
+        title = '.' + title[1:]
     year = find_re(title, '(\(\d{4}\))')
     if not year:
@@ -344,8 +346,9 @@ def parse_movie_path(path):
     else:
         season = None
-    episode = find_re(parts[-1], '\.Episode (\d+)\.')
+    episode = find_re(parts[-1], '\.Episode[s]* ([\d+]+)\.')
     if episode:
+        episode = episode.split('+')[0]
         episode = int(episode)
     else:
         episode = None
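With the widened regex and the added split, multi-episode filenames now resolve to their first episode number. A quick illustration of the new matching (hypothetical filename, plain re in place of ox's find_re):

    import re

    part = 'Title.Episodes 03+04.avi'
    episode = re.search(r'\.Episode[s]* ([\d+]+)\.', part).group(1)  # '03+04'
    episode = int(episode.split('+')[0])  # -> 3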

View file

@@ -7,7 +7,7 @@ from utils import json, ET
 def get_embed_code(url, maxwidth=None, maxheight=None):
     embed = {}
-    header = cache.getHeaders(url)
+    header = cache.get_headers(url)
     if header.get('content-type', '').startswith('text/html'):
         html = cache.readUrl(url)
         json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('<link.*?>').findall(html))

View file

@@ -7,68 +7,68 @@ from ox import strip_tags, find_re
 from ox.cache import read_url
 
-def getId(url):
+def get_id(url):
     return url.split("/")[-1]
 
-def getData(id):
+def get_data(id):
     '''
-    >>> getData('129689')['cast'][1][1]
+    >>> get_data('129689')['cast'][1][1]
     u'Marianne'
-    >>> getData('129689')['credits'][0][0]
+    >>> get_data('129689')['credits'][0][0]
     u'Jean-Luc Godard'
-    >>> getData('129689')['posters'][0]
+    >>> get_data('129689')['posters'][0]
     u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
-    >>> getData('129689')['rating']
+    >>> get_data('129689')['rating']
     u'4.5'
     '''
     if id.startswith('http'):
-        id = getId(id)
+        id = get_id(id)
     data = {
-        "url": getUrl(id)
+        "url": get_url(id)
     }
     html = read_url(data["url"], unicode=True)
-    data['aka'] = parseList(html, 'AKA')
+    data['aka'] = parse_list(html, 'AKA')
     data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
-    data['countries'] = parseList(html, 'countries')
-    data['director'] = parseEntry(html, 'directed by')
-    data['genres'] = parseList(html, 'genres')
-    data['keywords'] = parseList(html, 'keywords')
+    data['countries'] = parse_list(html, 'countries')
+    data['director'] = parse_entry(html, 'directed by')
+    data['genres'] = parse_list(html, 'genres')
+    data['keywords'] = parse_list(html, 'keywords')
     data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
-    data['produced'] = parseList(html, 'produced by')
+    data['produced'] = parse_list(html, 'produced by')
     data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
-    data['released'] = parseEntry(html, 'released by')
-    data['releasedate'] = parseList(html, 'release date')
-    data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
-    data['set'] = parseEntry(html, 'set in')
+    data['released'] = parse_entry(html, 'released by')
+    data['releasedate'] = parse_list(html, 'release date')
+    data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
+    data['set'] = parse_entry(html, 'set in')
     data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
-    data['themes'] = parseList(html, 'themes')
-    data['types'] = parseList(html, 'types')
+    data['themes'] = parse_list(html, 'themes')
+    data['types'] = parse_list(html, 'types')
     data['year'] = find_re(html, '<span class="year">.*?(\d+)')
     #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
     data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
     #html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
-    #data['cast'] = parseTable(html)
+    #data['cast'] = parse_table(html)
     #html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
-    #data['credits'] = parseTable(html)
+    #data['credits'] = parse_table(html)
     html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
     data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
     return data
 
-def getUrl(id):
+def get_url(id):
     return "http://allmovie.com/work/%s" % id
 
-def parseEntry(html, title):
+def parse_entry(html, title):
     html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
     return strip_tags(html).strip()
 
-def parseList(html, title):
+def parse_list(html, title):
     html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
     r = map(lambda x: strip_tags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
     if not r and html:
         r = [strip_tags(html)]
     return r
 
-def parseTable(html):
+def parse_table(html):
     return map(
         lambda x: map(
             lambda x: strip_tags(x).strip().replace('&nbsp;', ''),
@@ -77,10 +77,10 @@ def parseTable(html):
         find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
     )
 
-def parseText(html, title):
+def parse_text(html, title):
     return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
 
 if __name__ == '__main__':
-    print getData('129689')
-    # print getData('177524')
+    print get_data('129689')
+    # print get_data('177524')

View file

@@ -13,17 +13,17 @@ def findISBN(title, author):
     data = read_url(url, unicode=True)
     links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
     id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
-    data = getData(id)
+    data = get_data(id)
     if author in data['authors']:
         return data
     return {}
 
-def getData(id):
+def get_data(id):
     url = "http://www.amazon.com/title/dp/%s/" % id
     data = read_url(url, unicode=True)
 
-    def findData(key):
+    def find_data(key):
         return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
 
     r = {}
@@ -34,15 +34,15 @@ def getData(id):
     t = re.compile('>(.*?)</a> \(Translator\)').findall(data)
     if t:
         r['translator'] = t
-    r['publisher'] = findData('Publisher')
-    r['language'] = findData('Language')
-    r['isbn-10'] = findData('ISBN-10')
-    r['isbn-13'] = findData('ISBN-13').replace('-', '')
+    r['publisher'] = find_data('Publisher')
+    r['language'] = find_data('Language')
+    r['isbn-10'] = find_data('ISBN-10')
+    r['isbn-13'] = find_data('ISBN-13').replace('-', '')
     r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
-    r['pages'] = findData('Paperback')
+    r['pages'] = find_data('Paperback')
     if not r['pages']:
-        r['pages'] = findData('Hardcover')
+        r['pages'] = find_data('Hardcover')
     r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()

View file

@@ -14,7 +14,7 @@ HEADERS = {
 USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
 USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
 
-def getMovieData(title, director):
+def get_movie_data(title, director):
     if isinstance(title, unicode):
         title = title.encode('utf-8')
     if isinstance(director, unicode):
@@ -60,8 +60,8 @@ def getMovieData(title, director):
     return data
 
 if __name__ == '__main__':
-    print getMovieData('Alphaville', 'Jean-Luc Godard')
-    print getMovieData('Sin City', 'Roberto Rodriguez')
-    print getMovieData('Breathless', 'Jean-Luc Godard')
-    print getMovieData('Capitalism: A Love Story', 'Michael Moore')
-    print getMovieData('Film Socialisme', 'Jean-Luc Godard')
+    print get_movie_data('Alphaville', 'Jean-Luc Godard')
+    print get_movie_data('Sin City', 'Roberto Rodriguez')
+    print get_movie_data('Breathless', 'Jean-Luc Godard')
+    print get_movie_data('Capitalism: A Love Story', 'Michael Moore')
+    print get_movie_data('Film Socialisme', 'Jean-Luc Godard')

View file

@@ -3,15 +3,15 @@
 from .. import cache
 from ..utils import json
 
-def getId(url):
+def get_id(url):
     return url.split("/")[-1]
 
-def getUrl(id):
+def get_url(id):
     return "http://www.archive.org/details/%s" % id
 
-def getData(id):
+def get_data(id):
     data = {}
-    url = getUrl(id)
+    url = get_url(id)
     details = cache.read_url('%s?output=json' % url)
     details = json.loads(details)
     for key in ('title', 'description', 'runtime'):

View file

@@ -9,25 +9,25 @@ from ox.text import find_re, remove_special_characters
 import imdb
 
-def getId(url):
+def get_id(url):
     return url.split("/")[-1]
 
-def getUrl(id):
+def get_url(id):
     return "http://www.criterion.com/films/%s" % id
 
-def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
+def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
     '''
-    >>> getData('1333')['imdbId']
+    >>> get_data('1333')['imdbId']
     u'0060304'
-    >>> getData('236')['posters'][0]
+    >>> get_data('236')['posters'][0]
     u'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg'
-    >>> getData('786')['posters'][0]
+    >>> get_data('786')['posters'][0]
     u'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg'
     '''
     data = {
-        "url": getUrl(id)
+        "url": get_url(id)
     }
     try:
         html = read_url(data["url"], timeout=timeout, unicode=True)
@@ -71,21 +71,21 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False):
     if timeout == ox.cache.cache_timeout:
         timeout = -1
     if get_imdb:
-        data['imdbId'] = imdb.getMovieId(data['title'],
+        data['imdbId'] = imdb.get_movie_id(data['title'],
             data['director'], data['year'], timeout=timeout)
     return data
 
-def getIds():
+def get_ids():
     ids = []
     html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
     results = re.compile("\&amp;p=(\d+)\&").findall(html)
     pages = max(map(int, results))
     for page in range(1, pages):
-        for id in getIdsByPage(page):
+        for id in get_idsByPage(page):
             ids.append(id)
     return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids))))
 
-def getIdsByPage(page):
+def get_idsByPage(page):
     ids = []
     url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
     html = read_url(url, unicode=True)
@@ -101,4 +101,4 @@ def getIdsByPage(page):
     return set(ids)
 
 if __name__ == '__main__':
-    print getIds()
+    print get_ids()

View file

@@ -5,7 +5,7 @@ from urllib import unquote
 from ox.cache import read_url
 
-def getVideoUrl(url):
+def get_video_url(url):
     '''
     >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
     'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'

View file

@@ -9,7 +9,7 @@ from ox.cache import read_url
 import google
 
-def getShowUrl(title):
+def get_show_url(title):
     '''
     Search Epguide Url for Show via Show Title.
     Use Google to search the url, this is also done on Epguide.
@@ -20,7 +20,7 @@ def getShowUrl(title):
         return url
     return None
 
-def getShowData(url):
+def get_show_data(url):
     data = read_url(url, unicode=True)
     r = {}
     r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))

View file

@@ -9,28 +9,28 @@ from ox import find_re, strip_tags
 from ox.web.imdb import ImdbCombined
 
-def getData(id, timeout=-1):
+def get_data(id, timeout=-1):
     '''
-    >>> getData('the-matrix')['poster']
+    >>> get_data('the-matrix')['poster']
     'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
-    >>> getData('0133093')['poster']
+    >>> get_data('0133093')['poster']
     'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
-    >>> getData('2-or-3-things-i-know-about-her')['poster']
+    >>> get_data('2-or-3-things-i-know-about-her')['poster']
     'http://content6.flixster.com/movie/10/95/43/10954392_gal.jpg'
-    >>> getData('0078875')['rottentomatoes_id']
+    >>> get_data('0078875')['rottentomatoes_id']
     'http://www.rottentomatoes.com/m/the-tin-drum/'
     '''
     if len(id) == 7:
         try:
             int(id)
-            id = getIdByImdb(id)
+            id = get_id(imdb=id)
         except:
             pass
     data = {
-        "url": getUrl(id),
+        "url": get_url(id),
     }
     html = read_url(data['url'], timeout=timeout, unicode=True)
     doc = document_fromstring(html)
@@ -55,21 +55,20 @@ def getData(id, timeout=-1):
         return None
     return data
 
-def getIdByImdb(imdbId):
+def get_id(url=None, imdb=None):
     '''
-    >>> getIdByImdb('0133093')
+    >>> get_id(imdb='0133093')
     u'the-matrix'
 
-    #>>> getIdByImdb('0060304')
+    #>>> get_id(imdb='0060304')
     #u'2-or-3-things-i-know-about-her'
     '''
-    i = ImdbCombined(imdbId)
-    title = i['title']
-    return title.replace(' ', '-').lower().replace("'", '')
-
-def getId(url):
+    if imdb:
+        i = ImdbCombined(imdb)
+        title = i['title']
+        return title.replace(' ', '-').lower().replace("'", '')
     return url.split('/')[-1]
 
-def getUrl(id):
+def get_url(id):
     return "http://www.flixster.com/movie/%s"%id

View file

@@ -5,7 +5,7 @@ import json
 from ox.cache import read_url
 from ox import find_re
 
-class Imdb(dict):
+class Freebase(dict):
     def __init__(self, id, timeout=-1):
         url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id
         '''

View file

@@ -20,7 +20,7 @@ def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.
     headers = headers.copy()
     return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
 
-def getUrl(id):
+def get_url(id):
     return "http://www.imdb.com/title/tt%s/" % id
 
 class Imdb(SiteParser):
@@ -420,7 +420,7 @@ class ImdbCombined(Imdb):
         self.regex = _regex
         super(ImdbCombined, self).__init__(id, timeout)
 
-def getMovieIdByTitle(title, timeout=-1):
+def get_movie_by_title(title, timeout=-1):
     '''
     This only works for exact title matches from the data dump
     Usually in the format
@@ -431,22 +431,22 @@ def getMovieIdByTitle(title, timeout=-1):
     If there is more than one film with that title for the year
     Title (Year/I)
 
-    >>> getMovieIdByTitle(u'"Father Knows Best" (1954) {(#5.34)}')
+    >>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')
     u'1602860'
 
-    >>> getMovieIdByTitle(u'The Matrix (1999)')
+    >>> get_movie_by_title(u'The Matrix (1999)')
     u'0133093'
 
-    >>> getMovieIdByTitle(u'Little Egypt (1951)')
+    >>> get_movie_by_title(u'Little Egypt (1951)')
     u'0043748'
 
-    >>> getMovieIdByTitle(u'Little Egypt (1897/I)')
+    >>> get_movie_by_title(u'Little Egypt (1897/I)')
     u'0214882'
 
-    >>> getMovieIdByTitle(u'Little Egypt')
+    >>> get_movie_by_title(u'Little Egypt')
     None
 
-    >>> getMovieIdByTitle(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
+    >>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
     u'0866567'
     '''
     params = {'s':'tt','q': title}
@@ -465,21 +465,21 @@ def getMovieIdByTitle(title, timeout=-1):
         return results[0]
     return None
 
-def getMovieId(title, director='', year='', timeout=-1):
+def get_movie_id(title, director='', year='', timeout=-1):
     '''
-    >>> getMovieId('The Matrix')
+    >>> get_movie_id('The Matrix')
     u'0133093'
 
-    >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
+    >>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
     u'0060304'
 
-    >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
+    >>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
     u'0060304'
 
-    >>> getMovieId(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
+    >>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
     u'0179214'
 
-    >>> getMovieId(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
+    >>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
     u'0179214'
     '''
     imdbId = {
@@ -555,12 +555,12 @@ def getMovieId(title, director='', year='', timeout=-1):
     #or nothing
     return ''
 
-def getMoviePoster(imdbId):
+def get_movie_poster(imdbId):
     '''
-    >>> getMoviePoster('0133093')
+    >>> get_movie_poster('0133093')
     'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
 
-    >>> getMoviePoster('0994352')
+    >>> get_movie_poster('0994352')
     'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
     '''
     info = ImdbCombined(imdbId)
@@ -570,10 +570,10 @@ def getMoviePoster(imdbId):
         poster = find_re(data, 'img id="primary-img".*?src="(.*?)"')
         return poster
     elif 'series' in info:
-        return getMoviePoster(info['series'])
+        return get_movie_poster(info['series'])
     return ''
 
-def maxVotes():
+def max_votes():
     url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
     data = ox.cache.read_url(url)
     votes = max([int(v.replace(',', ''))
@@ -581,7 +581,7 @@ def maxVotes():
     return votes
 
 def guess(title, director='', timeout=-1):
-    return getMovieId(title, director, timeout=timeout)
+    return get_movie_id(title, director, timeout=timeout)
 
 if __name__ == "__main__":
     import json

View file

@@ -7,19 +7,19 @@ from ox.html import strip_tags
 from ox.text import find_re
 
-def getData(id):
+def get_data(id):
     '''
-    >>> getData('1991/silence_of_the_lambs')['imdbId']
+    >>> get_data('1991/silence_of_the_lambs')['imdbId']
     u'0102926'
 
-    >>> getData('1991/silence_of_the_lambs')['posters'][0]
+    >>> get_data('1991/silence_of_the_lambs')['posters'][0]
     u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
 
-    >>> getData('1991/silence_of_the_lambs')['url']
+    >>> get_data('1991/silence_of_the_lambs')['url']
     u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
     '''
     data = {
-        'url': getUrl(id)
+        'url': get_url(id)
     }
     html = read_url(data['url'], unicode=True)
     data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
@@ -48,7 +48,7 @@ def getData(id):
     return data
 
-def getId(url):
+def get_id(url):
     split = url.split('/')
     year = split[3]
     split = split[4][:-5].split('_')
@@ -59,26 +59,25 @@ def getId(url):
     id = '%s/%s' % (year, '_'.join(split))
     return id
 
-def getIds():
+def get_ids(page=None):
     ids = []
+    if page:
+        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
+        results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
+        for result in results:
+            url = 'http://impawards.com/%s' % result
+            ids.append(get_id(url))
+        return set(ids)
+    #get all
     html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
     pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
     for page in range(pages, 0, -1):
-        for id in getIdsByPage(page):
+        for id in get_ids(page):
             if not id in ids:
                 ids.append(id)
     return ids
 
-def getIdsByPage(page):
-    ids = []
-    html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
-    results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
-    for result in results:
-        url = 'http://impawards.com/%s' % result
-        ids.append(getId(url))
-    return set(ids)
-
-def getUrl(id):
+def get_url(id):
     url = u"http://www.impawards.com/%s.html" % id
     html = read_url(url, unicode=True)
     if find_re(html, "No Movie Posters on This Page"):
@@ -297,5 +296,5 @@ _id_map = {
 }
 
 if __name__ == '__main__':
-    ids = getIds()
+    ids = get_ids()
     print sorted(ids), len(ids)

View file

@@ -24,7 +24,7 @@ ITUNES_HEADERS = {
     'Connection': 'close',
 }
 
-def composeUrl(request, parameters):
+def compose_url(request, parameters):
     if request == 'advancedSearch':
         url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
         if parameters['media'] == 'music':
@@ -60,7 +60,7 @@ def composeUrl(request, parameters):
         url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
     return url
 
-def parseXmlDict(xml):
+def parse_xml_dict(xml):
     values = {}
     strings = xml.split('<key>')
     for string in strings:
@@ -78,7 +78,7 @@ def parseXmlDict(xml):
             values[key] = value
     return values
 
-def parseCast(xml, title):
+def parse_cast(xml, title):
     list = []
     try:
         strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
@@ -89,7 +89,7 @@ def parseCast(xml, title):
     except:
         return list
 
-def parseMovies(xml, title):
+def parse_movies(xml, title):
     list = []
     try:
         strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
@@ -109,17 +109,17 @@ class ItunesAlbum:
         self.title = title
         self.artist = artist
         if not id:
-            self.id = self.getId()
+            self.id = self.get_id()
 
-    def getId(self):
-        url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
+    def get_id(self):
+        url = compose_url('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
         xml = read_url(url, headers = ITUNES_HEADERS)
         id = find_re(xml, 'viewAlbum\?id=(.*?)&')
         return id
 
-    def getData(self):
+    def get_data(self):
         data = {'id': self.id}
-        url = composeUrl('viewAlbum', {'id': self.id})
+        url = compose_url('viewAlbum', {'id': self.id})
         xml = read_url(url, None, ITUNES_HEADERS)
         data['albumName'] = find_re(xml, '<B>(.*?)</B>')
         data['artistName'] = find_re(xml, '<b>(.*?)</b>')
@@ -130,7 +130,7 @@ class ItunesAlbum:
         data['tracks'] = []
         strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
         for string in strings:
-            data['tracks'].append(parseXmlDict(string))
+            data['tracks'].append(parse_xml_dict(string))
         data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
         return data
 
@@ -140,48 +140,48 @@ class ItunesMovie:
         self.title = title
         self.director = director
         if not id:
-            self.id = self.getId()
+            self.id = self.get_id()
 
-    def getId(self):
-        url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
+    def get_id(self):
+        url = compose_url('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
         xml = read_url(url, headers = ITUNES_HEADERS)
         id = find_re(xml, 'viewMovie\?id=(.*?)&')
         return id
 
-    def getData(self):
+    def get_data(self):
         data = {'id': self.id}
-        url = composeUrl('viewMovie', {'id': self.id})
+        url = compose_url('viewMovie', {'id': self.id})
         xml = read_url(url, None, ITUNES_HEADERS)
         f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
         f.write(xml)
         f.close()
-        data['actors'] = parseCast(xml, 'actors')
+        data['actors'] = parse_cast(xml, 'actors')
         string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
         data['averageRating'] = string.count('rating_star_000033.png') + string.count('&#189;') * 0.5
-        data['directors'] = parseCast(xml, 'directors')
+        data['directors'] = parse_cast(xml, 'directors')
         data['format'] = find_re(xml, 'Format:(.*?)<')
         data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
         data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
         data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
-        data['producers'] = parseCast(xml, 'producers')
+        data['producers'] = parse_cast(xml, 'producers')
         data['rated'] = find_re(xml, 'Rated(.*?)<')
-        data['relatedMovies'] = parseMovies(xml, 'related movies')
+        data['relatedMovies'] = parse_movies(xml, 'related movies')
         data['releaseDate'] = find_re(xml, 'Released(.*?)<')
         data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
-        data['screenwriters'] = parseCast(xml, 'screenwriters')
+        data['screenwriters'] = parse_cast(xml, 'screenwriters')
         data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
         data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
         return data
 
 if __name__ == '__main__':
     from ox.utils import json
-    data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
+    data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').get_data()
     print json.dumps(data, sort_keys = True, indent = 4)
-    data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
+    data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').get_data()
     print json.dumps(data, sort_keys = True, indent = 4)
     for v in data['relatedMovies']:
-        data = ItunesMovie(id = v['id']).getData()
+        data = ItunesMovie(id = v['id']).get_data()
         print json.dumps(data, sort_keys = True, indent = 4)
-    data = ItunesMovie(id='272960052').getData()
+    data = ItunesMovie(id='272960052').get_data()
     print json.dumps(data, sort_keys = True, indent = 4)

View file

@@ -5,7 +5,7 @@ from ox.html import decode_html
 from ox.text import find_re
 
-def getLyrics(title, artist):
+def get_lyrics(title, artist):
     html = read_url('http://lyricsfly.com/api/')
     key = find_re(html, '<font color=green><b>(.*?)</b></font>')
     url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)

View file

@@ -7,25 +7,24 @@ from lxml.html import document_fromstring
 from ox.cache import read_url
 from ox import find_re, strip_tags
 
-def getUrl(id):
+def get_url(id=None, imdb=None):
+    if imdb:
+        url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
+        data = read_url(url)
+        metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
+        return metacritic_url or None
     return 'http://www.metacritic.com/movie/%s' % id
 
-def getId(url):
+def get_id(url):
     return url.split('/')[-1]
 
-def getUrlByImdb(imdb):
-    url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
-    data = read_url(url)
-    metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
-    return metacritic_url or None
-
-def getMetacriticShowUrl(title):
+def get_show_url(title):
     title = quote(title)
     url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
     data = read_url(url)
     return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
 
-def getData(url):
+def get_data(url):
     data = read_url(url, unicode=True)
     doc = document_fromstring(data)
     score = filter(lambda s: s.attrib.get('property') == 'v:average',
@@ -57,7 +56,7 @@ def getData(url):
     return {
         'critics': metacritics,
-        'id': getId(url),
+        'id': get_id(url),
         'score': score,
         'url': url,
     }
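Here getUrl and getUrlByImdb collapse the same way; the imdb branch still returns None when the criticreviews page yields no metacritic link. A hedged usage sketch (module path assumed as ox.web.metacritic):

    from ox.web import metacritic

    metacritic.get_url('the-matrix')     # was: getUrl(id)
    metacritic.get_url(imdb='0133093')   # was: getUrlByImdb('0133093'); may return None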

View file

@@ -13,7 +13,7 @@ import ox
 from torrent import Torrent
 
-def _parseResultsPage(data, max_results=10):
+def _parse_results_page(data, max_results=10):
     results=[]
     regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
     for row in re.compile(regexp, re.DOTALL).findall(data):
@@ -27,22 +27,17 @@ def _parseResultsPage(data, max_results=10):
             results.append((torrentTitle, torrentLink, ''))
     return results
 
-def findMovie(query, max_results=10):
+def find_movie(query=None, imdb=None, max_results=10):
     '''search for torrents on mininova
     '''
-    url = "http://www.mininova.org/search/%s/seeds" % quote(query)
+    if imdb:
+        url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
+    else:
+        url = "http://www.mininova.org/search/%s/seeds" % quote(query)
     data = read_url(url, unicode=True)
-    return _parseResultsPage(data, max_results)
-
-def findMovieByImdb(imdbId):
-    '''find torrents on mininova for a given imdb id
-    '''
-    results = []
-    imdbId = normalize_imdbid(imdbId)
-    data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True)
-    return _parseResultsPage(data)
-
-def getId(mininovaId):
+    return _parse_results_page(data, max_results)
+
+def get_id(mininovaId):
     mininovaId = unicode(mininovaId)
     d = find_re(mininovaId, "/(\d+)")
     if d:
@@ -54,7 +49,7 @@ def getId(mininovaId):
         return mininovaId[-1]
 
 def exists(mininovaId):
-    mininovaId = getId(mininovaId)
+    mininovaId = get_id(mininovaId)
     data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
     if not data or 'Torrent not found...' in data:
         return False
@@ -62,11 +57,11 @@ def exists(mininovaId):
         return False
     return True
 
-def getData(mininovaId):
+def get_data(mininovaId):
     _key_map = {
         'by': u'uploader',
     }
-    mininovaId = getId(mininovaId)
+    mininovaId = get_id(mininovaId)
     torrent = dict()
     torrent[u'id'] = mininovaId
     torrent[u'domain'] = 'mininova.org'
@@ -101,7 +96,7 @@ class Mininova(Torrent):
         '72dfa59d2338e4a48c78cec9de25964cddb64104'
     '''
     def __init__(self, mininovaId):
-        self.data = getData(mininovaId)
+        self.data = get_data(mininovaId)
        if not self.data:
            return
        Torrent.__init__(self)
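findMovie and findMovieByImdb likewise become one find_movie; when imdb is given, the query argument is ignored. A sketch of the migrated calls (module path assumed, results depend on the live site):

    from ox.web import mininova

    torrents = mininova.find_movie(query='Alphaville')  # was: findMovie('Alphaville')
    torrents = mininova.find_movie(imdb='0058898')      # was: findMovieByImdb('0058898')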

View file

@@ -6,39 +6,39 @@ import re
 from ox.cache import read_url
 from ox import find_re
 
-def getData(id):
+def get_data(id):
     '''
-    >>> getData('0060304')['posters'][0]
+    >>> get_data('0060304')['posters'][0]
     u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
-    >>> getData('0123456')['posters']
+    >>> get_data('0123456')['posters']
     []
     '''
     data = {
-        "url": getUrl(id)
+        "url": get_url(id)
     }
-    data["posters"] = getPostersByUrl(data["url"])
+    data["posters"] = get_posters(data["url"])
     return data
 
-def getId(url):
+def get_id(url):
     return url.split("/")[-2]
 
-def getPostersByUrl(url, group=True, timeout=-1):
+def get_posters(url, group=True, timeout=-1):
     posters = []
     html = read_url(url, timeout=timeout, unicode=True)
     if url in html:
         if group:
             results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
             for result in results:
-                posters += getPostersByUrl(result, False)
+                posters += get_posters(result, False)
         results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
         for result in results:
             html = read_url(result, timeout=timeout, unicode=True)
             posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
     return posters
 
-def getUrl(id):
+def get_url(id):
     return "http://www.movieposterdb.com/movie/%s/" % id
 
 if __name__ == '__main__':
-    print getData('0060304')
-    print getData('0133093')
+    print get_data('0060304')
+    print get_data('0133093')

View file

@@ -7,7 +7,7 @@ from ox.cache import read_url
 from ox import find_re, strip_tags
 from ox import langCode2To3, langTo3Code
 
-def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
+def find_subtitles(imdb, parts = 1, language = "eng"):
     if len(language) == 2:
         language = langCode2To3(language)
     elif len(language) != 3:
@@ -29,7 +29,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
         opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
     return opensubtitleId
 
-def downloadSubtitleById(opensubtitle_id):
+def download_subtitle(opensubtitle_id):
     srts = {}
     data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
     reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
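The two subtitle helpers keep their behavior under the new names: find_subtitles returns an opensubtitles id (or an empty value) for an imdb id, and download_subtitle fetches the subtitle files for it. A hedged end-to-end sketch (module path assumed):

    from ox.web import opensubtitles

    opensubtitle_id = opensubtitles.find_subtitles('0133093', language='eng')
    if opensubtitle_id:
        srts = opensubtitles.download_subtitle(opensubtitle_id)  # dict of filename -> srt data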

View file

@@ -2,7 +2,7 @@
 # vi:si:et:sw=4:sts=4:ts=4
 import ox.cache
 
-def getPosterUrl(id):
+def get_poster_url(id):
     url = "http://0xdb.org/%s/poster.0xdb.jpg" % id
     if ox.cache.exists(url):
         return url

View file

@@ -3,7 +3,7 @@
 import re
 from ox.net import read_url
 
-def getPosterUrl(id):
+def get_poster_url(id):
     url = 'http://piratecinema.org/posters/'
     html = read_url(url, unicode=True)
     results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
@@ -13,5 +13,5 @@ def getPosterUrl(id):
     return ''
 
 if __name__ == '__main__':
-    print getPosterUrl('0749451')
+    print get_poster_url('0749451')

View file

@@ -2,29 +2,30 @@
 # vi:si:et:sw=4:sts=4:ts=4
 import re
 
-from ox.cache import getHeaders, read_url
+from ox.cache import read_url
 from ox import find_re, strip_tags
 
-def getUrlByImdb(imdb):
+def get_url(id=None, imdb=None):
     #this would also wor but does not cache:
     '''
     from urllib2 import urlopen
     u = urlopen(url)
     return u.url
     '''
-    url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
-    data = read_url(url)
-    if "movie_title" in data:
-        movies = re.compile('(/m/.*?/)').findall(data)
-        if movies:
-            return "http://www.rottentomatoes.com" + movies[0]
+    if imdb:
+        url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
+        data = read_url(url)
+        if "movie_title" in data:
+            movies = re.compile('(/m/.*?/)').findall(data)
+            if movies:
+                return "http://www.rottentomatoes.com" + movies[0]
     return None
 
 def get_og(data, key):
     return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
 
-def getData(url):
+def get_data(url):
     data = read_url(url)
     r = {}
     r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')

View file

@@ -27,7 +27,7 @@ class SiteParser(dict):
     baseUrl = ''
     regex = {}
 
-    def getUrl(self, page):
+    def get_url(self, page):
         return "%s%s" % (self.baseUrl, page)
 
     def read_url(self, url, timeout):
@@ -35,7 +35,7 @@ class SiteParser(dict):
 
     def __init__(self, timeout=-1):
         for key in self.regex:
-            url = self.getUrl(self.regex[key]['page'])
+            url = self.get_url(self.regex[key]['page'])
             data = self.read_url(url, timeout)
             if isinstance(self.regex[key]['re'], basestring):
                 data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)

View file

@@ -9,7 +9,7 @@ from ox.html import decode_html, strip_tags
 import ox.net
 
-def getNews(year, month, day):
+def get_news(year, month, day):
     sections = [
         'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
         'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
@@ -27,7 +27,7 @@ def getNews(year, month, day):
     for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
         dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
         try:
-            description = formatString(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
+            description = format_string(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
         except:
             description = ''
         try:
@@ -35,7 +35,7 @@ def getNews(year, month, day):
         except:
             imageUrl = ''
         try:
-            title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
+            title = format_string(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
         except:
             title = ''
         if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
@@ -45,12 +45,12 @@ def getNews(year, month, day):
             else:
                 new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
             # fix decode_html
-            # new['description'] = formatString(decode_html(description))
-            new['description'] = formatString(description)
+            # new['description'] = format_string(decode_html(description))
+            new['description'] = format_string(description)
             new['imageUrl'] = imageUrl
-            new['section'] = formatSection(section)
-            new['title'] = formatString(title)
-            new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
+            new['section'] = format_section(section)
+            new['title'] = format_string(title)
+            new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(format_string(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
             if new['title1'][-1:] == ':':
                 new['title1'] = new['title1'][0:-1]
             new['title2'] = new['title'][len(new['title1']) + 2:]
@@ -67,21 +67,21 @@ def getNews(year, month, day):
     '''
     return news
 
-def splitTitle(title):
+def split_title(title):
     title1 = re.compile('(.*?): ').findall(title)[0]
     title2 = re.compile(': (.*?)$').findall(title)[0]
     return [title1, title2]
 
-def formatString(string):
+def format_string(string):
     string = string.replace('<span class="spOptiBreak"> </span>', '')
     string = string.replace('\n', ' ').replace('  ', ' ').strip()
     string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
     return string
 
-def formatSection(string):
+def format_section(string):
     return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
 
-def formatSubsection(string):
+def format_subsection(string):
     # SPIEGEL, SPIEGEL special
     subsection = {
         'abi': 'Abi - und dann?',
@@ -98,7 +98,7 @@ def formatSubsection(string):
         return subsection[string].replace(u'\xc3', 'ae')
     return string[:1].upper() + string[1:]
 
-def getIssue(year, week):
+def get_issue(year, week):
     coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
     if not ox.net.exists(coverUrl):
         return None
@@ -122,7 +122,7 @@ def getIssue(year, week):
     return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
 
-def archiveIssues():
+def archive_issues():
     '''
     this is just an example of an archiving application
     '''
@@ -140,8 +140,8 @@ def archiveIssues():
         else:
             wMax = 53
         for w in range(wMax, 0, -1):
-            print 'getIssue(%d, %d)' % (y, w)
-            issue = getIssue(y, w)
+            print 'get_issue(%d, %d)' % (y, w)
+            issue = get_issue(y, w)
             if issue:
                 dirname = '%s/%d/%02d' % (archivePath, y, w)
                 if not os.path.exists(dirname):
@@ -188,7 +188,7 @@ def archiveIssues():
                 print p['min'], p['sum'] / p['num'], p['max']
 
-def archiveNews():
+def archive_news():
     '''
     this is just an example of an archiving application
     '''
@@ -235,7 +235,7 @@ def archiveNews():
                 f.close()
             filename = filename[:-5] + '.txt'
             if not os.path.exists(filename) or True:
-                data = splitTitle(new['title'])
+                data = split_title(new['title'])
                 data.append(new['description'])
                 data = '\n'.join(data)
                 f = open(filename, 'w')
@@ -256,19 +256,14 @@ def archiveNews():
                 count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
             else:
                 count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
-            strings = splitTitle(new['title'])
+            strings = split_title(new['title'])
             if strings[0] != new['title1'] or strings[1] != new['title2']:
                 colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
-    for key in sortDictByKey(count):
+    for key in sorted(count):
         print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string'])
     for value in colon:
         print value
 
-def sortDictByKey(d):
-    keys = d.keys()
-    keys.sort()
-    return keys
-
 if __name__ == '__main__':
     # spiegel = Spiegel(2008, 8)
     # print spiegel.getContents()
@@ -281,12 +276,12 @@ if __name__ == '__main__':
     news = getNews(2008, 2, d)
     for new in news:
         strings = new['url'].split('/')
-        string = formatSection(strings[3])
+        string = format_section(strings[3])
         if len(strings) == 6:
-            string += '/' + formatSubsection(strings[4])
+            string += '/' + format_subsection(strings[4])
         if not string in x:
            x.append(string)
     print x
     '''
-    # archiveIssues()
-    archiveNews()
+    # archive_issues()
+    archive_news()
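Besides the renames, sortDictByKey is dropped entirely: it only existed because Python 2's list.sort() sorts in place and returns None, while the built-in sorted() already returns the keys in order. The replacement is equivalent:

    count = {'b': 2, 'a': 1}
    for key in sorted(count):   # same order sortDictByKey(count) produced
        print key, count[key]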

View file

@@ -22,7 +22,9 @@ def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_
         headers['Cookie'] = 'language=en_EN'
     return cache.read_url(url, data, headers, timeout, unicode=unicode)
 
-def findMovies(query, max_results=10):
+def find_movies(query=None, imdb=None, max_results=10):
+    if imdb:
+        query = "tt" + normalize_imdbid(imdb)
     results = []
     next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
     page_count = 1
@@ -47,10 +49,7 @@ def findMovies(query, max_results=10):
             next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
     return results
 
-def findMovieByImdb(imdb):
-    return findMovies("tt" + normalize_imdbid(imdb))
-
-def getId(piratebayId):
+def get_id(piratebayId):
     if piratebayId.startswith('http://torrents.thepiratebay.org/'):
         piratebayId = piratebayId.split('org/')[1]
     d = find_re(piratebayId, "tor/(\d+)")
@@ -62,10 +61,10 @@ def getId(piratebayId):
     return piratebayId
 
 def exists(piratebayId):
-    piratebayId = getId(piratebayId)
+    piratebayId = get_id(piratebayId)
     return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
 
-def getData(piratebayId):
+def get_data(piratebayId):
     _key_map = {
         'spoken language(s)': u'language',
         'texted language(s)': u'subtitle language',
@@ -73,7 +72,7 @@ def getData(piratebayId):
         'leechers': 'leecher',
         'seeders': 'seeder',
     }
-    piratebayId = getId(piratebayId)
+    piratebayId = get_id(piratebayId)
     torrent = dict()
     torrent[u'id'] = piratebayId
     torrent[u'domain'] = 'thepiratebay.org'
@@ -108,7 +107,7 @@ class Thepiratebay(Torrent):
         '4e84415d36ed7b54066160c05a0b0f061898d12b'
     '''
     def __init__(self, piratebayId):
-        self.data = getData(piratebayId)
+        self.data = get_data(piratebayId)
         if not self.data:
             return
         Torrent.__init__(self)

View file

@@ -7,12 +7,12 @@ from ox import strip_tags, find_re
 from ox.cache import read_url
 
-def getEpisodeData(url):
+def get_episode_data(url):
     '''
     prases informatin on tvcom episode pages
     returns dict with title, show, description, score
 
     example:
-        getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
+        get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
     '''
     data = read_url(url, unicode=True)
     r = {}
     r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))

View file

@@ -8,7 +8,7 @@ from ox.cache import read_url
 from ox import find_string, find_re
 
-def getData(id):
+def get_data(id):
     url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
     xml = read_url(url)
     tree = ET.parse(StringIO(xml))

View file

@@ -8,52 +8,45 @@ from ox.cache import read_url
 from ox import find_re, decode_html
 
-def getId(url):
+def get_id(url):
     return url.split("/")[-1]
 
-def getUrl(id):
+def get_url(id=None, imdb=None, allmovie=None):
+    if imdb:
+        query = '"%s"'% imdbId
+        result = find(query)
+        if result:
+            url = result[0][1]
+            data = get_movie_data(url)
+            if 'imdb_id' in data:
+                return url
+        return ""
+    if allmovie:
+        query = '"amg_id = 1:%s"'% allmovie
+        result = find(query)
+        if result:
+            url = result[0][1]
+            return url
+        return ''
     return "http://en.wikipedia.org/wiki/%s" % id
 
-def getMovieId(title, director='', year=''):
+def get_movie_id(title, director='', year=''):
     query = '"%s" film %s %s' % (title, director, year)
     result = find(query, 1)
     if result:
         return result[0][1]
     return ''
 
-def getUrlByImdbId(imdbId):
-    query = '"%s"'% imdbId
-    result = find(query)
-    if result:
-        url = result[0][1]
-        data = getMovieData(url)
-        if 'imdb_id' in data:
-            return url
-    return ""
-
-def getUrlByImdb(imdbId):
-    # deprecated, use getUrlByImdbId()
-    return getUrlByImdbId(imdbId)
-
-def getUrlByAllmovieId(allmovieId):
-    query = '"amg_id = 1:%s"'% allmovieId
-    result = find(query)
-    if result:
-        url = result[0][1]
-        return url
-    return ''
-
-def getWikiData(wikipediaUrl):
-    url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
+def get_wiki_data(wikipedia_url):
+    url = wikipedia_url.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
     url = "%s&action=raw" % url
     data = read_url(url).decode('utf-8')
     return data
 
-def getMovieData(wikipediaUrl):
-    if not wikipediaUrl.startswith('http'):
-        wikipediaUrl = getUrl(wikipediaUrl)
-    data = getWikiData(wikipediaUrl)
+def get_movie_data(wikipedia_url):
+    if not wikipedia_url.startswith('http'):
+        wikipedia_url = get_url(wikipedia_url)
+    data = get_wiki_data(wikipedia_url)
     filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
     filmbox = {}
     _box = filmbox_data.strip().split('|')
@@ -104,7 +97,7 @@ def getMovieData(wikipediaUrl):
     filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
     return filmbox
 
-def getImageUrl(name):
+def get_image_url(name):
     url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
     data = read_url(url, unicode=True)
     url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
@@ -114,19 +107,19 @@ def getImageUrl(name):
         url = 'http:' + url
     return url
 
-def getPosterUrl(wikipediaUrl):
-    if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
-    data = getMovieData(wikipediaUrl)
+def get_poster_url(wikipedia_url):
+    if not wikipedia_url.startswith('http'): wikipedia_url = get_url(wikipedia_url)
+    data = get_movie_data(wikipedia_url)
     if 'image' in data:
-        return getImageUrl(data['image'])
+        return get_image_url(data['image'])
     return ''
 
-def getMoviePoster(wikipediaUrl):
-    # deprecated, use getPosterUrl()
-    return getPosterUrl(wikipediaUrl)
+def get_movie_poster(wikipedia_url):
+    # deprecated, use get_poster_url()
+    return get_poster_url(wikipedia_url)
 
-def getAllmovieId(wikipediaUrl):
-    data = getMovieData(wikipediaUrl)
+def get_allmovie_id(wikipedia_url):
+    data = get_movie_data(wikipedia_url)
     return data.get('amg_id', '')
 
 def find(query, max_results=10):

View file

@@ -8,7 +8,7 @@ import feedparser
 from ox.cache import read_url, cache_timeout
 
-def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout):
+def video_url(youtubeId, format='mp4', timeout=cache_timeout):
     """
         youtubeId - if of video
         format - video format, options: webm, 1080p, 720p, mp4, high