From a4fd3c930fe31f8320893f2265740f0b1b9a07f5 Mon Sep 17 00:00:00 2001 From: j <0x006A@0x2620.org> Date: Wed, 15 Aug 2012 17:15:40 +0200 Subject: [PATCH] ox.web under_score api rewrite --- ox/movie.py | 5 ++- ox/oembed.py | 2 +- ox/web/allmovie.py | 58 +++++++++++++++--------------- ox/web/amazon.py | 18 +++++----- ox/web/apple.py | 12 +++---- ox/web/archive.py | 8 ++--- ox/web/criterion.py | 24 ++++++------- ox/web/dailymotion.py | 2 +- ox/web/epguides.py | 4 +-- ox/web/flixter.py | 31 ++++++++-------- ox/web/freebase.py | 2 +- ox/web/imdb.py | 40 ++++++++++----------- ox/web/impawards.py | 37 ++++++++++--------- ox/web/itunes.py | 48 ++++++++++++------------- ox/web/lyricsfly.py | 2 +- ox/web/metacritic.py | 21 ++++++----- ox/web/mininova.py | 29 +++++++-------- ox/web/movieposterdb.py | 22 ++++++------ ox/web/opensubtitles.py | 4 +-- ox/web/oxdb.py | 2 +- ox/web/piratecinema.py | 4 +-- ox/web/rottentomatoes.py | 19 +++++----- ox/web/siteparser.py | 4 +-- ox/web/spiegel.py | 53 +++++++++++++-------------- ox/web/thepiratebay.py | 17 +++++---- ox/web/tv.py | 4 +-- ox/web/vimeo.py | 2 +- ox/web/wikipedia.py | 77 ++++++++++++++++++---------------------- ox/web/youtube.py | 2 +- 29 files changed, 268 insertions(+), 285 deletions(-) diff --git a/ox/movie.py b/ox/movie.py index a0966e6..611968d 100644 --- a/ox/movie.py +++ b/ox/movie.py @@ -307,6 +307,8 @@ def parse_movie_path(path): title = title.replace('_ ', ': ') if title.endswith('_'): title = title[:-1] + '.' + if title.startswith('_'): + title = '.' + title[1:] year = find_re(title, '(\(\d{4}\))') if not year: @@ -344,8 +346,9 @@ def parse_movie_path(path): else: season = None - episode = find_re(parts[-1], '\.Episode (\d+)\.') + episode = find_re(parts[-1], '\.Episode[s]* ([\d+]+)\.') if episode: + episode = episode.split('+')[0] episode = int(episode) else: episode = None diff --git a/ox/oembed.py b/ox/oembed.py index 26e100d..be0eba0 100644 --- a/ox/oembed.py +++ b/ox/oembed.py @@ -7,7 +7,7 @@ from utils import json, ET def get_embed_code(url, maxwidth=None, maxheight=None): embed = {} - header = cache.getHeaders(url) + header = cache.get_headers(url) if header.get('content-type', '').startswith('text/html'): html = cache.readUrl(url) json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('').findall(html)) diff --git a/ox/web/allmovie.py b/ox/web/allmovie.py index 76e961b..815555e 100644 --- a/ox/web/allmovie.py +++ b/ox/web/allmovie.py @@ -7,68 +7,68 @@ from ox import strip_tags, find_re from ox.cache import read_url -def getId(url): +def get_id(url): return url.split("/")[-1] -def getData(id): +def get_data(id): ''' - >>> getData('129689')['cast'][1][1] + >>> get_data('129689')['cast'][1][1] u'Marianne' - >>> getData('129689')['credits'][0][0] + >>> get_data('129689')['credits'][0][0] u'Jean-Luc Godard' - >>> getData('129689')['posters'][0] + >>> get_data('129689')['posters'][0] u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg' - >>> getData('129689')['rating'] + >>> get_data('129689')['rating'] u'4.5' ''' if id.startswith('http'): - id = getId(id) + id = get_id(id) data = { - "url": getUrl(id) + "url": get_url(id) } html = read_url(data["url"], unicode=True) - data['aka'] = parseList(html, 'AKA') + data['aka'] = parse_list(html, 'AKA') data['category'] = find_re(html, '
category
.*?
(.*?)
') - data['countries'] = parseList(html, 'countries') - data['director'] = parseEntry(html, 'directed by') - data['genres'] = parseList(html, 'genres') - data['keywords'] = parseList(html, 'keywords') + data['countries'] = parse_list(html, 'countries') + data['director'] = parse_entry(html, 'directed by') + data['genres'] = parse_list(html, 'genres') + data['keywords'] = parse_list(html, 'keywords') data['posters'] = [find_re(html, '(.*?)')).strip() - data['themes'] = parseList(html, 'themes') - data['types'] = parseList(html, 'types') + data['themes'] = parse_list(html, 'themes') + data['types'] = parse_list(html, 'types') data['year'] = find_re(html, '.*?(\d+)') #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('(.*?)')).strip() return data -def getUrl(id): +def get_url(id): return "http://allmovie.com/work/%s" % id -def parseEntry(html, title): +def parse_entry(html, title): html = find_re(html, '
%s
.*?
(.*?)
' % title) return strip_tags(html).strip() -def parseList(html, title): +def parse_list(html, title): html = find_re(html, '
%s
.*?
(.*?)
' % title.lower()) r = map(lambda x: strip_tags(x), re.compile('
  • (.*?)
  • ', re.DOTALL).findall(html)) if not r and html: r = [strip_tags(html)] return r -def parseTable(html): +def parse_table(html): return map( lambda x: map( lambda x: strip_tags(x).strip().replace(' ', ''), @@ -77,10 +77,10 @@ def parseTable(html): find_re(html, '
    (.*?)').split('')[:-1] ) -def parseText(html, title): +def parse_text(html, title): return strip_tags(find_re(html, '%s.*?

    (.*?)' % title)).strip() if __name__ == '__main__': - print getData('129689') - # print getData('177524') + print get_data('129689') + # print get_data('177524') diff --git a/ox/web/amazon.py b/ox/web/amazon.py index 75351e7..64289c8 100644 --- a/ox/web/amazon.py +++ b/ox/web/amazon.py @@ -13,17 +13,17 @@ def findISBN(title, author): data = read_url(url, unicode=True) links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data) id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/') - data = getData(id) + data = get_data(id) if author in data['authors']: return data return {} -def getData(id): +def get_data(id): url = "http://www.amazon.com/title/dp/%s/" % id data = read_url(url, unicode=True) - def findData(key): + def find_data(key): return find_re(data, '

  • %s:(.*?)
  • '% key).strip() r = {} @@ -34,15 +34,15 @@ def getData(id): t = re.compile('>(.*?)
    \(Translator\)').findall(data) if t: r['translator'] = t - r['publisher'] = findData('Publisher') - r['language'] = findData('Language') - r['isbn-10'] = findData('ISBN-10') - r['isbn-13'] = findData('ISBN-13').replace('-', '') + r['publisher'] = find_data('Publisher') + r['language'] = find_data('Language') + r['isbn-10'] = find_data('ISBN-10') + r['isbn-13'] = find_data('ISBN-13').replace('-', '') r['dimensions'] = find_re(data, '
  • .*?Product Dimensions:.*?(.*?)
  • ') - r['pages'] = findData('Paperback') + r['pages'] = find_data('Paperback') if not r['pages']: - r['pages'] = findData('Hardcover') + r['pages'] = find_data('Hardcover') r['review'] = strip_tags(find_re(data, '

    Review

    .*?
    (.*?)
    ').replace('
    ', '\n')).strip() diff --git a/ox/web/apple.py b/ox/web/apple.py index 2b8b34f..2725aac 100644 --- a/ox/web/apple.py +++ b/ox/web/apple.py @@ -14,7 +14,7 @@ HEADERS = { USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) ' USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3' -def getMovieData(title, director): +def get_movie_data(title, director): if isinstance(title, unicode): title = title.encode('utf-8') if isinstance(director, unicode): @@ -60,8 +60,8 @@ def getMovieData(title, director): return data if __name__ == '__main__': - print getMovieData('Alphaville', 'Jean-Luc Godard') - print getMovieData('Sin City', 'Roberto Rodriguez') - print getMovieData('Breathless', 'Jean-Luc Godard') - print getMovieData('Capitalism: A Love Story', 'Michael Moore') - print getMovieData('Film Socialisme', 'Jean-Luc Godard') + print get_movie_data('Alphaville', 'Jean-Luc Godard') + print get_movie_data('Sin City', 'Roberto Rodriguez') + print get_movie_data('Breathless', 'Jean-Luc Godard') + print get_movie_data('Capitalism: A Love Story', 'Michael Moore') + print get_movie_data('Film Socialisme', 'Jean-Luc Godard') diff --git a/ox/web/archive.py b/ox/web/archive.py index b79be2a..7f4b572 100644 --- a/ox/web/archive.py +++ b/ox/web/archive.py @@ -3,15 +3,15 @@ from .. import cache from ..utils import json -def getId(url): +def get_id(url): return url.split("/")[-1] -def getUrl(id): +def get_url(id): return "http://www.archive.org/details/%s" % id -def getData(id): +def get_data(id): data = {} - url = getUrl(id) + url = get_url(id) details = cache.read_url('%s?output=json' % url) details = json.loads(details) for key in ('title', 'description', 'runtime'): diff --git a/ox/web/criterion.py b/ox/web/criterion.py index 850b8b5..077e5ee 100644 --- a/ox/web/criterion.py +++ b/ox/web/criterion.py @@ -9,25 +9,25 @@ from ox.text import find_re, remove_special_characters import imdb -def getId(url): +def get_id(url): return url.split("/")[-1] -def getUrl(id): +def get_url(id): return "http://www.criterion.com/films/%s" % id -def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False): +def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False): ''' - >>> getData('1333')['imdbId'] + >>> get_data('1333')['imdbId'] u'0060304' - >>> getData('236')['posters'][0] + >>> get_data('236')['posters'][0] u'http://criterion_production.s3.amazonaws.com/release_images/1586/ThirdManReplace.jpg' - >>> getData('786')['posters'][0] + >>> get_data('786')['posters'][0] u'http://criterion_production.s3.amazonaws.com/product_images/185/343_box_348x490.jpg' ''' data = { - "url": getUrl(id) + "url": get_url(id) } try: html = read_url(data["url"], timeout=timeout, unicode=True) @@ -71,21 +71,21 @@ def getData(id, timeout=ox.cache.cache_timeout, get_imdb=False): if timeout == ox.cache.cache_timeout: timeout = -1 if get_imdb: - data['imdbId'] = imdb.getMovieId(data['title'], + data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], data['year'], timeout=timeout) return data -def getIds(): +def get_ids(): ids = [] html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True) results = re.compile("\&p=(\d+)\&").findall(html) pages = max(map(int, results)) for page in range(1, pages): - for id in getIdsByPage(page): + for id in get_idsByPage(page): ids.append(id) return map(lambda id: str(id), sorted(map(lambda id: int(id), set(ids)))) -def getIdsByPage(page): +def get_idsByPage(page): ids = [] url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page html = read_url(url, unicode=True) @@ -101,4 +101,4 @@ def getIdsByPage(page): return set(ids) if __name__ == '__main__': - print getIds() + print get_ids() diff --git a/ox/web/dailymotion.py b/ox/web/dailymotion.py index 3015fbf..cddae88 100644 --- a/ox/web/dailymotion.py +++ b/ox/web/dailymotion.py @@ -5,7 +5,7 @@ from urllib import unquote from ox.cache import read_url -def getVideoUrl(url): +def get_video_url(url): ''' >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0] 'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv' diff --git a/ox/web/epguides.py b/ox/web/epguides.py index 885cb2e..c5a7fcb 100644 --- a/ox/web/epguides.py +++ b/ox/web/epguides.py @@ -9,7 +9,7 @@ from ox.cache import read_url import google -def getShowUrl(title): +def get_show_url(title): ''' Search Epguide Url for Show via Show Title. Use Google to search the url, this is also done on Epguide. @@ -20,7 +20,7 @@ def getShowUrl(title): return url return None -def getShowData(url): +def get_show_data(url): data = read_url(url, unicode=True) r = {} r['title'] = strip_tags(find_re(data, '

    (.*?)

    ')) diff --git a/ox/web/flixter.py b/ox/web/flixter.py index 5cf3a8e..2d793b4 100644 --- a/ox/web/flixter.py +++ b/ox/web/flixter.py @@ -9,28 +9,28 @@ from ox import find_re, strip_tags from ox.web.imdb import ImdbCombined -def getData(id, timeout=-1): +def get_data(id, timeout=-1): ''' - >>> getData('the-matrix')['poster'] + >>> get_data('the-matrix')['poster'] 'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg' - >>> getData('0133093')['poster'] + >>> get_data('0133093')['poster'] 'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg' - >>> getData('2-or-3-things-i-know-about-her')['poster'] + >>> get_data('2-or-3-things-i-know-about-her')['poster'] 'http://content6.flixster.com/movie/10/95/43/10954392_gal.jpg' - >>> getData('0078875')['rottentomatoes_id'] + >>> get_data('0078875')['rottentomatoes_id'] 'http://www.rottentomatoes.com/m/the-tin-drum/' ''' if len(id) == 7: try: int(id) - id = getIdByImdb(id) + id = get_id(imdb=id) except: pass data = { - "url": getUrl(id), + "url": get_url(id), } html = read_url(data['url'], timeout=timeout, timeout=True) doc = document_fromstring(html) @@ -55,21 +55,20 @@ def getData(id, timeout=-1): return None return data -def getIdByImdb(imdbId): +def get_id(url=None, imdb=None): ''' - >>> getIdByImdb('0133093') + >>> get_id(imdb='0133093') u'the-matrix' - #>>> getIdByImdb('0060304') + #>>> get_id(imdb='0060304') #u'2-or-3-things-i-know-about-her' ''' - i = ImdbCombined(imdbId) - title = i['title'] - return title.replace(' ', '-').lower().replace("'", '') - -def getId(url): + if imdb: + i = ImdbCombined(imdb) + title = i['title'] + return title.replace(' ', '-').lower().replace("'", '') return url.split('/')[-1] -def getUrl(id): +def get_url(id): return "http://www.flixster.com/movie/%s"%id diff --git a/ox/web/freebase.py b/ox/web/freebase.py index d3a5313..c1cf37e 100644 --- a/ox/web/freebase.py +++ b/ox/web/freebase.py @@ -5,7 +5,7 @@ import json from ox.cache import read_url from ox import find_re -class Imdb(dict): +class Freebase(dict): def __init__(self, id, timeout=-1): url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id ''' diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 0e55b14..0da40b4 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -20,7 +20,7 @@ def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache. headers = headers.copy() return ox.cache.read_url(url, data, headers, timeout, unicode=unicode) -def getUrl(id): +def get_url(id): return "http://www.imdb.com/title/tt%s/" % id class Imdb(SiteParser): @@ -420,7 +420,7 @@ class ImdbCombined(Imdb): self.regex = _regex super(ImdbCombined, self).__init__(id, timeout) -def getMovieIdByTitle(title, timeout=-1): +def get_movie_by_title(title, timeout=-1): ''' This only works for exact title matches from the data dump Usually in the format @@ -431,22 +431,22 @@ def getMovieIdByTitle(title, timeout=-1): If there is more than one film with that title for the year Title (Year/I) - >>> getMovieIdByTitle(u'"Father Knows Best" (1954) {(#5.34)}') + >>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}') u'1602860' - >>> getMovieIdByTitle(u'The Matrix (1999)') + >>> get_movie_by_title(u'The Matrix (1999)') u'0133093' - >>> getMovieIdByTitle(u'Little Egypt (1951)') + >>> get_movie_by_title(u'Little Egypt (1951)') u'0043748' - >>> getMovieIdByTitle(u'Little Egypt (1897/I)') + >>> get_movie_by_title(u'Little Egypt (1897/I)') u'0214882' - >>> getMovieIdByTitle(u'Little Egypt') + >>> get_movie_by_title(u'Little Egypt') None - >>> getMovieIdByTitle(u'"Dexter" (2006) {Father Knows Best (#1.9)}') + >>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}') u'0866567' ''' params = {'s':'tt','q': title} @@ -465,21 +465,21 @@ def getMovieIdByTitle(title, timeout=-1): return results[0] return None -def getMovieId(title, director='', year='', timeout=-1): +def get_movie_id(title, director='', year='', timeout=-1): ''' - >>> getMovieId('The Matrix') + >>> get_movie_id('The Matrix') u'0133093' - >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard') + >>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard') u'0060304' - >>> getMovieId('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967') + >>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967') u'0060304' - >>> getMovieId(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard') + >>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard') u'0179214' - >>> getMovieId(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard') + >>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard') u'0179214' ''' imdbId = { @@ -555,12 +555,12 @@ def getMovieId(title, director='', year='', timeout=-1): #or nothing return '' -def getMoviePoster(imdbId): +def get_movie_poster(imdbId): ''' - >>> getMoviePoster('0133093') + >>> get_movie_poster('0133093') 'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg' - >>> getMoviePoster('0994352') + >>> get_movie_poster('0994352') 'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg' ''' info = ImdbCombined(imdbId) @@ -570,10 +570,10 @@ def getMoviePoster(imdbId): poster = find_re(data, 'img id="primary-img".*?src="(.*?)"') return poster elif 'series' in info: - return getMoviePoster(info['series']) + return get_movie_poster(info['series']) return '' -def maxVotes(): +def max_votes(): url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc' data = ox.cache.read_url(url) votes = max([int(v.replace(',', '')) @@ -581,7 +581,7 @@ def maxVotes(): return votes def guess(title, director='', timeout=-1): - return getMovieId(title, director, timeout=timeout) + return get_movie_id(title, director, timeout=timeout) if __name__ == "__main__": import json diff --git a/ox/web/impawards.py b/ox/web/impawards.py index a423afc..9323fee 100644 --- a/ox/web/impawards.py +++ b/ox/web/impawards.py @@ -7,19 +7,19 @@ from ox.html import strip_tags from ox.text import find_re -def getData(id): +def get_data(id): ''' - >>> getData('1991/silence_of_the_lambs')['imdbId'] + >>> get_data('1991/silence_of_the_lambs')['imdbId'] u'0102926' - >>> getData('1991/silence_of_the_lambs')['posters'][0] + >>> get_data('1991/silence_of_the_lambs')['posters'][0] u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg' - >>> getData('1991/silence_of_the_lambs')['url'] + >>> get_data('1991/silence_of_the_lambs')['url'] u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html' ''' data = { - 'url': getUrl(id) + 'url': get_url(id) } html = read_url(data['url'], unicode=True) data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})') @@ -48,7 +48,7 @@ def getData(id): return data -def getId(url): +def get_id(url): split = url.split('/') year = split[3] split = split[4][:-5].split('_') @@ -59,26 +59,25 @@ def getId(url): id = '%s/%s' % (year, '_'.join(split)) return id -def getIds(): +def get_ids(page=None): ids = [] + if page: + html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True) + results = re.compile('', re.DOTALL).findall(html) + for result in results: + url = 'http://impawards.com/%s' % result + ids.append(get_id(url)) + return set(ids) + #get all html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True) pages = int(find_re(html, '')) + 1 for page in range(pages, 0, -1): - for id in getIdsByPage(page): + for id in get_ids(page): if not id in ids: ids.append(id) return ids -def getIdsByPage(page): - ids = [] - html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True) - results = re.compile('', re.DOTALL).findall(html) - for result in results: - url = 'http://impawards.com/%s' % result - ids.append(getId(url)) - return set(ids) - -def getUrl(id): +def get_url(id): url = u"http://www.impawards.com/%s.html" % id html = read_url(url, unicode=True) if find_re(html, "No Movie Posters on This Page"): @@ -297,5 +296,5 @@ _id_map = { } if __name__ == '__main__': - ids = getIds() + ids = get_ids() print sorted(ids), len(ids) diff --git a/ox/web/itunes.py b/ox/web/itunes.py index 951b121..db8c7da 100644 --- a/ox/web/itunes.py +++ b/ox/web/itunes.py @@ -24,7 +24,7 @@ ITUNES_HEADERS = { 'Connection': 'close', } -def composeUrl(request, parameters): +def compose_url(request, parameters): if request == 'advancedSearch': url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?' if parameters['media'] == 'music': @@ -60,7 +60,7 @@ def composeUrl(request, parameters): url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id'] return url -def parseXmlDict(xml): +def parse_xml_dict(xml): values = {} strings = xml.split('') for string in strings: @@ -78,7 +78,7 @@ def parseXmlDict(xml): values[key] = value return values -def parseCast(xml, title): +def parse_cast(xml, title): list = [] try: strings = find_re(xml, '%s(.*?)' % title[:-1].upper()).split('') @@ -89,7 +89,7 @@ def parseCast(xml, title): except: return list -def parseMovies(xml, title): +def parse_movies(xml, title): list = [] try: strings = find_re(xml, '%s(.*?)' % title[:-1].upper()).split('') @@ -109,17 +109,17 @@ class ItunesAlbum: self.title = title self.artist = artist if not id: - self.id = self.getId() + self.id = self.get_id() - def getId(self): - url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist}) + def get_id(self): + url = compose_url('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist}) xml = read_url(url, headers = ITUNES_HEADERS) id = find_re(xml, 'viewAlbum\?id=(.*?)&') return id - def getData(self): + def get_data(self): data = {'id': self.id} - url = composeUrl('viewAlbum', {'id': self.id}) + url = compose_url('viewAlbum', {'id': self.id}) xml = read_url(url, None, ITUNES_HEADERS) data['albumName'] = find_re(xml, '(.*?)') data['artistName'] = find_re(xml, '(.*?)') @@ -130,7 +130,7 @@ class ItunesAlbum: data['tracks'] = [] strings = find_re(xml, 'items.*?(.*?)$').split('') for string in strings: - data['tracks'].append(parseXmlDict(string)) + data['tracks'].append(parse_xml_dict(string)) data['type'] = find_re(xml, 'listType(.*?)<') return data @@ -140,48 +140,48 @@ class ItunesMovie: self.title = title self.director = director if not id: - self.id = self.getId() + self.id = self.get_id() - def getId(self): - url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director}) + def get_id(self): + url = compose_url('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director}) xml = read_url(url, headers = ITUNES_HEADERS) id = find_re(xml, 'viewMovie\?id=(.*?)&') return id - def getData(self): + def get_data(self): data = {'id': self.id} - url = composeUrl('viewMovie', {'id': self.id}) + url = compose_url('viewMovie', {'id': self.id}) xml = read_url(url, None, ITUNES_HEADERS) f = open('/Users/rolux/Desktop/iTunesData.xml', 'w') f.write(xml) f.close() - data['actors'] = parseCast(xml, 'actors') + data['actors'] = parse_cast(xml, 'actors') string = find_re(xml, 'Average Rating:(.*?)') data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5 - data['directors'] = parseCast(xml, 'directors') + data['directors'] = parse_cast(xml, 'directors') data['format'] = find_re(xml, 'Format:(.*?)<') data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<')) data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY.*?(.*?)')) data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"') - data['producers'] = parseCast(xml, 'producers') + data['producers'] = parse_cast(xml, 'producers') data['rated'] = find_re(xml, 'Rated(.*?)<') - data['relatedMovies'] = parseMovies(xml, 'related movies') + data['relatedMovies'] = parse_movies(xml, 'related movies') data['releaseDate'] = find_re(xml, 'Released(.*?)<') data['runTime'] = find_re(xml, 'Run Time:(.*?)<') - data['screenwriters'] = parseCast(xml, 'screenwriters') + data['screenwriters'] = parse_cast(xml, 'screenwriters') data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&') data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"') return data if __name__ == '__main__': from ox.utils import json - data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData() + data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').get_data() print json.dumps(data, sort_keys = True, indent = 4) - data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData() + data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').get_data() print json.dumps(data, sort_keys = True, indent = 4) for v in data['relatedMovies']: - data = ItunesMovie(id = v['id']).getData() + data = ItunesMovie(id = v['id']).get_data() print json.dumps(data, sort_keys = True, indent = 4) - data = ItunesMovie(id='272960052').getData() + data = ItunesMovie(id='272960052').get_data() print json.dumps(data, sort_keys = True, indent = 4) diff --git a/ox/web/lyricsfly.py b/ox/web/lyricsfly.py index 7b8e9bb..12d821a 100644 --- a/ox/web/lyricsfly.py +++ b/ox/web/lyricsfly.py @@ -5,7 +5,7 @@ from ox.html import decode_html from ox.text import find_re -def getLyrics(title, artist): +def get_lyrics(title, artist): html = read_url('http://lyricsfly.com/api/') key = find_re(html, '(.*?)') url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title) diff --git a/ox/web/metacritic.py b/ox/web/metacritic.py index 8d25855..e59504a 100644 --- a/ox/web/metacritic.py +++ b/ox/web/metacritic.py @@ -7,25 +7,24 @@ from lxml.html import document_fromstring from ox.cache import read_url from ox import find_re, strip_tags -def getUrl(id): +def get_url(id=None, imdb=None): + if imdb: + url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb + data = read_url(url) + metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"') + return metacritic_url or None return 'http://www.metacritic.com/movie/%s' % id -def getId(url): +def get_id(url): return url.split('/')[-1] -def getUrlByImdb(imdb): - url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb - data = read_url(url) - metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"') - return metacritic_url or None - -def getMetacriticShowUrl(title): +def get_show_url(title): title = quote(title) url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title data = read_url(url) return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?') -def getData(url): +def get_data(url): data = read_url(url, unicode=True) doc = document_fromstring(data) score = filter(lambda s: s.attrib.get('property') == 'v:average', @@ -57,7 +56,7 @@ def getData(url): return { 'critics': metacritics, - 'id': getId(url), + 'id': get_id(url), 'score': score, 'url': url, } diff --git a/ox/web/mininova.py b/ox/web/mininova.py index c555051..8bba707 100644 --- a/ox/web/mininova.py +++ b/ox/web/mininova.py @@ -13,7 +13,7 @@ import ox from torrent import Torrent -def _parseResultsPage(data, max_results=10): +def _parse_results_page(data, max_results=10): results=[] regexp = '''(.*?)(.*?)(.*?).*?.*?''' for row in re.compile(regexp, re.DOTALL).findall(data): @@ -27,22 +27,17 @@ def _parseResultsPage(data, max_results=10): results.append((torrentTitle, torrentLink, '')) return results -def findMovie(query, max_results=10): +def find_movie(query=None, imdb=None, max_results=10): '''search for torrents on mininova ''' - url = "http://www.mininova.org/search/%s/seeds" % quote(query) + if imdb: + url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb) + else: + url = "http://www.mininova.org/search/%s/seeds" % quote(query) data = read_url(url, unicode=True) - return _parseResultsPage(data, max_results) + return _parse_results_page(data, max_results) -def findMovieByImdb(imdbId): - '''find torrents on mininova for a given imdb id - ''' - results = [] - imdbId = normalize_imdbid(imdbId) - data = read_url("http://www.mininova.org/imdb/?imdb=%s" % imdbId, unicode=True) - return _parseResultsPage(data) - -def getId(mininovaId): +def get_id(mininovaId): mininovaId = unicode(mininovaId) d = find_re(mininovaId, "/(\d+)") if d: @@ -54,7 +49,7 @@ def getId(mininovaId): return mininovaId[-1] def exists(mininovaId): - mininovaId = getId(mininovaId) + mininovaId = get_id(mininovaId) data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId) if not data or 'Torrent not found...' in data: return False @@ -62,11 +57,11 @@ def exists(mininovaId): return False return True -def getData(mininovaId): +def get_data(mininovaId): _key_map = { 'by': u'uploader', } - mininovaId = getId(mininovaId) + mininovaId = get_id(mininovaId) torrent = dict() torrent[u'id'] = mininovaId torrent[u'domain'] = 'mininova.org' @@ -101,7 +96,7 @@ class Mininova(Torrent): '72dfa59d2338e4a48c78cec9de25964cddb64104' ''' def __init__(self, mininovaId): - self.data = getData(mininovaId) + self.data = get_data(mininovaId) if not self.data: return Torrent.__init__(self) diff --git a/ox/web/movieposterdb.py b/ox/web/movieposterdb.py index 27f5638..d3294c3 100644 --- a/ox/web/movieposterdb.py +++ b/ox/web/movieposterdb.py @@ -6,39 +6,39 @@ import re from ox.cache import read_url from ox import find_re -def getData(id): +def get_data(id): ''' - >>> getData('0060304')['posters'][0] + >>> get_data('0060304')['posters'][0] u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg' - >>> getData('0123456')['posters'] + >>> get_data('0123456')['posters'] [] ''' data = { - "url": getUrl(id) + "url": get_url(id) } - data["posters"] = getPostersByUrl(data["url"]) + data["posters"] = get_posters(data["url"]) return data -def getId(url): +def get_id(url): return url.split("/")[-2] -def getPostersByUrl(url, group=True, timeout=-1): +def get_posters(url, group=True, timeout=-1): posters = [] html = read_url(url, timeout=timeout, unicode=True) if url in html: if group: results = re.compile('', re.DOTALL).findall(html) for result in results: - posters += getPostersByUrl(result, False) + posters += get_posters(result, False) results = re.compile('', re.DOTALL).findall(html) for result in results: html = read_url(result, timeout=timeout, unicode=True) posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"')) return posters -def getUrl(id): +def get_url(id): return "http://www.movieposterdb.com/movie/%s/" % id if __name__ == '__main__': - print getData('0060304') - print getData('0133093') + print get_data('0060304') + print get_data('0133093') diff --git a/ox/web/opensubtitles.py b/ox/web/opensubtitles.py index 1b35599..e3b5f0b 100644 --- a/ox/web/opensubtitles.py +++ b/ox/web/opensubtitles.py @@ -7,7 +7,7 @@ from ox.cache import read_url from ox import find_re, strip_tags from ox import langCode2To3, langTo3Code -def findSubtitlesByImdb(imdb, parts = 1, language = "eng"): +def find_subtitles(imdb, parts = 1, language = "eng"): if len(language) == 2: language = langCode2To3(language) elif len(language) != 3: @@ -29,7 +29,7 @@ def findSubtitlesByImdb(imdb, parts = 1, language = "eng"): opensubtitleId = find_re(data, '/en/subtitles/(.*?)/') return opensubtitleId -def downloadSubtitleById(opensubtitle_id): +def download_subtitle(opensubtitle_id): srts = {} data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id) reg_exp = 'href="(/en/download/file/.*?)">(.*?)' diff --git a/ox/web/oxdb.py b/ox/web/oxdb.py index 8443c94..3a9184e 100644 --- a/ox/web/oxdb.py +++ b/ox/web/oxdb.py @@ -2,7 +2,7 @@ # vi:si:et:sw=4:sts=4:ts=4 import ox.cache -def getPosterUrl(id): +def get_poster_url(id): url = "http://0xdb.org/%s/poster.0xdb.jpg" % id if ox.cache.exists(url): return url diff --git a/ox/web/piratecinema.py b/ox/web/piratecinema.py index 47902a4..fc74a3a 100644 --- a/ox/web/piratecinema.py +++ b/ox/web/piratecinema.py @@ -3,7 +3,7 @@ import re from ox.net import read_url -def getPosterUrl(id): +def get_poster_url(id): url = 'http://piratecinema.org/posters/' html = read_url(url, unicode=True) results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html) @@ -13,5 +13,5 @@ def getPosterUrl(id): return '' if __name__ == '__main__': - print getPosterUrl('0749451') + print get_poster_url('0749451') diff --git a/ox/web/rottentomatoes.py b/ox/web/rottentomatoes.py index cc7b041..8c89fd8 100644 --- a/ox/web/rottentomatoes.py +++ b/ox/web/rottentomatoes.py @@ -2,29 +2,30 @@ # vi:si:et:sw=4:sts=4:ts=4 import re -from ox.cache import getHeaders, read_url +from ox.cache import read_url from ox import find_re, strip_tags -def getUrlByImdb(imdb): +def get_url(id=None, imdb=None): #this would also wor but does not cache: ''' from urllib2 import urlopen u = urlopen(url) return u.url ''' - url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb - data = read_url(url) - if "movie_title" in data: - movies = re.compile('(/m/.*?/)').findall(data) - if movies: - return "http://www.rottentomatoes.com" + movies[0] + if imdb: + url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb + data = read_url(url) + if "movie_title" in data: + movies = re.compile('(/m/.*?/)').findall(data) + if movies: + return "http://www.rottentomatoes.com" + movies[0] return None def get_og(data, key): return find_re(data, '(.*?)') diff --git a/ox/web/siteparser.py b/ox/web/siteparser.py index f821cab..d215aef 100644 --- a/ox/web/siteparser.py +++ b/ox/web/siteparser.py @@ -27,7 +27,7 @@ class SiteParser(dict): baseUrl = '' regex = {} - def getUrl(self, page): + def get_url(self, page): return "%s%s" % (self.baseUrl, page) def read_url(self, url, timeout): @@ -35,7 +35,7 @@ class SiteParser(dict): def __init__(self, timeout=-1): for key in self.regex: - url = self.getUrl(self.regex[key]['page']) + url = self.get_url(self.regex[key]['page']) data = self.read_url(url, timeout) if isinstance(self.regex[key]['re'], basestring): data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data) diff --git a/ox/web/spiegel.py b/ox/web/spiegel.py index 1c968f6..390dde8 100644 --- a/ox/web/spiegel.py +++ b/ox/web/spiegel.py @@ -9,7 +9,7 @@ from ox.html import decode_html, strip_tags import ox.net -def getNews(year, month, day): +def get_news(year, month, day): sections = [ 'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt', 'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto' @@ -27,7 +27,7 @@ def getNews(year, month, day): for item in re.compile('
    (.*?)
    ', re.DOTALL).findall(item)[0]).strip() try: - description = formatString(re.compile('

    (.*?)<', re.DOTALL).findall(item)[0]) + description = format_string(re.compile('

    (.*?)<', re.DOTALL).findall(item)[0]) except: description = '' try: @@ -35,7 +35,7 @@ def getNews(year, month, day): except: imageUrl = '' try: - title = formatString(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':') + title = format_string(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':') except: title = '' if dateString[:10] == date and description and imageUrl and title.find(': ') != -1: @@ -45,12 +45,12 @@ def getNews(year, month, day): else: new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17]) # fix decode_html - # new['description'] = formatString(decode_html(description)) - new['description'] = formatString(description) + # new['description'] = format_string(decode_html(description)) + new['description'] = format_string(description) new['imageUrl'] = imageUrl - new['section'] = formatSection(section) - new['title'] = formatString(title) - new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(formatString(re.compile('

    (.*?)

    ', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf') + new['section'] = format_section(section) + new['title'] = format_string(title) + new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(format_string(re.compile('

    (.*?)

    ', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf') if new['title1'][-1:] == ':': new['title1'] = new['title1'][0:-1] new['title2'] = new['title'][len(new['title1']) + 2:] @@ -67,21 +67,21 @@ def getNews(year, month, day): ''' return news -def splitTitle(title): +def split_title(title): title1 = re.compile('(.*?): ').findall(title)[0] title2 = re.compile(': (.*?)$').findall(title)[0] return [title1, title2] -def formatString(string): +def format_string(string): string = string.replace(' ', '') string = string.replace('\n', ' ').replace(' ', ' ').strip() string = string.replace('&', '&').replace(''', '\'').replace('"', '"') return string -def formatSection(string): +def format_section(string): return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL') -def formatSubsection(string): +def format_subsection(string): # SPIEGEL, SPIEGEL special subsection = { 'abi': 'Abi - und dann?', @@ -98,7 +98,7 @@ def formatSubsection(string): return subsection[string].replace(u'\xc3', 'ae') return string[:1].upper() + string[1:] -def getIssue(year, week): +def get_issue(year, week): coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week) if not ox.net.exists(coverUrl): return None @@ -122,7 +122,7 @@ def getIssue(year, week): return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl} -def archiveIssues(): +def archive_issues(): ''' this is just an example of an archiving application ''' @@ -140,8 +140,8 @@ def archiveIssues(): else: wMax = 53 for w in range(wMax, 0, -1): - print 'getIssue(%d, %d)' % (y, w) - issue = getIssue(y, w) + print 'get_issue(%d, %d)' % (y, w) + issue = get_issue(y, w) if issue: dirname = '%s/%d/%02d' % (archivePath, y, w) if not os.path.exists(dirname): @@ -188,7 +188,7 @@ def archiveIssues(): print p['min'], p['sum'] / p['num'], p['max'] -def archiveNews(): +def archive_news(): ''' this is just an example of an archiving application ''' @@ -235,7 +235,7 @@ def archiveNews(): f.close() filename = filename[:-5] + '.txt' if not os.path.exists(filename) or True: - data = splitTitle(new['title']) + data = split_title(new['title']) data.append(new['description']) data = '\n'.join(data) f = open(filename, 'w') @@ -256,19 +256,14 @@ def archiveNews(): count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))} else: count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])} - strings = splitTitle(new['title']) + strings = split_title(new['title']) if strings[0] != new['title1'] or strings[1] != new['title2']: colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2'])) - for key in sortDictByKey(count): + for key in sorted(count): print '%6d %-24s %s' % (count[key]['count'], key, count[key]['string']) for value in colon: print value -def sortDictByKey(d): - keys = d.keys() - keys.sort() - return keys - if __name__ == '__main__': # spiegel = Spiegel(2008, 8) # print spiegel.getContents() @@ -281,12 +276,12 @@ if __name__ == '__main__': news = getNews(2008, 2, d) for new in news: strings = new['url'].split('/') - string = formatSection(strings[3]) + string = format_section(strings[3]) if len(strings) == 6: - string += '/' + formatSubsection(strings[4]) + string += '/' + format_subsection(strings[4]) if not string in x: x.append(string) print x ''' - # archiveIssues() - archiveNews() + # archive_issues() + archive_news() diff --git a/ox/web/thepiratebay.py b/ox/web/thepiratebay.py index cc89fae..3e8981e 100644 --- a/ox/web/thepiratebay.py +++ b/ox/web/thepiratebay.py @@ -22,7 +22,9 @@ def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_ headers['Cookie'] = 'language=en_EN' return cache.read_url(url, data, headers, timeout, unicode=unicode) -def findMovies(query, max_results=10): +def find_movies(query=None, imdb=None, max_results=10): + if imdb: + query = "tt" + normalize_imdbid(imdb) results = [] next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ] page_count = 1 @@ -47,10 +49,7 @@ def findMovies(query, max_results=10): next = re.compile('.*?next.gif.*?').findall(data) return results -def findMovieByImdb(imdb): - return findMovies("tt" + normalize_imdbid(imdb)) - -def getId(piratebayId): +def get_id(piratebayId): if piratebayId.startswith('http://torrents.thepiratebay.org/'): piratebayId = piratebayId.split('org/')[1] d = find_re(piratebayId, "tor/(\d+)") @@ -62,10 +61,10 @@ def getId(piratebayId): return piratebayId def exists(piratebayId): - piratebayId = getId(piratebayId) + piratebayId = get_id(piratebayId) return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId) -def getData(piratebayId): +def get_data(piratebayId): _key_map = { 'spoken language(s)': u'language', 'texted language(s)': u'subtitle language', @@ -73,7 +72,7 @@ def getData(piratebayId): 'leechers': 'leecher', 'seeders': 'seeder', } - piratebayId = getId(piratebayId) + piratebayId = get_id(piratebayId) torrent = dict() torrent[u'id'] = piratebayId torrent[u'domain'] = 'thepiratebay.org' @@ -108,7 +107,7 @@ class Thepiratebay(Torrent): '4e84415d36ed7b54066160c05a0b0f061898d12b' ''' def __init__(self, piratebayId): - self.data = getData(piratebayId) + self.data = get_data(piratebayId) if not self.data: return Torrent.__init__(self) diff --git a/ox/web/tv.py b/ox/web/tv.py index f5a36f1..33e3399 100644 --- a/ox/web/tv.py +++ b/ox/web/tv.py @@ -7,12 +7,12 @@ from ox import strip_tags, find_re from ox.cache import read_url -def getEpisodeData(url): +def get_episode_data(url): ''' prases informatin on tvcom episode pages returns dict with title, show, description, score example: - getEpisodeData('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html') + get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html') ''' data = read_url(url, unicode=True) r = {} diff --git a/ox/web/vimeo.py b/ox/web/vimeo.py index cf2257c..f51216f 100644 --- a/ox/web/vimeo.py +++ b/ox/web/vimeo.py @@ -8,7 +8,7 @@ from ox.cache import read_url from ox import find_string, find_re -def getData(id): +def get_data(id): url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id xml = read_url(url) tree = ET.parse(StringIO(xml)) diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py index 48a5c5a..c809c01 100644 --- a/ox/web/wikipedia.py +++ b/ox/web/wikipedia.py @@ -8,52 +8,45 @@ from ox.cache import read_url from ox import find_re, decode_html -def getId(url): +def get_id(url): return url.split("/")[-1] -def getUrl(id): +def get_url(id=None, imdb=None, allmovie=None): + if imdb: + query = '"%s"'% imdbId + result = find(query) + if result: + url = result[0][1] + data = get_movie_data(url) + if 'imdb_id' in data: + return url + return "" + if allmovie: + query = '"amg_id = 1:%s"'% allmovie + result = find(query) + if result: + url = result[0][1] + return url + return '' return "http://en.wikipedia.org/wiki/%s" % id - -def getMovieId(title, director='', year=''): +def get_movie_id(title, director='', year=''): query = '"%s" film %s %s' % (title, director, year) result = find(query, 1) if result: return result[0][1] return '' -def getUrlByImdbId(imdbId): - query = '"%s"'% imdbId - result = find(query) - if result: - url = result[0][1] - data = getMovieData(url) - if 'imdb_id' in data: - return url - return "" - -def getUrlByImdb(imdbId): - # deprecated, use getUrlByImdbId() - return getUrlByImdbId(imdbId) - -def getUrlByAllmovieId(allmovieId): - query = '"amg_id = 1:%s"'% allmovieId - result = find(query) - if result: - url = result[0][1] - return url - return '' - -def getWikiData(wikipediaUrl): - url = wikipediaUrl.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=') +def get_wiki_data(wikipedia_url): + url = wikipedia_url.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=') url = "%s&action=raw" % url data = read_url(url).decode('utf-8') return data -def getMovieData(wikipediaUrl): - if not wikipediaUrl.startswith('http'): - wikipediaUrl = getUrl(wikipediaUrl) - data = getWikiData(wikipediaUrl) +def get_movie_data(wikipedia_url): + if not wikipedia_url.startswith('http'): + wikipedia_url = get_url(wikipedia_url) + data = get_wiki_data(wikipedia_url) filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''') filmbox = {} _box = filmbox_data.strip().split('|') @@ -104,7 +97,7 @@ def getMovieData(wikipediaUrl): filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''') return filmbox -def getImageUrl(name): +def get_image_url(name): url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20') data = read_url(url, unicode=True) url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"') @@ -114,19 +107,19 @@ def getImageUrl(name): url = 'http:' + url return url -def getPosterUrl(wikipediaUrl): - if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl) - data = getMovieData(wikipediaUrl) +def get_poster_url(wikipedia_url): + if not wikipedia_url.startswith('http'): wikipedia_url = get_url(wikipedia_url) + data = get_movie_data(wikipedia_url) if 'image' in data: - return getImageUrl(data['image']) + return get_image_url(data['image']) return '' -def getMoviePoster(wikipediaUrl): - # deprecated, use getPosterUrl() - return getPosterUrl(wikipediaUrl) +def get_movie_poster(wikipedia_url): + # deprecated, use get_poster_url() + return get_poster_url(wikipedia_url) -def getAllmovieId(wikipediaUrl): - data = getMovieData(wikipediaUrl) +def get_allmovie_id(wikipedia_url): + data = get_movie_data(wikipedia_url) return data.get('amg_id', '') def find(query, max_results=10): diff --git a/ox/web/youtube.py b/ox/web/youtube.py index f1edfc9..6c7651e 100644 --- a/ox/web/youtube.py +++ b/ox/web/youtube.py @@ -8,7 +8,7 @@ import feedparser from ox.cache import read_url, cache_timeout -def getVideoUrl(youtubeId, format='mp4', timeout=cache_timeout): +def video_url(youtubeId, format='mp4', timeout=cache_timeout): """ youtubeId - if of video format - video format, options: webm, 1080p, 720p, mp4, high