From 4c14ce613d280a64e2cfb6b9174606ddb841d5a3 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Thu, 19 Jun 2008 11:47:02 +0200
Subject: [PATCH] vi:si:et:sw=4:sts=4:ts=4

---
 ox/__init__.py      |    4 +-
 ox/criterion.py     |    1 +
 ox/dailymotion.py   |   27 +-
 ox/google.py        |   34 +-
 ox/imdb.py          | 1136 +++++++++++++++++++++----------
 ox/impawards.py     |    4 +-
 ox/itunes.py        |  288 +++++------
 ox/lyricsfly.py     |    4 +-
 ox/mininova.py      |  193 ++++----
 ox/opensubtitles.py |   64 ++-
 ox/spiegel.py       |    5 +-
 ox/thepiratebay.py  |  168 ++++---
 ox/torrent.py       |   62 ++-
 ox/wikipedia.py     |  102 ++--
 ox/youtube.py       |   80 +--
 setup.py            |   50 +-
 16 files changed, 1088 insertions(+), 1134 deletions(-)

diff --git a/ox/__init__.py b/ox/__init__.py
index 4e1b167..aaacdc1 100644
--- a/ox/__init__.py
+++ b/ox/__init__.py
@@ -1,7 +1,5 @@
-# -*- Mode: Python; -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
 # encoding: utf-8
-
 __version__ = '0.1.0'
 
 import imdb
diff --git a/ox/criterion.py b/ox/criterion.py
index 4ba4fe6..f2cf34a 100644
--- a/ox/criterion.py
+++ b/ox/criterion.py
@@ -7,6 +7,7 @@ from oxutils.cache import getUrlUnicode
 from oxutils.html import stripTags
 from oxutils.text import findRe, removeSpecialCharacters
 
+
 def getData(criterionId):
     '''
     >>> getData(348)['imdbId']
diff --git a/ox/dailymotion.py b/ox/dailymotion.py
index 1dafa75..f50b5eb 100644
--- a/ox/dailymotion.py
+++ b/ox/dailymotion.py
@@ -1,19 +1,22 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
 import re
 from urllib import unquote
 from oxutils.cache import getUrl
 
+
 def getVideoUrl(url):
-  '''
-  >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
-  'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0'
+    '''
+    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms')
+    'http://www.dailymotion.com/get/16/320x240/flv/6191379.flv?key=0a710ad6ffbfe980b1252569d16f957313399d0'
 
-  >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
-  'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4'
-  '''
-  data = getUrl(url)
-  video = re.compile('''video", "(.*?)"''').findall(data)
-  for v in video:
-    v = unquote(v).split('@@')[0]
-    return "http://www.dailymotion.com" + v
-  return ''
+    >>> getVideoUrl('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms')
+    'http://www.dailymotion.com/get/15/320x240/flv/6197800.flv?key=08a18365ca6962c5ff7526f69872c36813399d4'
+    '''
+    data = getUrl(url)
+    video = re.compile('''video", "(.*?)"''').findall(data)
+    for v in video:
+        v = unquote(v).split('@@')[0]
+        return "http://www.dailymotion.com" + v
+    return ''
diff --git a/ox/google.py b/ox/google.py
index 03d0774..1fa910a 100644
--- a/ox/google.py
+++ b/ox/google.py
@@ -1,6 +1,5 @@
-# -*- Mode: Python; -*-
 # -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
 import re
 import time
 import urllib
@@ -29,24 +28,23 @@
 FIXME: how search depper than first page?
 '''
 DEFAULT_MAX_RESULTS = 10
 
 def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS):
-  google_timeout=24*60*60
-  return oxutils.cache.getUrl(url, data, headers, google_timeout)
+    google_timeout=24*60*60
+    return oxutils.cache.getUrl(url, data, headers, google_timeout)
 
 def quote_plus(s):
-  return urllib.quote_plus(s.encode('utf-8'))
+    return urllib.quote_plus(s.encode('utf-8'))
 
 def find(query, max_results=DEFAULT_MAX_RESULTS):
-  url = "http://www.google.com/search?q=%s" % quote_plus(query)
-  data = getUrl(url)
-  link_re = r'(?P.*?)' + \
-      r'.*?(?:|)' + \
-      r'(?P.*?)' + '(?:| max_results:
-  results = results[:max_results]
-  return results
-
+    url = "http://www.google.com/search?q=%s" % quote_plus(query)
+    data = getUrl(url)
+    link_re = r'(?P.*?)' + \
+        r'.*?(?:|)' + \
+        r'(?P.*?)' + '(?:| max_results:
+    results = results[:max_results]
+    return results
diff --git a/ox/imdb.py b/ox/imdb.py
index 2c1f325..d776095 100644
--- a/ox/imdb.py
+++ b/ox/imdb.py
@@ -1,11 +1,8 @@
-# -*- Mode: Python; -*-
 # -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
-from oxutils import *
+# vi:si:et:sw=4:sts=4:ts=4
 import urllib2
 from urllib import quote, unquote
-import re, time
+import re
 import os
 import time
 
@@ -15,688 +12,647 @@ import oxutils
 from oxutils import stripTags, decodeHtml, findRe, findString
 from oxutils.cache import getUrl, getUrlUnicode
 from oxutils.normalize import normalizeTitle, normalizeImdbId
+from oxutils import *
 
 import google
 
-_timer = -1
-_timer_last = -1
-def debugTime(message=''):
-  global _timer, _timer_last
-  if _timer == -1:
-    _timer = time.time()
-  if _timer_last == -1:
-    _timer_last = time.time()
-  now = time.time()
-  print message," since last: %0.2f total time: %0.2f" % (now-_timer_last, now-_timer)
-  _timer_last = now
 
 def getMovieId(title, director='', year=''):
-  '''
-  >>> getMovieId('The Matrix')
-  '0133093'
-  '''
-  if year:
-    title = "%s (%s)" % (title, year)
-  if director:
-    query = 'site:imdb.com %s "%s"' % (director, title)
-  else:
-    query = 'site:imdb.com "%s"' % title
-  for (name, url, desc) in google.find(query, 3):
-    if url.startswith('http://www.imdb.com/title/tt'):
-      return url[28:35]
-  return ''
+    '''
+    >>> getMovieId('The Matrix')
+    '0133093'
+    '''
+    if year:
+        title = "%s (%s)" % (title, year)
+    if director:
+        query = 'site:imdb.com %s "%s"' % (director, title)
+    else:
+        query = 'site:imdb.com "%s"' % title
+    for (name, url, desc) in google.find(query, 3):
+        if url.startswith('http://www.imdb.com/title/tt'):
+            return url[28:35]
+    return ''
 
 def getMovieData(imdbId):
-  return IMDb(imdbId).parse()
+    return IMDb(imdbId).parse()
 
 # internal functions below
 
 def getUrlBase(imdbId):
-  return "http://www.imdb.com/title/tt%s" % imdbId
+    return "http://www.imdb.com/title/tt%s" % imdbId
 
 def getRawMovieData(imdbId):
-  imdbId = normalizeImdbId(imdbId)
-  data = getMovieInfo(imdbId)
-  data['credits'] = getMovieCredits(imdbId)
-  data['poster'] = getMoviePoster(imdbId)
-  data['company credits'] = getMovieCompanyCredits(imdbId)
-  data['filming locations'] = getMovieLocations(imdbId)
-  data['movie connections'] = getMovieConnections(imdbId)
-  data['external reviews'] = getMovieExternalReviews(imdbId)
-  data['trivia'] = getMovieTrivia(imdbId)
-  data['keywords'] = getMovieKeywords(imdbId)
-  data['media'] = {}
-  data['media']['images'] = getMovieImages(imdbId)
-  data['media']['trailers'] = getMovieTrailers(imdbId)
-  data['plotsummary'] = getMoviePlot(imdbId)
-  data['release dates'] = getMovieReleaseDates(imdbId)
-  data['release date'] = getMovieReleaseDate(imdbId)
-  return data
+    imdbId = normalizeImdbId(imdbId)
+    data = getMovieInfo(imdbId)
+    data['credits'] = getMovieCredits(imdbId)
+    data['poster'] = getMoviePoster(imdbId)
+    data['company credits'] = getMovieCompanyCredits(imdbId)
+    data['filming locations'] = getMovieLocations(imdbId)
+    data['movie connections'] = getMovieConnections(imdbId)
+    data['external reviews'] = getMovieExternalReviews(imdbId)
+    data['trivia'] = getMovieTrivia(imdbId)
+    data['keywords'] = getMovieKeywords(imdbId)
+    data['media'] = {}
+    data['media']['images'] = getMovieImages(imdbId)
+    data['media']['trailers'] = getMovieTrailers(imdbId)
+    data['plotsummary'] = getMoviePlot(imdbId)
+    data['release dates'] = getMovieReleaseDates(imdbId)
+    data['release date'] = getMovieReleaseDate(imdbId)
+    return data
 
 def getMovieInfo(imdbId):
-  data = getUrlUnicode(getUrlBase(imdbId))
-  info = dict()
-  info['poster'] = findRe(data, 'name="poster".*?(.*?):(.*?) (.*?):(.*?) (.*?) ')
-  if not html_title:
-    html_title = findRe(data, '(.*?)')
-  if html_title:
-    html_title = html_title.replace('', ' ').replace(' ', ' ')
-  title = decodeHtml(html_title)
-  title = stripTags(title)
-  year = findRe(title, '\((\d{4})\)')
-  if not year:
-    year = findRe(title, '\((\d{4})')
-  _y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
-  if _y:
-    title = title.replace(_y, '')
-  for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
-    title = title.replace(t, '')
-  title = title.strip()
-  if title.find(u'\xa0') > -1:
-    title = title[:title.find(u'\xa0')].strip()
-  if title.startswith('"') and title.endswith('"'):
-    title = title[1:-1]
-  info['title'] = title
-  info['year'] = year
+    #get Title
+    title = ''
+    year = ''
+    html_title = findRe(data, '(.*?)')
+    if not html_title:
+        html_title = findRe(data, '(.*?)')
+    if html_title:
+        html_title = html_title.replace('', ' ').replace(' ', ' ')
+    title = decodeHtml(html_title)
+    title = stripTags(title)
+    year = findRe(title, '\((\d{4})\)')
+    if not year:
+        year = findRe(title, '\((\d{4})')
+    _y = findRe(title, r'(\([0-9\?]{4}[/IVXLCDM]*?\))')
+    if _y:
+        title = title.replace(_y, '')
+    for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'):
+        title = title.replace(t, '')
+    title = title.strip()
+    if title.find(u'\xa0') > -1:
+        title = title[:title.find(u'\xa0')].strip()
+    if title.startswith('"') and title.endswith('"'):
+        title = title[1:-1]
+    info['title'] = title
+    info['year'] = year
 
-  #Rating
-  rating = findRe(data, '([\d\.]*?)/10')
-  if rating:
-    info['rating'] = float(rating)
-  else:
-    info['rating'] = -1
+    #Rating
+    rating = findRe(data, '([\d\.]*?)/10')
+    if rating:
+        info['rating'] = float(rating)
+    else:
+        info['rating'] = -1
 
-  #Votes
-  votes = findRe(data, '\((.*?) votes\)')
-  if votes:
-    info['votes'] = int(votes.replace(',', ''))
-  else:
-    info['votes'] = -1
-  return info
+    #Votes
+    votes = findRe(data, '\((.*?) votes\)')
+    if votes:
+        info['votes'] = int(votes.replace(',', ''))
+    else:
+        info['votes'] = -1
+    return info
 
 def getMoviePoster(imdbId):
-  info = getMovieInfo(imdbId)
-  return info['poster']
+    info = getMovieInfo(imdbId)
+    return info['poster']
 
 def getMovieYear(imdbId):
-  info = getMovieInfo(imdbId)
-  return info['year']
+    info = getMovieInfo(imdbId)
+    return info['year']
 
 def getMovieTitle(imdbId):
-  info = getMovieInfo(imdbId)
-  return info['title']
+    info = getMovieInfo(imdbId)
+    return info['title']
 
 def creditList(data, section=None):
-  if section == 'cast':
-    credits_ = re.compile('''(.*?).*?(.*?)''').findall(data)
-  else:
-    credits_ = re.compile('''.*?(.*?)(.*?)''').findall(data)
-  credits = []
-  for c_ in credits_:
-    c = [decodeHtml(c_[0]).strip(), decodeHtml(c_[1]).strip()]
-    if section=='writers':
-      c[1] = c[1].replace('', '').strip().replace(')', '').replace('(','')
-      if c[1].endswith(' and'): c[1] = c[1][:-4]
-    credits.append(c)
-  return credits
+    if section == 'cast':
+        credits_ = re.compile('''(.*?).*?(.*?)''').findall(data)
+    else:
+        credits_ = re.compile('''.*?(.*?)(.*?)''').findall(data)
+    credits = []
+    for c_ in credits_:
+        c = [decodeHtml(c_[0]).strip(), decodeHtml(c_[1]).strip()]
+        if section=='writers':
+            c[1] = c[1].replace('', '').strip().replace(')', '').replace('(','')
+            if c[1].endswith(' and'): c[1] = c[1][:-4]
+        credits.append(c)
+    return credits
 
 def getMovieCredits(imdbId):
-  credits = dict()
-  url = "%s/fullcredits" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  groups = data.split('')
-  for g in groups:
-    section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
-    if section:
-      credits[section[0]] = creditList(g, section[0])
-  return credits
+    credits = dict()
+    url = "%s/fullcredits" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    groups = data.split('')
+    for g in groups:
+        section = re.compile('''name="(.*?)".*? href="/Glossary''').findall(g)
+        if section:
+            credits[section[0]] = creditList(g, section[0])
+    return credits
 
 def getMovieTrailers(imdbId):
-  url = "%s/trailers" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  soup = BeautifulSoup(data)
-  videos = soup('div', {'class':"video-gallery"})
-  trailers = []
-  if videos:
-    for a in videos[0]('a'):
-      title = stripTags(unicode(a)).strip()
-      url = 'http://www.imdb.com' + a['href']
-      videoId = findRe(url, '/(vi\d*?)/')
-      iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
-      iframe = getUrlUnicode(iframeUrl)
-      videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
-      trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
-  return trailers
+    url = "%s/trailers" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    soup = BeautifulSoup(data)
+    videos = soup('div', {'class':"video-gallery"})
+    trailers = []
+    if videos:
+        for a in videos[0]('a'):
+            title = stripTags(unicode(a)).strip()
+            url = 'http://www.imdb.com' + a['href']
+            videoId = findRe(url, '/(vi\d*?)/')
+            iframeUrl = "http://www.imdb.com/video/trailer/%s/player" % videoId
+            iframe = getUrlUnicode(iframeUrl)
+            videoUrl = unquote(findRe(iframe, 'addVariable\("file", "(.*?)"'))
+            trailers.append({'title': title, 'url': url, 'iframe': iframeUrl, 'flv':videoUrl})
+    return trailers
 
 def getMovieQuotes(imdbId):
-  url = "%s/quotes" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  quotes = re.compile('(.*?):(.*?)', re.DOTALL).findall(findString(data, '
+    quotes = re.compile('(.*?):(.*?)', re.DOTALL).findall(findString(data, '
-  plot = findRe(data, '(.*?)')
-  return plot
+    url = "%s/plotsummary" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    plot = findRe(data, '(.*?)')
+    return plot
 
 def getMovieTechnical(imdbId):
-  url = "%s/technical" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  results = {}
-  for t in re.compile('(.*?)(.*?)', re.DOTALL).findall(data):
-    results[t[0].strip()] = t[1].strip()
-  return results
+    url = "%s/technical" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    results = {}
+    for t in re.compile('(.*?)(.*?)', re.DOTALL).findall(data):
+        results[t[0].strip()] = t[1].strip()
+    return results
 
 def getMovieCompanyCredits(imdbId):
-  url = "%s/companycredits" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  results = {}
-  for field, c in re.compile('(.*?)(.*?)').findall(data):
-    results[field.strip()] = []
-    for company in re.compile('• (.*?)• ').findall(c):
-      results[field.strip()].append(company)
-  return results
+    url = "%s/companycredits" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    results = {}
+    for field, c in re.compile('(.*?)(.*?)').findall(data):
+        results[field.strip()] = []
+        for company in re.compile('• (.*?)• ').findall(c):
+            results[field.strip()].append(company)
+    return results
 
 def getMovieLocations(imdbId):
-  url = "%s/locations" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  soup = BeautifulSoup(data)
-  locations = []
-  for key in soup('a', {'href': re.compile('^/List')}):
-    locations.append(decodeHtml(key.string))
-  return locations
+    url = "%s/locations" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    soup = BeautifulSoup(data)
+    locations = []
+    for key in soup('a', {'href': re.compile('^/List')}):
+        locations.append(decodeHtml(key.string))
+    return locations
 
 def getMovieImages(imdbId, keys=('still_frame', 'poster', 'product')):
-  photos = {}
-  for key in keys:
-    url = "%s/mediaindex?refine=%s" % (getUrlBase(imdbId), key)
-    data = getUrlUnicode(url)
-    photos[key] = {}
-    for s in re.compile('''(.*?)', '').strip()
-      if t.startswith('• ') and t.endswith('• '):
-        t = t[4:-5].strip()
-      t=decodeHtml(t)
-      trivia.append(t)
-  return trivia
+    url = "%s/trivia" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    soup = BeautifulSoup(data)
+    trivia = []
+    triviaList = []
+    for i in soup('ul', {'class': "trivia"}):
+        for t in i('li'):
+            t = unicode(t).replace('', '').strip()
+            if t.startswith('• ') and t.endswith('• '):
+                t = t[4:-5].strip()
+            t=decodeHtml(t)
+            trivia.append(t)
+    return trivia
 
 def getMovieConnections(imdbId):
-  url = "%s/movieconnections" % getUrlBase(imdbId)
-  data = getUrl(url)
-  connections={}
-  for c in re.compile('''(.*?)(.*?)\n\n''', re.DOTALL).findall(data):
-    connections[unicode(c[0])] = re.compile('''''').findall(c[1])
-  return connections
+    url = "%s/movieconnections" % getUrlBase(imdbId)
+    data = getUrl(url)
+    connections={}
+    for c in re.compile('''(.*?)(.*?)\n\n''', re.DOTALL).findall(data):
+        connections[unicode(c[0])] = re.compile('''''').findall(c[1])
+    return connections
 
 def getMovieKeywords(imdbId):
-  url = "%s/keywords" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  keywords = []
-  for keyword in re.compile('''(.*?)''').findall(data):
-    keyword = decodeHtml(keyword)
-    keyword = keyword.replace(u'\xa0', ' ')
-    keywords.append(keyword)
-  return keywords
+    url = "%s/keywords" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    keywords = []
+    for keyword in re.compile('''(.*?)''').findall(data):
+        keyword = decodeHtml(keyword)
+        keyword = keyword.replace(u'\xa0', ' ')
+        keywords.append(keyword)
+    return keywords
 
 def getMovieExternalReviews(imdbId):
-  url = "%s/externalreviews" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  soup = BeautifulSoup(data)
-  ol = soup('ol')
-  if ol:
-    ol = ol[0]
-    ret = {}
-    for li in ol('li'):
-      try:
-        a = li('a')[0]
-        href = a.get('href')
-        txt = a.contents[0]
-        ret[href] = txt
-      except:
-        pass
-    return ret
-  return {}
+    url = "%s/externalreviews" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    soup = BeautifulSoup(data)
+    ol = soup('ol')
+    if ol:
+        ol = ol[0]
+        ret = {}
+        for li in ol('li'):
+            try:
+                a = li('a')[0]
+                href = a.get('href')
+                txt = a.contents[0]
+                ret[href] = txt
+            except:
+                pass
+        return ret
+    return {}
 
 def getMovieReleaseDate(imdbId):
-  releasedates = getMovieReleaseDates(imdbId)
-  first_release = ''
-  for r in releasedates:
-    if not first_release or r[1] < first_release:
-      first_release = r[1]
-  return first_release
+    releasedates = getMovieReleaseDates(imdbId)
+    first_release = ''
+    for r in releasedates:
+        if not first_release or r[1] < first_release:
+            first_release = r[1]
+    return first_release
 
 def getMovieReleaseDates(imdbId):
-  url = "%s/releaseinfo" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  releasedates = []
-  regexp = '''(.*?).*?(.*?).*?(.*?)'''
+    url = "%s/releaseinfo" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    releasedates = []
+    regexp = '''(.*?).*?(.*?).*?(.*?)'''
 
-  def _parse_date(d):
-    try:
-      parsed_date = time.strptime(d, "%d %B %Y")
-      parsed_date = time.strftime('%Y-%m-%d', parsed_date)
-      return parsed_date
-    except:
-      return d
-
-  for r in re.compile(regexp, re.DOTALL).findall(data):
-    r_ = (stripTags(r[0]).strip(),
-      _parse_date(stripTags(r[1]).strip()),
-      decodeHtml(stripTags(r[2]).strip()))
-    releasedates.append(r_)
-  return releasedates
-  soup = BeautifulSoup(data)
-  info = soup('table',{'border': '0', 'cellpadding':'2'})
-  if info:
-    for row in info[0]('tr'):
-      d = row('td', {'align':'right'})
-      if d:
-        try:
-          possible_date = stripTags(unicode(d[0])).strip()
-          rdate = time.strptime(possible_date, "%d %B %Y")
-          rdate = time.strftime('%Y-%m-%d', rdate)
-          return rdate
-        except:
-          pass
-  return None
+    def _parse_date(d):
+        try:
+            parsed_date = time.strptime(d, "%d %B %Y")
+            parsed_date = time.strftime('%Y-%m-%d', parsed_date)
+            return parsed_date
+        except:
+            return d
+
+    for r in re.compile(regexp, re.DOTALL).findall(data):
+        r_ = (stripTags(r[0]).strip(),
+            _parse_date(stripTags(r[1]).strip()),
+            decodeHtml(stripTags(r[2]).strip()))
+        releasedates.append(r_)
+    return releasedates
 
 def getMovieBusinessSum(imdbId):
-  business = getMovieBusiness(imdbId)
-  b_ = {'budget': 0, 'gross': 0, 'profit': 0}
-  if 'budget' in business:
-    b_['budget'] = sum([int(intValue(i.replace(',', ''))) for i in business['budget']])
-  if 'gross' in business:
-    b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']])
-  if 'weekend gross' in business:
-    b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']])
-  if b_['budget'] and b_['gross']:
-    b_['profit'] = b_['gross'] - b_['budget']
-  return b_
+    business = getMovieBusiness(imdbId)
+    b_ = {'budget': 0, 'gross': 0, 'profit': 0}
+    if 'budget' in business:
+        b_['budget'] = sum([int(intValue(i.replace(',', ''))) for i in business['budget']])
+    if 'gross' in business:
+        b_['gross'] = sum([int(intValue(i.replace(',', ''))) for i in business['gross']])
+    if 'weekend gross' in business:
+        b_['gross'] += sum([int(intValue(i.replace(',', ''))) for i in business['weekend gross']])
+    if b_['budget'] and b_['gross']:
+        b_['profit'] = b_['gross'] - b_['budget']
+    return b_
 
 def getMovieFlimingDates(imdbId):
-  business = getMovieBusiness(imdbId)
-  if 'filming dates' in business and business['filming dates']:
-    return business['filming dates'][0]
-  return ''
+    business = getMovieBusiness(imdbId)
+    if 'filming dates' in business and business['filming dates']:
+        return business['filming dates'][0]
+    return ''
 
 def getMovieBusiness(imdbId):
-  url = "%s/business" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  business = {}
-  for r in re.compile('''(.*?)(.*?).''', re.DOTALL).findall(data):
-    key = stripTags(r[0]).strip().lower()
-    value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('')]
-    business[key] = value
-  return business
-  soup = BeautifulSoup(data)
-  business = {'budget': 0, 'gross': 0, 'profit': 0}
-  content = soup('div', {'id': 'tn15content'})[0]
-  blocks = unicode(content).split('')[1:]
-  for c in blocks:
-    cs = BeautifulSoup(c)
-    line = c.split('')
-    if line:
-      title = line[0]
-      line = line[1]
-      if title in ['Budget', 'Gross']:
-        values = re.compile('\$(.*?) ').findall(line)
-        values = [int(value.replace(',','')) for value in values]
-        if values:
-          business[title.lower()] = max(values)
-  if business['budget'] and business['gross']:
-    business['profit'] = business['gross'] - business['budget']
-  return business
+    url = "%s/business" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    business = {}
+    for r in re.compile('''(.*?)(.*?).''', re.DOTALL).findall(data):
+        key = stripTags(r[0]).strip().lower()
+        value = [decodeHtml(stripTags(b).strip()) for b in r[1].split('')]
+        business[key] = value
+    return business
 
 def getMovieEpisodes(imdbId):
-  url = "%s/episodes" % getUrlBase(imdbId)
-  data = getUrlUnicode(url)
-  episodes = {}
-  regexp = r'''Season (.*?), Episode (.*?): (.*?)(.*?)(.*?)'''
-  for r in re.compile(regexp, re.DOTALL).findall(data):
-    try:
-      episode = "S%02dE%02d" % (int(r[0]), int(r[1]))
-      episodes[episode] = {}
-      episodes[episode]['imdb'] = r[2]
-      episodes[episode]['title'] = r[3].strip()
-      if episodes[episode]['title'].startswith('Episode #%d'%int(r[0])):
-        episodes[episode]['title'] = u''
-      description = decodeHtml(r[5])
-      description = stripTags(description.split('Next US airings:')[0])
-      episodes[episode]['description'] = description.strip()
-      episodes[episode]['date'] = ''
-      try:
-        d = stripTags(r[4])
-        d = d.replace('Original Air Date: ', '')
-        d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
-        episodes[episode]['date'] = d
-      except:
-        pass
-    except:
-      import traceback
-      print traceback.print_exc()
-      pass
-  return episodes
+    url = "%s/episodes" % getUrlBase(imdbId)
+    data = getUrlUnicode(url)
+    episodes = {}
+    regexp = r'''Season (.*?), Episode (.*?): (.*?)(.*?)(.*?)'''
+    for r in re.compile(regexp, re.DOTALL).findall(data):
+        try:
+            episode = "S%02dE%02d" % (int(r[0]), int(r[1]))
+            episodes[episode] = {}
+            episodes[episode]['imdb'] = r[2]
+            episodes[episode]['title'] = r[3].strip()
+            if episodes[episode]['title'].startswith('Episode #%d'%int(r[0])):
+                episodes[episode]['title'] = u''
+            description = decodeHtml(r[5])
+            description = stripTags(description.split('Next US airings:')[0])
+            episodes[episode]['description'] = description.strip()
+            episodes[episode]['date'] = ''
+            try:
+                d = stripTags(r[4])
+                d = d.replace('Original Air Date: ', '')
+                d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y'))
+                episodes[episode]['date'] = d
+            except:
+                pass
+        except:
+            import traceback
+            print traceback.print_exc()
+            pass
+    return episodes
 
 '''the old code below'''
 
 class IMDb:
-  def __init__(self, imdbId):
-    self.imdb = imdbId
-    self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
+    def __init__(self, imdbId):
+        self.imdb = imdbId
+        self.pageUrl = "http://www.imdb.com/title/tt%s/" % self.imdb
 
-  def getPage(self):
-    return getUrlUnicode(self.pageUrl)
+    def getPage(self):
+        return getUrlUnicode(self.pageUrl)
 
-  def parse_raw_value(self, key, value):
-    if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
-      value = stripTags(value).strip()
-    if key == 'runtime':
-      parsed_value = findRe(value, '(.*?) min')
-      parsed_value = findRe(parsed_value, '([0-9]+)')
-      if not parsed_value:
-        parsed_value = findRe(value, '(.*?) sec')
-        parsed_value = findRe(parsed_value, '([0-9]+)')
-        if not parsed_value:
-          parsed_value = 0
-        else:
-          parsed_value = int(parsed_value)
-      else:
-        parsed_value = int(parsed_value) * 60
-    elif key in ('country', 'language'):
-      parsed_value = value.split(' / ')
-      if len(parsed_value) == 1:
-        parsed_value = parsed_value[0].split(' | ')
-      parsed_value = [v.strip() for v in parsed_value]
-    elif key == 'genre':
-      parsed_value = value.replace('more', '').strip().split(' / ')
-      if len(parsed_value) == 1:
-        parsed_value = parsed_value[0].split(' | ')
-      parsed_value = [v.strip() for v in parsed_value]
-    elif key == 'tagline':
-      parsed_value = value.replace('more', '').strip()
-    elif key == 'plot_outline':
-      parsed_value = value.replace('(view trailer)', '').strip()
-      if parsed_value.endswith('more'):
-        parsed_value = parsed_value[:-4].strip()
-    elif key == 'tv_series':
-      m = re.compile('(.*?)').findall(value)
-      if m:
-        parsed_value = m[0][0]
-      else:
-        parsed_value = ''
-    elif key == 'also_known_as':
-      parsed_value = ''
-      m = re.compile('(.*) \(International: English title').findall(value)
-      if m:
-        parsed_value = m[0]
-      else:
-        m = re.compile('(.*) \(USA').findall(value)
-        if m:
-          parsed_value = m[0]
-      parsed_value = parsed_value.split('')[-1].split('(')[0]
-      director = self.getCredits().get('director', None)
-      if director:
-        director = director[0]
-        parsed_value = parsed_value.replace(director, '')
-        if parsed_value.startswith("'s"):
-          parsed_value = parsed_value[2:].strip()
-      parsed_value = decodeHtml(parsed_value.strip())
-    else:
-      print value
-      parsed_value = value
-    return parsed_value
+    def parse_raw_value(self, key, value):
+        if key in ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline'):
+            value = stripTags(value).strip()
+        if key == 'runtime':
+            parsed_value = findRe(value, '(.*?) min')
+            parsed_value = findRe(parsed_value, '([0-9]+)')
+            if not parsed_value:
+                parsed_value = findRe(value, '(.*?) sec')
+                parsed_value = findRe(parsed_value, '([0-9]+)')
+                if not parsed_value:
+                    parsed_value = 0
+                else:
+                    parsed_value = int(parsed_value)
+            else:
+                parsed_value = int(parsed_value) * 60
+        elif key in ('country', 'language'):
+            parsed_value = value.split(' / ')
+            if len(parsed_value) == 1:
+                parsed_value = parsed_value[0].split(' | ')
+            parsed_value = [v.strip() for v in parsed_value]
+        elif key == 'genre':
+            parsed_value = value.replace('more', '').strip().split(' / ')
+            if len(parsed_value) == 1:
+                parsed_value = parsed_value[0].split(' | ')
+            parsed_value = [v.strip() for v in parsed_value]
+        elif key == 'tagline':
+            parsed_value = value.replace('more', '').strip()
+        elif key == 'plot_outline':
+            parsed_value = value.replace('(view trailer)', '').strip()
+            if parsed_value.endswith('more'):
+                parsed_value = parsed_value[:-4].strip()
+        elif key == 'tv_series':
+            m = re.compile('(.*?)').findall(value)
+            if m:
+                parsed_value = m[0][0]
+            else:
+                parsed_value = ''
+        elif key == 'also_known_as':
+            parsed_value = ''
+            m = re.compile('(.*) \(International: English title').findall(value)
+            if m:
+                parsed_value = m[0]
+            else:
+                m = re.compile('(.*) \(USA').findall(value)
+                if m:
+                    parsed_value = m[0]
+            parsed_value = parsed_value.split('')[-1].split('(')[0]
+            director = self.getCredits().get('director', None)
+            if director:
+                director = director[0]
+                parsed_value = parsed_value.replace(director, '')
+                if parsed_value.startswith("'s"):
+                    parsed_value = parsed_value[2:].strip()
+            parsed_value = decodeHtml(parsed_value.strip())
+        else:
+            print value
+            parsed_value = value
+        return parsed_value
 
-  def parseTitle(self):
-    title = getMovieTitle(self.imdb)
-    title = normalizeTitle(title)
-    if title.startswith('"') and title.find('"',1) > 0 and \
-       title.find('"',1) == title.rfind('"'):
-      data = self.getPage()
-      se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
-      if se:
-        se = se[0]
-        se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
-        title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
-      else:
-        part2 = title[title.rfind('"')+1:]
-        part2 = re.sub("[\d\?-]", "", part2).strip()
-        title = normalizeTitle(title[1:title.rfind('"')])
-        if part2:
-          title += ':' + part2
-    return normalizeTitle(title)
+    def parseTitle(self):
+        title = getMovieTitle(self.imdb)
+        title = normalizeTitle(title)
+        if title.startswith('"') and title.find('"',1) > 0 and \
+           title.find('"',1) == title.rfind('"'):
+            data = self.getPage()
+            se = re.compile("Season (\d*), Episode (\d*)\)").findall(data)
+            if se:
+                se = se[0]
+                se = ' (S%02dE%02d) ' % (int(se[0]), int(se[1]))
+                title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:].strip()
+            else:
+                part2 = title[title.rfind('"')+1:]
+                part2 = re.sub("[\d\?-]", "", part2).strip()
+                title = normalizeTitle(title[1:title.rfind('"')])
+                if part2:
+                    title += ':' + part2
+        return normalizeTitle(title)
 
-  def parseYear(self):
-    year = ''
-    data = self.getPage()
-    soup = BeautifulSoup(data)
-    html_title = soup('div', {'id': 'tn15title'})
-    if not html_title:
-      html_title = soup('title')
-    if html_title:
-      html_title = unicode(html_title[0])
-      html_title = stripTags(html_title)
-      year = re.compile('\((\d{4})\)').findall(html_title)
-      if not year:
-        year = re.compile('\((\d{4})/').findall(html_title)
-      if year:
-        year = year[0]
-      else: year = ''
-    return year
+    def parseYear(self):
+        year = ''
+        data = self.getPage()
+        soup = BeautifulSoup(data)
+        html_title = soup('div', {'id': 'tn15title'})
+        if not html_title:
+            html_title = soup('title')
+        if html_title:
+            html_title = unicode(html_title[0])
+            html_title = stripTags(html_title)
+            year = re.compile('\((\d{4})\)').findall(html_title)
+            if not year:
+                year = re.compile('\((\d{4})/').findall(html_title)
+            if year:
+                year = year[0]
+            else: year = ''
+        return year
 
-  def parse(self):
-    data = self.getPage()
-    IMDbDict ={}
-    #Poster
-    IMDbDict['poster'] = getMoviePoster(self.imdb)
-    if not IMDbDict['poster']:
-      IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
-    #Title, Year
-    IMDbDict['year'] = self.parseYear()
-    IMDbDict['title'] = self.parseTitle()
-    #Rating
-    m = re.compile('(.*?)/10', re.IGNORECASE).search(data)
-    if m:
-      IMDbDict['rating'] = int(float(m.group(1)) * 1000)
-    else:
-      IMDbDict['rating'] = -1
-    #Votes
-    m = re.compile('\((.*?) votes\)', re.IGNORECASE).findall(data)
-    if m:
-      IMDbDict['votes'] = int(m[0].replace(',', ''))
-    else:
-      IMDbDict['votes'] = -1
-    data = data.replace('\n',' ')
-    #some values
-    keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
-    for key in keys:
-      IMDbDict[key] = ''
-    IMDbDict['runtime'] = 0
-    soup = BeautifulSoup(data)
-    for info in soup('div', {'class': 'info'}):
-      key = unicode(info).split('')[0].split('')
-      if len(key) > 1:
-        raw_value = unicode(info).split('')[1]
-        key = key[1][:-1].lower().replace(' ', '_')
-        if key in keys:
-          IMDbDict[key] = self.parse_raw_value(key, raw_value)
-    IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
-    #is episode
-    IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
-    IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
-    if IMDbDict['episodes']:
-      IMDbDict['tvshow'] = True
-    else:
-      IMDbDict['tvshow'] = False
-    IMDbDict['credits'] = self.getCredits()
-    IMDbDict['plot'] = getMoviePlot(self.imdb)
-    IMDbDict['keywords'] = getMovieKeywords(self.imdb)
-    IMDbDict['trivia'] = getMovieTrivia(self.imdb)
-    IMDbDict['connections'] = getMovieConnections(self.imdb)
-    IMDbDict['locations'] = getMovieLocations(self.imdb)
-    IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
-    IMDbDict['business'] = getMovieBusinessSum(self.imdb)
-    IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
-    IMDbDict['stills'] = getMovieStills(self.imdb)
-    #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
-    self.IMDbDict = IMDbDict
-    if IMDbDict['episode_of']:
-      episode_of =IMDb(IMDbDict['episode_of']).parse()
-      for key in ('country', 'language'):
-        if not IMDbDict[key]:
-          IMDbDict[key] = episode_of[key]
-    return self.IMDbDict
+    def parse(self):
+        data = self.getPage()
+        IMDbDict ={}
+        #Poster
+        IMDbDict['poster'] = getMoviePoster(self.imdb)
+        if not IMDbDict['poster']:
+            IMDbDict['poster'] = 'http://i.imdb.com/Heads/npa.gif'
+        #Title, Year
+        IMDbDict['year'] = self.parseYear()
+        IMDbDict['title'] = self.parseTitle()
+
+        #Rating
+        m = re.compile('(.*?)/10', re.IGNORECASE).search(data)
+        if m:
+            IMDbDict['rating'] = int(float(m.group(1)) * 1000)
+        else:
+            IMDbDict['rating'] = -1
+        #Votes
+        m = re.compile('\((.*?) votes\)', re.IGNORECASE).findall(data)
+        if m:
+            IMDbDict['votes'] = int(m[0].replace(',', ''))
+        else:
+            IMDbDict['votes'] = -1
+
+        data = data.replace('\n',' ')
+        #some values
+        keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as')
+        for key in keys:
+            IMDbDict[key] = ''
+        IMDbDict['runtime'] = 0
+        soup = BeautifulSoup(data)
+        for info in soup('div', {'class': 'info'}):
+            key = unicode(info).split('')[0].split('')
+            if len(key) > 1:
+                raw_value = unicode(info).split('')[1]
+                key = key[1][:-1].lower().replace(' ', '_')
+                if key in keys:
+                    IMDbDict[key] = self.parse_raw_value(key, raw_value)
+        IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title'])
+        #is episode
+        IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '')
+
+        IMDbDict['episodes'] = getMovieEpisodes(self.imdb)
+        if IMDbDict['episodes']:
+            IMDbDict['tvshow'] = True
+        else:
+            IMDbDict['tvshow'] = False
+        IMDbDict['credits'] = self.getCredits()
+        IMDbDict['plot'] = getMoviePlot(self.imdb)
+        IMDbDict['keywords'] = getMovieKeywords(self.imdb)
+        IMDbDict['trivia'] = getMovieTrivia(self.imdb)
+        IMDbDict['connections'] = getMovieConnections(self.imdb)
+        IMDbDict['locations'] = getMovieLocations(self.imdb)
+        IMDbDict['release_date'] = getMovieReleaseDate(self.imdb)
+        IMDbDict['business'] = getMovieBusinessSum(self.imdb)
+        IMDbDict['reviews'] = getMovieExternalReviews(self.imdb)
+        IMDbDict['stills'] = getMovieStills(self.imdb)
+        #IMDbDict['trailer'] = getMovieTrailer(self.imdb)
+        self.IMDbDict = IMDbDict
+
+        if IMDbDict['episode_of']:
+            episode_of =IMDb(IMDbDict['episode_of']).parse()
+            for key in ('country', 'language'):
+                if not IMDbDict[key]:
+                    IMDbDict[key] = episode_of[key]
+        return self.IMDbDict
 
-  def getCredits(self):
-    raw_credits = getMovieCredits(self.imdb)
-    credits = {}
-    def getNames(creditList):
-      return [stripTags(decodeHtml(c[0])) for c in creditList]
-    credits['director'] = getNames(raw_credits.get('directors', ''))
-    credits['writer'] = getNames(raw_credits.get('writers', ''))
-    credits['producer'] = getNames(raw_credits.get('producers', ''))
-    credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])]
-    self.credits = credits
-    return self.credits
+    def getCredits(self):
+        raw_credits = getMovieCredits(self.imdb)
+        credits = {}
+
+        def getNames(creditList):
+            return [stripTags(decodeHtml(c[0])) for c in creditList]
+
+        credits['director'] = getNames(raw_credits.get('directors', ''))
+        credits['writer'] = getNames(raw_credits.get('writers', ''))
+        credits['producer'] = getNames(raw_credits.get('producers', ''))
+        credits['cast'] = [(stripTags(decodeHtml(c[0])),stripTags(decodeHtml(c[1]))) for c in raw_credits.get('cast', [])]
+
+        self.credits = credits
+        return self.credits
 
 def guess(title, director=''):
-  #FIXME: proper file -> title
-  title = title.split('-')[0]
-  title = title.split('(')[0]
-  title = title.split('.')[0]
-  title = title.strip()
-  imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
-  return_url = ''
-  #lest first try google
-  #i.e. site:imdb.com Michael Stevens Sin
-  if director:
-    search = 'site:imdb.com %s "%s"' % (director, title)
-  else:
-    search = 'site:imdb.com "%s"' % title
-  for (name, url, desc) in google.find(search, 2):
-    if url.startswith('http://www.imdb.com/title/tt'):
-      return url[28:35]
+    #FIXME: proper file -> title
+    title = title.split('-')[0]
+    title = title.split('(')[0]
+    title = title.split('.')[0]
+    title = title.strip()
+    imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8'))
+    return_url = ''
 
+    #lest first try google
+    #i.e. site:imdb.com Michael Stevens Sin
+    if director:
+        search = 'site:imdb.com %s "%s"' % (director, title)
+    else:
+        search = 'site:imdb.com "%s"' % title
+    for (name, url, desc) in google.find(search, 2):
+        if url.startswith('http://www.imdb.com/title/tt'):
+            return url[28:35]
 
-  try:
+    try:
+        req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS)
+        u = urllib2.urlopen(req)
+        data = u.read()
+        return_url = u.url
+        u.close()
+    except:
+        return None
+    if return_url.startswith('http://www.imdb.com/title/tt'):
+        return return_url[28:35]
+    if data:
+        imdb_id = findRe(data.replace('\n', ' '), 'Popular Results.*?1. .*?')
diff --git a/ox/itunes.py b/ox/itunes.py
-  for string in strings:
-    if string.find('') != -1:
-      key = findRe(string, '(.*?)')
-      type = findRe(string, '<(.*?)>')
-      if type == 'true/':
-        value = True
-      else:
-        value = findRe(string, '<%s>(.*?)' % (type, type))
-        if type == 'integer':
-          value = int(value)
-        elif type == 'string':
-          value = decodeHtml(value)
-      values[key] = value
-  return values
+    values = {}
+    strings = xml.split('')
+    for string in strings:
+        if string.find('') != -1:
+            key = findRe(string, '(.*?)')
+            type = findRe(string, '<(.*?)>')
+            if type == 'true/':
+                value = True
+            else:
+                value = findRe(string, '<%s>(.*?)' % (type, type))
+                if type == 'integer':
+                    value = int(value)
+                elif type == 'string':
+                    value = decodeHtml(value)
+            values[key] = value
+    return values
 
 def parseCast(xml, title):
-  list = []
-  try:
-    strings = findRe(xml, '%s(.*?)' % title[:-1].upper()).split('')
-    strings.pop()
-    for string in strings:
-      list.append(findRe(string, '(.*?)'))
-    return list
-  except:
-    return list
+    list = []
+    try:
+        strings = findRe(xml, '%s(.*?)' % title[:-1].upper()).split('')
+        strings.pop()
+        for string in strings:
+            list.append(findRe(string, '(.*?)'))
+        return list
+    except:
+        return list
 
 def parseMovies(xml, title):
-  list = []
-  try:
-    strings = findRe(xml, '%s(.*?)' % title[:-1].upper()).split('')
-    strings.pop()
-    for string in strings:
-      list.append({
-        'id': findRe(string, 'viewMovie\?id=(.*?)&'),
-        'title': findRe(string, '(.*?)')
-      })
-    return list
-  except:
-    return list
+    list = []
+    try:
+        strings = findRe(xml, '%s(.*?)' % title[:-1].upper()).split('')
+        strings.pop()
+        for string in strings:
+            list.append({
+                'id': findRe(string, 'viewMovie\?id=(.*?)&'),
+                'title': findRe(string, '(.*?)')
+            })
+        return list
+    except:
+        return list
 
 class ItunesAlbum:
-  def __init__(self, id = '', title = '', artist = ''):
-    self.id = id
-    self.title = title
-    self.artist = artist
-    if not id:
-      self.id = self.getId()
+    def __init__(self, id = '', title = '', artist = ''):
+        self.id = id
+        self.title = title
+        self.artist = artist
+        if not id:
+            self.id = self.getId()
 
-  def getId(self):
-    url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
-    xml = getUrl(url, headers = ITUNES_HEADERS)
-    id = findRe(xml, 'viewAlbum\?id=(.*?)&')
-    return id
+    def getId(self):
+        url = composeUrl('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
+        xml = getUrl(url, headers = ITUNES_HEADERS)
+        id = findRe(xml, 'viewAlbum\?id=(.*?)&')
+        return id
 
-  def getData(self):
-    data = {'id': self.id}
-    url = composeUrl('viewAlbum', {'id': self.id})
-    xml = getUrl(url, None, ITUNES_HEADERS)
-    data['albumName'] = findRe(xml, '(.*?)')
-    data['artistName'] = findRe(xml, '(.*?)')
-    data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
-    data['genre'] = findRe(xml, 'Genre:(.*?)<')
-    data['releaseDate'] = findRe(xml, 'Released(.*?)<')
-    data['review'] = stripTags(findRe(xml, 'REVIEW.*?(.*?)'))
-    data['tracks'] = []
-    strings = findRe(xml, 'items.*?(.*?)$').split('')
-    for string in strings:
-      data['tracks'].append(parseXmlDict(string))
-    data['type'] = findRe(xml, 'listType(.*?)<')
-    return data
+    def getData(self):
+        data = {'id': self.id}
+        url = composeUrl('viewAlbum', {'id': self.id})
+        xml = getUrl(url, None, ITUNES_HEADERS)
+        data['albumName'] = findRe(xml, '(.*?)')
+        data['artistName'] = findRe(xml, '(.*?)')
+        data['coverUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
+        data['genre'] = findRe(xml, 'Genre:(.*?)<')
+        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
+        data['review'] = stripTags(findRe(xml, 'REVIEW.*?(.*?)'))
+        data['tracks'] = []
+        strings = findRe(xml, 'items.*?(.*?)$').split('')
+        for string in strings:
+            data['tracks'].append(parseXmlDict(string))
+        data['type'] = findRe(xml, 'listType(.*?)<')
+        return data
 
 class ItunesMovie:
-  def __init__(self, id = '', title = '', director = ''):
-    self.id = id
-    self.title = title
-    self.director = director
-    if not id:
-      self.id = self.getId()
+    def __init__(self, id = '', title = '', director = ''):
+        self.id = id
+        self.title = title
+        self.director = director
+        if not id:
+            self.id = self.getId()
 
-  def getId(self):
-    url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
-    xml = getUrl(url, headers = ITUNES_HEADERS)
-    id = findRe(xml, 'viewMovie\?id=(.*?)&')
-    return id
+    def getId(self):
+        url = composeUrl('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
+        xml = getUrl(url, headers = ITUNES_HEADERS)
+        id = findRe(xml, 'viewMovie\?id=(.*?)&')
+        return id
 
-  def getData(self):
-    data = {'id': self.id}
-    url = composeUrl('viewMovie', {'id': self.id})
-    xml = getUrl(url, None, ITUNES_HEADERS)
-    f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
-    f.write(xml)
-    f.close()
-    data['actors'] = parseCast(xml, 'actors')
-    string = findRe(xml, 'Average Rating:(.*?)')
-    data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
-    data['directors'] = parseCast(xml, 'directors')
-    data['format'] = findRe(xml, 'Format:(.*?)<')
-    data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
-    data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY.*?(.*?)'))
-    data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
-    data['producers'] = parseCast(xml, 'producers')
-    data['rated'] = findRe(xml, 'Rated(.*?)<')
-    data['relatedMovies'] = parseMovies(xml, 'related movies')
-    data['releaseDate'] = findRe(xml, 'Released(.*?)<')
-    data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
-    data['screenwriters'] = parseCast(xml, 'screenwriters')
-    data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
-    data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
-    return data
+    def getData(self):
+        data = {'id': self.id}
+        url = composeUrl('viewMovie', {'id': self.id})
+        xml = getUrl(url, None, ITUNES_HEADERS)
+        f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
+        f.write(xml)
+        f.close()
+        data['actors'] = parseCast(xml, 'actors')
+        string = findRe(xml, 'Average Rating:(.*?)')
+        data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
+        data['directors'] = parseCast(xml, 'directors')
+        data['format'] = findRe(xml, 'Format:(.*?)<')
+        data['genre'] = decodeHtml(findRe(xml, 'Genre:(.*?)<'))
+        data['plotSummary'] = decodeHtml(findRe(xml, 'PLOT SUMMARY.*?(.*?)'))
+        data['posterUrl'] = findRe(xml, 'reflection="." url="(.*?)"')
+        data['producers'] = parseCast(xml, 'producers')
+        data['rated'] = findRe(xml, 'Rated(.*?)<')
+        data['relatedMovies'] = parseMovies(xml, 'related movies')
+        data['releaseDate'] = findRe(xml, 'Released(.*?)<')
+        data['runTime'] = findRe(xml, 'Run Time:(.*?)<')
+        data['screenwriters'] = parseCast(xml, 'screenwriters')
+        data['soundtrackId'] = findRe(xml, 'viewAlbum\?id=(.*?)&')
+        data['trailerUrl'] = findRe(xml, 'autoplay="." url="(.*?)"')
+        return data
 
 if __name__ == '__main__':
-  import simplejson
-  data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
-  print simplejson.dumps(data, sort_keys = True, indent = 4)
-  data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
-  print simplejson.dumps(data, sort_keys = True, indent = 4)
-  for v in data['relatedMovies']:
-    data = ItunesMovie(id = v['id']).getData()
-    print simplejson.dumps(data, sort_keys = True, indent = 4)
-  data = ItunesMovie(id='272960052').getData()
-  print simplejson.dumps(data, sort_keys = True, indent = 4)
+    import simplejson
+    data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').getData()
+    print simplejson.dumps(data, sort_keys = True, indent = 4)
+    data = ItunesMovie(title = 'The Matrix', director = 'Wachowski').getData()
+    print simplejson.dumps(data, sort_keys = True, indent = 4)
+    for v in data['relatedMovies']:
+        data = ItunesMovie(id = v['id']).getData()
+        print simplejson.dumps(data, sort_keys = True, indent = 4)
+    data = ItunesMovie(id='272960052').getData()
+    print simplejson.dumps(data, sort_keys = True, indent = 4)
+
diff --git a/ox/lyricsfly.py b/ox/lyricsfly.py
index 7ae489e..2b2fe8b 100644
--- a/ox/lyricsfly.py
+++ b/ox/lyricsfly.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+# vi:si:et:sw=4:sts=4:ts=4
 from oxutils.cache import getUrl
 from oxutils.html import decodeHtml
 from oxutils.text import findRe
@@ -16,4 +18,4 @@ def getLyrics(title, artist):
     return lyrics
 
 if __name__ == '__main__':
-    print getLyrics('Election Day', 'Arcadia')
\ No newline at end of file
+    print getLyrics('Election Day', 'Arcadia')
diff --git a/ox/mininova.py b/ox/mininova.py
index c569e6a..36357e0 100644
--- a/ox/mininova.py
+++ b/ox/mininova.py
@@ -1,7 +1,5 @@
-# -*- Mode: Python; -*-
 # -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
+# vi:si:et:sw=4:sts=4:ts=4
 from datetime import datetime
 import re
 import socket
@@ -14,116 +12,115 @@ import oxutils
 
 from torrent import Torrent
 
-socket.setdefaulttimeout(10.0)
 
 def _parseResultsPage(data, max_results=10):
-  results=[]
-  regexp = '''(.*?)(.*?)(.*?).*?.*?'''
-  for row in re.compile(regexp, re.DOTALL).findall(data):
-    torrentDate = row[0]
-    torrentExtra = row[1]
-    torrentId = row[2]
-    torrentTitle = decodeHtml(row[3]).strip()
-    torrentLink = "http://www.mininova.org/tor/" + torrentId
-    privateTracker = 'priv.gif' in torrentExtra
-    if not privateTracker:
-      results.append((torrentTitle, torrentLink, ''))
-  return results
+    results=[]
+    regexp = '''(.*?)(.*?)(.*?).*?.*?'''
+    for row in re.compile(regexp, re.DOTALL).findall(data):
+        torrentDate = row[0]
+        torrentExtra = row[1]
+        torrentId = row[2]
+        torrentTitle = decodeHtml(row[3]).strip()
+        torrentLink = "http://www.mininova.org/tor/" + torrentId
+        privateTracker = 'priv.gif' in torrentExtra
+        if not privateTracker:
+            results.append((torrentTitle, torrentLink, ''))
+    return results
 
 def findMovie(query, max_results=10):
-  '''search for torrents on mininova
-  '''
-  url = "http://www.mininova.org/search/%s/seeds" % quote(query)
-  data = getUrlUnicode(url)
-  return _parseResultsPage(data, max_results)
+    '''search for torrents on mininova
+    '''
+    url = "http://www.mininova.org/search/%s/seeds" % quote(query)
+    data = getUrlUnicode(url)
+    return _parseResultsPage(data, max_results)
 
 def findMovieByImdb(imdbId):
-  '''find torrents on mininova for a given imdb id
-  '''
-  results = []
-  imdbId = normalizeImdbId(imdbId)
-  data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
-  return _parseResultsPage(data)
+    '''find torrents on mininova for a given imdb id
+    '''
+    results = []
+    imdbId = normalizeImdbId(imdbId)
+    data = getUrlUnicode("http://www.mininova.org/imdb/?imdb=%s" % imdbId)
+    return _parseResultsPage(data)
 
 def getId(mininovaId):
-  mininovaId = unicode(mininovaId)
-  d = findRe(mininovaId, "/(\d+)")
-  if d:
-    return d
-  mininovaId = mininovaId.split('/')
-  if len(mininovaId) == 1:
-    return mininovaId[0]
-  else:
-    return mininovaId[-1]
+    mininovaId = unicode(mininovaId)
+    d = findRe(mininovaId, "/(\d+)")
+    if d:
+        return d
+    mininovaId = mininovaId.split('/')
+    if len(mininovaId) == 1:
+        return mininovaId[0]
+    else:
+        return mininovaId[-1]
 
 def exists(mininovaId):
-  mininovaId = getId(mininovaId)
-  data = oxutils.net.getUrl("http://www.mininova.org/tor/%s" % mininovaId)
-  if not data or 'Torrent not found...' in data:
-    return False
-  if 'tracker of this torrent requires registration.' in data:
-    return False
-  return True
+    mininovaId = getId(mininovaId)
+    data = oxutils.net.getUrl("http://www.mininova.org/tor/%s" % mininovaId)
+    if not data or 'Torrent not found...' in data:
+        return False
+    if 'tracker of this torrent requires registration.' in data:
+        return False
+    return True
 
 def getData(mininovaId):
-  _key_map = {
-    'by': u'uploader',
-  }
-  mininovaId = getId(mininovaId)
-  torrent = dict()
-  torrent[u'id'] = mininovaId
-  torrent[u'domain'] = 'mininova.org'
-  torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
-  torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
-  torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
-  data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
-  if 'Torrent not found...' in data:
-    return None
-  for d in re.compile('.(.*?):(.*?)', re.DOTALL).findall(data):
-    key = d[0].lower().strip()
-    key = _key_map.get(key, key)
-    value = decodeHtml(stripTags(d[1].strip()))
-    torrent[key] = value
-  torrent[u'title'] = findRe(data, '(.*?):.*?')
-  torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
-  torrent[u'description'] = findRe(data, '(.*?)')
-  if torrent['description']:
-    torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
-  t = getUrl(torrent[u'torrent_link'])
-  torrent[u'torrent_info'] = getTorrentInfo(t)
-  return torrent
+    _key_map = {
+        'by': u'uploader',
+    }
+    mininovaId = getId(mininovaId)
+    torrent = dict()
+    torrent[u'id'] = mininovaId
+    torrent[u'domain'] = 'mininova.org'
+    torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
+    torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
+    torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
 
+    data = getUrlUnicode(torrent['comment_link']) + getUrlUnicode(torrent['details_link'])
+    if 'Torrent not found...' in data:
+        return None
 
+    for d in re.compile('.(.*?):(.*?)', re.DOTALL).findall(data):
+        key = d[0].lower().strip()
+        key = _key_map.get(key, key)
+        value = decodeHtml(stripTags(d[1].strip()))
+        torrent[key] = value
 
+    torrent[u'title'] = findRe(data, '(.*?):.*?')
+    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+    torrent[u'description'] = findRe(data, '(.*?)')
+    if torrent['description']:
+        torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
+    t = getUrl(torrent[u'torrent_link'])
+    torrent[u'torrent_info'] = getTorrentInfo(t)
+    return torrent
 
 class Mininova(Torrent):
-  '''
-  >>> Mininova('123')
-  {}
-  >>> Mininova('1072195')['infohash']
-  '72dfa59d2338e4a48c78cec9de25964cddb64104'
-  '''
-  def __init__(self, mininovaId):
-    self.data = getData(mininovaId)
-    if not self.data:
-      return
-    Torrent.__init__(self)
-    ratio = self.data['share ratio'].split(',')
-    self['seeder'] = -1
-    self['leecher'] = -1
-    if len(ratio) == 2:
-      val = intValue(ratio[0].replace(',','').strip())
-      if val:
-        self['seeder'] = int(val)
-      val = intValue(ratio[1].replace(',','').strip())
-      if val:
-        self['leecher'] = int(val)
-    val = intValue(self.data['downloads'].replace(',','').strip())
-    if val:
-      self['downloaded'] = int(val)
-    else:
-      self['downloaded'] = -1
-    published = self.data['added on']
-    published = published.split(' +')[0]
-    self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
+    '''
+    >>> Mininova('123')
+    {}
+    >>> Mininova('1072195')['infohash']
+    '72dfa59d2338e4a48c78cec9de25964cddb64104'
+    '''
+    def __init__(self, mininovaId):
+        self.data = getData(mininovaId)
+        if not self.data:
+            return
+        Torrent.__init__(self)
+        ratio = self.data['share ratio'].split(',')
+        self['seeder'] = -1
+        self['leecher'] = -1
+        if len(ratio) == 2:
+            val = intValue(ratio[0].replace(',','').strip())
+            if val:
+                self['seeder'] = int(val)
+            val = intValue(ratio[1].replace(',','').strip())
+            if val:
+                self['leecher'] = int(val)
+        val = intValue(self.data['downloads'].replace(',','').strip())
+        if val:
+            self['downloaded'] = int(val)
+        else:
+            self['downloaded'] = -1
+        published = self.data['added on']
+        published = published.split(' +')[0]
+        self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
diff --git a/ox/opensubtitles.py b/ox/opensubtitles.py
index e7fca01..abb3cee 100644
--- a/ox/opensubtitles.py
+++ b/ox/opensubtitles.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
+# vi:si:et:sw=4:sts=4:ts=4
 import re
 
 import feedparser
@@ -9,37 +8,34 @@ import oxutils
 from oxutils.lang import langCode2To3, langTo3Code
 
 def findSubtitlesByImdb(imdb, parts = 1, language = "eng"):
-  if len(language) == 2:
-    language = langCode2To3(language)
-  elif len(language) != 3:
-    language = langTo3Code(language)
-  url = "http://www.opensubtitles.org/en/search/"
-  if language:
-    url += "sublanguageid-%s/" % language
-  url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
-  data = getUrl(url)
-  if "title>opensubtitles.com - search results
+    if len(language) == 2:
+        language = langCode2To3(language)
+    elif len(language) != 3:
+        language = langTo3Code(language)
+    url = "http://www.opensubtitles.org/en/search/"
+    if language:
+        url += "sublanguageid-%s/" % language
+    url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
+    data = getUrl(url)
+    if "title>opensubtitles.com - search results
diff --git a/ox/thepiratebay.py b/ox/thepiratebay.py
-    regexp = '''(.*?).*?'''
-  for row in re.compile(regexp, re.DOTALL).findall(data):
-    torrentType = row[0]
-    torrentLink = "http://thepiratebay.org" + row[1]
-    torrentTitle = decodeHtml(row[2])
-    # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
-    if torrentType in ['201']:
-      results.append((torrentTitle, torrentLink, ''))
-    if len(results) >= max_results:
-      return results
-  next = re.compile('.*?next.gif.*?').findall(data)
-  return results
+    results = []
+    next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
+    page_count = 1
+    while next and page_count < 4:
+        page_count += 1
+        url = next[0]
+        if not url.startswith('http'):
+            if not url.startswith('/'):
+                url = "/" + url
+            url = "http://thepiratebay.org" + url
+        data = _getUrlUnicode(url)
+        regexp = '''(.*?).*?'''
diff --git a/ox/thepiratebay.py b/ox/thepiratebay.py
--- a/ox/thepiratebay.py
+++ b/ox/thepiratebay.py
 
 def findMovies(query, max_results=10):
-  results = []
-  next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
-  page_count = 1
-  while next and page_count < 4:
-    page_count += 1
-    url = next[0]
-    if not url.startswith('http'):
-      if not url.startswith('/'):
-        url = "/" + url
-      url = "http://thepiratebay.org" + url
-    data = _getUrlUnicode(url)
-    regexp = '''<td class="vertTh">.*?<a href="/browse/(.*?)".*?<td><a href="(/tor/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
-    for row in re.compile(regexp, re.DOTALL).findall(data):
-      torrentType = row[0]
-      torrentLink = "http://thepiratebay.org" + row[1]
-      torrentTitle = decodeHtml(row[2])
-      # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
-      if torrentType in ['201']:
-        results.append((torrentTitle, torrentLink, ''))
-        if len(results) >= max_results:
-          return results
-    next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
-  return results
+    results = []
+    next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
+    page_count = 1
+    while next and page_count < 4:
+        page_count += 1
+        url = next[0]
+        if not url.startswith('http'):
+            if not url.startswith('/'):
+                url = "/" + url
+            url = "http://thepiratebay.org" + url
+        data = _getUrlUnicode(url)
+        regexp = '''<td class="vertTh">.*?<a href="/browse/(.*?)".*?<td><a href="(/tor/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
+        for row in re.compile(regexp, re.DOTALL).findall(data):
+            torrentType = row[0]
+            torrentLink = "http://thepiratebay.org" + row[1]
+            torrentTitle = decodeHtml(row[2])
+            # 201 = Movies , 202 = Movie DVDR, 205 TV Shows
+            if torrentType in ['201']:
+                results.append((torrentTitle, torrentLink, ''))
+                if len(results) >= max_results:
+                    return results
+        next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
+    return results
 
 def findMovieByImdb(imdb):
-  return findMovies("tt" + normalizeImdbId(imdb))
+    return findMovies("tt" + normalizeImdbId(imdb))
 
 def getId(piratebayId):
-  if piratebayId.startswith('http://torrents.thepiratebay.org/'):
-    piratebayId = piratebayId.split('org/')[1]
-  d = findRe(piratebayId, "tor/(\d+)")
-  if d:
-    piratebayId = d
-  return piratebayId
+    if piratebayId.startswith('http://torrents.thepiratebay.org/'):
+        piratebayId = piratebayId.split('org/')[1]
+    d = findRe(piratebayId, "tor/(\d+)")
+    if d:
+        piratebayId = d
+    return piratebayId
 
 def exists(piratebayId):
-  piratebayId = getId(piratebayId)
-  return oxutils.net.exists("http://thepiratebay.org/tor/%s" % piratebayId)
+    piratebayId = getId(piratebayId)
+    return oxutils.net.exists("http://thepiratebay.org/tor/%s" % piratebayId)
 
 def getData(piratebayId):
-  _key_map = {
-    'spoken language(s)': u'language',
-    'texted language(s)': u'subtitle language',
-    'by': u'uploader',
-    'leechers': 'leecher',
-    'seeders': 'seeder',
-  }
-  piratebayId = getId(piratebayId)
-  torrent = dict()
-  torrent[u'id'] = piratebayId
-  torrent[u'domain'] = 'thepiratebay.org'
-  torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
+    _key_map = {
+        'spoken language(s)': u'language',
+        'texted language(s)': u'subtitle language',
+        'by': u'uploader',
+        'leechers': 'leecher',
+        'seeders': 'seeder',
+    }
+    piratebayId = getId(piratebayId)
+    torrent = dict()
+    torrent[u'id'] = piratebayId
+    torrent[u'domain'] = 'thepiratebay.org'
+    torrent[u'comment_link'] = 'http://thepiratebay.org/tor/%s' % piratebayId
 
-  data = _getUrlUnicode(torrent['comment_link'])
-  torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
-  if not torrent[u'title']:
-    return None
-  torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
-  torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
-  title = quote(torrent['title'].encode('utf-8'))
-  torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
-  for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
-    key = d[0].lower().strip()
-    key = _key_map.get(key, key)
-    value = decodeHtml(stripTags(d[1].strip()))
-    torrent[key] = value
-  torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
-  if torrent[u'description']:
-    torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
-  t = _getUrl(torrent[u'torrent_link'])
-  torrent[u'torrent_info'] = getTorrentInfo(t)
-  return torrent
+    data = _getUrlUnicode(torrent['comment_link'])
+    torrent[u'title'] = findRe(data, '<title>(.*?) \(download torrent\) - TPB</title>')
+    if not torrent[u'title']:
+        return None
+    torrent[u'title'] = decodeHtml(torrent[u'title']).strip()
+    torrent[u'imdbId'] = findRe(data, 'title/tt(\d{7})')
+    title = quote(torrent['title'].encode('utf-8'))
+    torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
+    for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
+        key = d[0].lower().strip()
+        key = _key_map.get(key, key)
+        value = decodeHtml(stripTags(d[1].strip()))
+        torrent[key] = value
+    torrent[u'description'] = findRe(data, '<div class="nfo">(.*?)</div>')
+    if torrent[u'description']:
+        torrent['description'] = normalizeNewlines(decodeHtml(stripTags(torrent['description']))).strip()
+    t = _getUrl(torrent[u'torrent_link'])
+    torrent[u'torrent_info'] = getTorrentInfo(t)
+    return torrent
 
 class Thepiratebay(Torrent):
-  '''
-  >>> Thepiratebay('123')
-  {}
+    '''
+    >>> Thepiratebay('123')
+    {}
 
-  >>> Thepiratebay('3951349')['infohash']
-  '4e84415d36ed7b54066160c05a0b0f061898d12b'
-  '''
-  def __init__(self, piratebayId):
-    self.data = getData(piratebayId)
-    if not self.data:
-      return
-    Torrent.__init__(self)
-    published = self.data['uploaded']
-    published = published.replace(' GMT', '').split(' +')[0]
-    self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
+    >>> Thepiratebay('3951349')['infohash']
+    '4e84415d36ed7b54066160c05a0b0f061898d12b'
+    '''
+    def __init__(self, piratebayId):
+        self.data = getData(piratebayId)
+        if not self.data:
+            return
+        Torrent.__init__(self)
+        published = self.data['uploaded']
+        published = published.replace(' GMT', '').split(' +')[0]
+        self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
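A short usage sketch for the thepiratebay module as reindented above (commentary, not patch content; the id and query are examples, and everything depends on the site markup still matching the regexps):

    from ox import thepiratebay

    # findMovies() scrapes up to three result pages, keeping only category 201 (movies)
    for title, link, _ in thepiratebay.findMovies('Man with a Movie Camera', max_results=5):
        print title, link

    # getData() accepts a bare id or a tor/ URL and returns a plain dict;
    # the Thepiratebay class wraps the same data in the normalized Torrent mapping
    t = thepiratebay.Thepiratebay('3951349')
    print t['infohash'], t['published']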
diff --git a/ox/torrent.py b/ox/torrent.py
index 785f604..51ce3c9 100644
--- a/ox/torrent.py
+++ b/ox/torrent.py
@@ -1,39 +1,37 @@
-# -*- Mode: Python; -*-
 # -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
-
+# vi:si:et:sw=4:sts=4:ts=4
 from oxutils import intValue
 
 class Torrent(dict):
-  '''
-  >>> Torrent()
-  {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
-  '''
-  _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
-                  'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
-  _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
-  _dict_keys = ('torrent_info', )
-  _list_keys = ()
-  data = {'torrent_info': {}}
+    '''
+    >>> Torrent()
+    {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
+    '''
+    _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
+                    'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
+    _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
+    _dict_keys = ('torrent_info', )
+    _list_keys = ()
+    data = {'torrent_info': {}}
 
-  def __init__(self):
-    for key in self._string_keys:
-      self[key] = self.data.get(key, u'')
-    for key in self._dict_keys:
-      self[key] = self.data.get(key, {})
-    for key in self._list_keys:
-      self[key] = self.data.get(key, [])
-    for key in self._int_keys:
-      value = self.data.get(key, -1)
-      if not isinstance(value, int):
-        value = int(intValue(value))
-      self[key] = value
-    self['infohash'] = self.data['torrent_info'].get('hash', '')
-    self['size'] = self.data['torrent_info'].get('size', -1)
-    self['announce'] = self.data['torrent_info'].get('announce', '')
-    if 'files' in self.data['torrent_info']:
-      self['files'] = len(self.data['torrent_info']['files'])
-    else:
-      self['files'] = 1
+    def __init__(self):
+        for key in self._string_keys:
+            self[key] = self.data.get(key, u'')
+        for key in self._dict_keys:
+            self[key] = self.data.get(key, {})
+        for key in self._list_keys:
+            self[key] = self.data.get(key, [])
+        for key in self._int_keys:
+            value = self.data.get(key, -1)
+            if not isinstance(value, int):
+                value = int(intValue(value))
+            self[key] = value
+        self['infohash'] = self.data['torrent_info'].get('hash', '')
+        self['size'] = self.data['torrent_info'].get('size', -1)
+        self['announce'] = self.data['torrent_info'].get('announce', '')
+        if 'files' in self.data['torrent_info']:
+            self['files'] = len(self.data['torrent_info']['files'])
+        else:
+            self['files'] = 1
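The Torrent base class is the contract that Mininova and Thepiratebay above plug into: a subclass assigns self.data and then calls Torrent.__init__, which fills in the normalized keys. A minimal sketch of that pattern, with a hypothetical Example scraper that is not part of the patch:

    from ox.torrent import Torrent

    class Example(Torrent):
        # hypothetical site scraper: only self.data needs to exist
        # before Torrent.__init__ runs
        def __init__(self):
            self.data = {
                'title': u'Example Torrent',
                'torrent_info': {'hash': 'deadbeef', 'size': 1234,
                                 'announce': 'http://tracker.example/announce'},
            }
            Torrent.__init__(self)  # fills missing keys, derives infohash/size/announce/files

    e = Example()
    print e['infohash'], e['size'], e['files']   # deadbeef 1234 1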
diff --git a/ox/wikipedia.py b/ox/wikipedia.py
index a969e24..1d969bf 100644
--- a/ox/wikipedia.py
+++ b/ox/wikipedia.py
@@ -1,72 +1,72 @@
-# -*- Mode: Python; -*-
 # -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
 from urllib import urlencode
 
 import simplejson
 from oxutils.cache import getUrl, getUrlUnicode
 from oxutils import findRe, decodeHtml
 
+
 def getMovieId(title, director='', year=''):
-  query = '"%s" film %s %s' % (title, director, year)
-  result = find(query, 1)
-  if result:
-    return result[0][1]
-  return ''
+    query = '"%s" film %s %s' % (title, director, year)
+    result = find(query, 1)
+    if result:
+        return result[0][1]
+    return ''
 
 def getUrlByImdb(imdbId):
-  query = '"imdb_id = %s"'% imdbId
-  result = find(query)
-  if result:
-    url = result[0][1]
-    return url
-  if str(imdbId).startswith('0'):
-    imdbId = imdbId[1:]
-    return getUrlByImdb(imdbId)
+    query = '"imdb_id = %s"'% imdbId
+    result = find(query)
+    if result:
+        url = result[0][1]
+        return url
+    if str(imdbId).startswith('0'):
+        imdbId = imdbId[1:]
+        return getUrlByImdb(imdbId)
 
 def getUrlByAmbId(amg_id):
-  query = '"amg_id = %s"'% amg_id
-  result = find(query)
-  if result:
-    url = result[0][1]
-    return url
-  return ''
+    query = '"amg_id = %s"'% amg_id
+    result = find(query)
+    if result:
+        url = result[0][1]
+        return url
+    return ''
 
 def getWikiData(wikipediaUrl):
-  title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
-  url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
-  html = getUrlUnicode(url)
-  data = decodeHtml(findRe(html, "<textarea.*?>(.*?)</textarea>"))
-  return data
+    title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
+    url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
+    html = getUrlUnicode(url)
+    data = decodeHtml(findRe(html, "<textarea.*?>(.*?)</textarea>"))
+    return data
 
 def getMovieData(wikipediaUrl):
-  data = getWikiData(wikipediaUrl)
-  filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
-  filmbox = {}
-  for row in filmbox_data.strip().split('|'):
-    d = row.split('=')
-    if len(d) == 2:
-      key = d[0].strip()
-      value = d[1].strip()
-      filmbox[key] = value
-  return filmbox
+    data = getWikiData(wikipediaUrl)
+    filmbox_data = findRe(data, '''\{\{Infobox Film(.*?)\}\}''')
+    filmbox = {}
+    for row in filmbox_data.strip().split('|'):
+        d = row.split('=')
+        if len(d) == 2:
+            key = d[0].strip()
+            value = d[1].strip()
+            filmbox[key] = value
+    return filmbox
 
 def getAmgId(wikipediaUrl):
-  data = getMovieData(wikipediaUrl)
-  return data.get('amg_id', '')
+    data = getMovieData(wikipediaUrl)
+    return data.get('amg_id', '')
 
 def find(query, max_results=10):
-  query = {'action': 'query', 'list':'search', 'format': 'json',
-           'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
-  url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
-  data = getUrl(url)
-  if not data:
-    data = getUrl(url, timeout=0)
-  result = simplejson.loads(data)
-  results = []
-  for r in result['query']['search']:
-    title = r['title']
-    url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
-    results.append((title, url, ''))
-  return results
+    query = {'action': 'query', 'list':'search', 'format': 'json',
+             'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
+    url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
+    data = getUrl(url)
+    if not data:
+        data = getUrl(url, timeout=0)
+    result = simplejson.loads(data)
+    results = []
+    for r in result['query']['search']:
+        title = r['title']
+        url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
+        results.append((title, url, ''))
+    return results
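One caveat worth noting while touching wikipedia.py: getMovieData() splits the infobox source on '|', so template values that themselves contain pipes (wiki links, nested templates) will come back truncated. Usage is otherwise straightforward; the sketch below is commentary rather than patch content, and the IMDb id is only an example:

    from ox import wikipedia

    url = wikipedia.getUrlByImdb('0133093')    # retries with a leading 0 stripped
    if url:
        filmbox = wikipedia.getMovieData(url)  # flat dict from {{Infobox Film ...}}
        print filmbox.get('director', '')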
diff --git a/ox/youtube.py b/ox/youtube.py
index f1efcb3..c17ebc3 100644
--- a/ox/youtube.py
+++ b/ox/youtube.py
@@ -1,6 +1,5 @@
-# -*- Mode: Python; -*-
 # -*- coding: utf-8 -*-
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
 from urllib import quote
 import xml.etree.ElementTree as ET
 
@@ -8,49 +7,50 @@ import feedparser
 from oxutils.cache import getUrl
 from oxutils import findString
 
+
 def getVideoUrl(youtubeId, format='mp4'):
-  url = 'http://www.youtube.com/api2_rest?method=youtube.videos.get_video_token&video_id=' + youtubeId
-  data = getUrl(url)
-  xml = ET.fromstring(data)
-  youtubeKey = xml.find('t').text
-  if format == 'mp4':
-    fmt=18
-    url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s"%(youtubeId, youtubeKey, fmt)
-  else:
-    url = "http://youtube.com/get_video.php?video_id=%s&t=%s"%(youtubeId, youtubeKey)
-  return url
+    url = 'http://www.youtube.com/api2_rest?method=youtube.videos.get_video_token&video_id=' + youtubeId
+    data = getUrl(url)
+    xml = ET.fromstring(data)
+    youtubeKey = xml.find('t').text
+    if format == 'mp4':
+        fmt=18
+        url = "http://youtube.com/get_video.php?video_id=%s&t=%s&fmt=%s"%(youtubeId, youtubeKey, fmt)
+    else:
+        url = "http://youtube.com/get_video.php?video_id=%s&t=%s"%(youtubeId, youtubeKey)
+    return url
 
 def getMovieInfo(youtubeId):
-  url = "http://gdata.youtube.com/feeds/api/videos/%s " % youtubeId
-  data = getUrl(url)
-  fd = feedparser.parse(data)
-  return getInfoFromAtom(fd.entries[0])
+    url = "http://gdata.youtube.com/feeds/api/videos/%s " % youtubeId
+    data = getUrl(url)
+    fd = feedparser.parse(data)
+    return getInfoFromAtom(fd.entries[0])
 
 def getInfoFromAtom(entry):
-  info = dict()
-  info['title'] = entry['title']
-  info['description'] = entry['description']
-  info['author'] = entry['author']
-  info['published'] = entry['published_parsed']
-  info['keywords'] = entry['media_keywords'].split(', ')
-  info['url'] = entry['links'][0]['href']
-  info['id'] = findString(info['url'], "/watch?v=")
-  info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
-  info['flv'] = getVideoUrl(info['id'], 'flv')
-  info['mp4'] = getVideoUrl(info['id'], 'mp4')
-  info['embed'] = '''<object width="425" height="350"><param name="movie" value="http://www.youtube.com/v/%s"></param><embed src="http://www.youtube.com/v/%s" type="application/x-shockwave-flash" width="425" height="350"></embed></object>''' % (info['id'], info['id'])
-  return info
+    info = dict()
+    info['title'] = entry['title']
+    info['description'] = entry['description']
+    info['author'] = entry['author']
+    info['published'] = entry['published_parsed']
+    info['keywords'] = entry['media_keywords'].split(', ')
+    info['url'] = entry['links'][0]['href']
+    info['id'] = findString(info['url'], "/watch?v=")
+    info['thumbnail'] = "http://img.youtube.com/vi/%s/0.jpg" % info['id']
+    info['flv'] = getVideoUrl(info['id'], 'flv')
+    info['mp4'] = getVideoUrl(info['id'], 'mp4')
+    info['embed'] = '''<object width="425" height="350"><param name="movie" value="http://www.youtube.com/v/%s"></param><embed src="http://www.youtube.com/v/%s" type="application/x-shockwave-flash" width="425" height="350"></embed></object>''' % (info['id'], info['id'])
+    return info
 
 def find(query, max_results=10, offset=1, orderBy='relevance'):
-  query = quote(query)
-  url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s"%(query, orderBy, offset, max_results)
-  data = getUrl(url)
-  fd = feedparser.parse(data)
-  videos = []
-  for entry in fd.entries:
-    v = getInfoFromAtom(entry)
-    videos.append(v)
-    if len(videos) >= max_results:
-      return videos
-  return videos
+    query = quote(query)
+    url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s"%(query, orderBy, offset, max_results)
+    data = getUrl(url)
+    fd = feedparser.parse(data)
+    videos = []
+    for entry in fd.entries:
+        v = getInfoFromAtom(entry)
+        videos.append(v)
+        if len(videos) >= max_results:
+            return videos
+    return videos
diff --git a/setup.py b/setup.py
index 4840537..e8a5096 100644
--- a/setup.py
+++ b/setup.py
@@ -1,33 +1,33 @@
 #!/usr/bin/env python
-# vi:si:et:sw=2:sts=2:ts=2
+# vi:si:et:sw=4:sts=4:ts=4
 # encoding: utf-8
 from setuptools import setup, find_packages
 import os
 
 setup(
-  name="ox",
-  version="0.1",
-  description="collection of scrapers for various websites",
-  author="0x",
-  author_email="code@0xdb.org",
-  url="http://code.0xdb.org/ox",
-  download_url="http://code.0xdb.org/ox/download",
-  license="GPLv3",
-  packages=find_packages(),
-  zip_safe=False,
-  install_requires=[
-    'oxutils',
-    'feedparser',
-    'beautifulsoup',
-  ],
-  keywords = [
-  ],
-  classifiers = [
-    'Development Status :: 3 - Alpha',
-    'Operating System :: OS Independent',
-    'Programming Language :: Python',
-    'Topic :: Software Development :: Libraries :: Python Modules',
-  ],
-  )
+    name="ox",
+    version="0.1",
+    description="collection of scrapers for various websites",
+    author="0x",
+    author_email="code@0xdb.org",
+    url="http://code.0xdb.org/ox",
+    download_url="http://code.0xdb.org/ox/download",
+    license="GPLv3",
+    packages=find_packages(),
+    zip_safe=False,
+    install_requires=[
+        'oxutils',
+        'feedparser',
+        'beautifulsoup',
+    ],
+    keywords = [
+    ],
+    classifiers = [
+        'Development Status :: 3 - Alpha',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+    ],
+)
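Closing out, a sketch of how the youtube module above fits together (commentary, not patch content): find() walks the GData feed, and getInfoFromAtom() resolves every entry to flv and mp4 URLs through the get_video_token call in getVideoUrl(), so each hit costs extra requests; fmt=18 selects the MP4 variant. The query and video id below are examples only.

    from ox import youtube

    for video in youtube.find('jean-luc godard', max_results=3):
        print video['title'], video['mp4']

    info = youtube.getMovieInfo('oHg5SJYRHA0')
    print info['thumbnail']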