commit bbe4542bd23fcf3f5a31a7b4efe0489f30330952 Author: j Date: Mon Apr 28 11:52:21 2008 +0200 lets start with google and imdb diff --git a/ox/__init__.py b/ox/__init__.py new file mode 100644 index 0000000..e3c36ff --- /dev/null +++ b/ox/__init__.py @@ -0,0 +1,8 @@ +# -*- Mode: Python; -*- +# vi:si:et:sw=2:sts=2:ts=2 +# encoding: utf-8 + +__version__ = '0.1.0' + +from net import * + diff --git a/ox/google.py b/ox/google.py new file mode 100644 index 0000000..966da87 --- /dev/null +++ b/ox/google.py @@ -0,0 +1,187 @@ +# -*- Mode: Python; -*- +# -*- coding: utf-8 -*- +# vi:si:et:sw=2:sts=2:ts=2 +import re +import time +import urllib +import urllib2 +import weakref +import threading +import Queue + +import oxutils +from oxutils import stripTags + + +''' +FIXME this function should be replaced by something more minimal find function +usage: +import google +google.find(query) + + +for result in google.find(query): result + +result is title, url, description + +google.find(query, max_results) + +''' +DEFAULT_MAX_RESULTS = 10 + +def getUrl(url, data=None, headers=oxutils.net.DEFAULT_HEADERS): + google_timeout=24*60*60 + return oxutils.cache.getUrl(url, data, headers, google_timeout) + +def quote_plus(s): + return urllib.quote_plus(s.encode('utf-8')) + +def get_search_page_links(page, results_per_page, begin, end, link_re): + """ + Given str contents of search result page, return list of links. + + Returns list of (name, url, desc) str tuples. See make_searcher() + for a description of results_per_page and link_re. + """ + if begin is not None and begin in page: + page = page[page.index(begin):] + if end is not None and end in page: + page = page[:page.index(end)] + ans = [] + for match in re.compile(link_re, re.DOTALL).finditer(page): + (name, url, desc) = match.group('name', 'url', 'desc') + ans += [(stripTags(name), url, stripTags(desc))] + return ans + + +def nonblocking(f, blocking_return=None, sleep_time=0.01): + """ + Wrap a callable which returns an iter so that it no longer blocks. + + The wrapped iterator returns blocking_return while callable f is + blocking. The callable f is called in a background thread. If the + wrapped iterator is deleted, then the iterator returned by f is + deleted also and the background thread is terminated. + """ + def g(*args, **kwargs): + f_iter = f(*args, **kwargs) + g_iter = None + def run(): + while True: + g_obj = g_iter() + if g_obj is None: + return + if g_obj.q.qsize() == 0: + try: + f_next = f_iter.next() + except Exception, e: + g_obj.exc = e + return + g_obj.q.put(f_next) + else: + del g_obj + time.sleep(sleep_time) + class Iter: + def __init__(self): + self.q = Queue.Queue() + self.exc = None + self.thread = threading.Thread(target=run) + self.thread.setDaemon(True) + def next(self): + if self.exc is not None: + raise self.exc + try: + return self.q.get_nowait() + except Queue.Empty: + return blocking_return + def __iter__(self): + return self + + obj = Iter() + g_iter = weakref.ref(obj) + obj.thread.start() + try: + return obj + finally: + del obj + return g +def make_searcher(query_url, results_per_page, page_url, page_mode, + begin, end, link_re): + """ + Return a search function for the given search engine. + + Here query_url is the URL for the initial search, with %(q)s for + the query string, results_per_page is the number of search results + per page, page_url is the URL for the 2nd and subsequent pages of + search results, with %(q)s for the query string and %(n)s for the + page "number." 
Here page_mode controls the actual value for the + page "number:" + + - page_mode='page0': Use 0-based index of the page. + - page_mode='page1': Use 1-based index of the page. + - page_mode='offset0': Use 0-based index of the search result, + which is a multiple of results_per_page. + - page_mode='offset1': Use 1-based index of the search result + (one plus a multiple of results_per_page). + + If begin is not None, then only text after the first occurrence of + begin will be used in the search results page. If end is not None, + then only text before the first occurrence of end will be used. + + Finally, link_re is a regex string (see module re) which matches + three named groups: 'name', 'url', and 'desc'. These correspond to + the name, URL and description of each search result. The regex is + applied in re.DOTALL mode. + + Returns a search() function which has the same interface as + described in the module docstring. + """ + def search_blocking(query, max_results): + last_links = None + page_num = 0 + q = Queue.Queue() + for i in range(max_results): + if q.qsize() == 0: + if page_num == 0: + page = getUrl(query_url % {'q': quote_plus(query)}) + else: + if page_mode == 'page0': + n = page_num + elif page_mode == 'page1': + n = page_num + 1 + elif page_mode == 'offset0': + n = page_num * results_per_page + elif page_mode == 'offset1': + n = page_num * results_per_page + 1 + else: + raise ValueError('unknown page mode') + page = getUrl(page_url % {'n': n, 'q': quote_plus(query)}) + page_num += 1 + links = get_search_page_links(page, results_per_page, begin, end, link_re) + if len(links) == 0 or links == last_links: + break + last_links = links + for link in links: + q.put(link) + yield q.get() + + search_nonblocking = nonblocking(search_blocking) + + def search(query, max_results=DEFAULT_MAX_RESULTS, blocking=True): + """ + See docstring for web_search module. + """ + if blocking: + return search_blocking(query, max_results) + else: + return search_nonblocking(query, max_results) + + return search + +find = make_searcher('http://www.google.com/search?q=%(q)s', 10, + 'http://www.google.com/search?start=%(n)d&q=%(q)s', 'offset0', + None, None, + r'(?P.*?)' + + r'.*?(?:
|)' + + r'(?P.*?)' + '(?:|(.*?)').findall(value) + if m: + parsed_value = m[0][0] + else: + parsed_value = '' + elif key == 'also_known_as': + parsed_value = '' + m = re.compile('(.*) \(International: English title').findall(value) + if m: + parsed_value = m[0] + else: + m = re.compile('(.*) \(USA').findall(value) + if m: + parsed_value = m[0] + parsed_value = parsed_value.split('
')[-1].split('(')[0] + director = self.parseCredits().get('director', None) + if director: + director = director[0] + parsed_value = parsed_value.replace(director, '') + if parsed_value.startswith("'s"): + parsed_value = parsed_value[2:].strip() + parsed_value = parsed_value.strip() + else: + print value + parsed_value = value + return parsed_value + + def parseTitle(self): + title = '' + data = self.getPage() + soup = BeautifulSoup(data) + html_title = soup('div', {'id': 'tn15title'}) + if not html_title: + html_title = soup('title') + if html_title: + html_title = str(html_title[0]) + html_title = html_title.replace('
', ' ').replace(' ', ' ') + title = stripTags(html_title) + title = re.sub('\(\d\d\d\d\)', '', title) + title = re.sub('\(\d\d\d\d/I*\)', '', title) + for t in ('TV series', 'TV-Series', 'TV mini-series', '(mini)', '(VG)', '(V)', '(TV)'): + title = title.replace(t, '') + if title.find(u'\xa0') > -1: + title = title[:title.find(u'\xa0')] + title = normalizeTitle(title.strip()) + if title.startswith('"') and title.endswith('"'): + title = normalizeTitle(title[1:-1]) + elif title.startswith('"') and title.find('"',1) > 0 and \ + title.find('"',1) == title.rfind('"'): + se = re.compile("Season (\d*), Episode (\d*)\)").findall(data) + if se: + se = se[0] + se = ' (S%02dE%02d)' % (int(se[0]), int(se[1])) + title = normalizeTitle(title[1:title.rfind('"')]) + se + title[title.rfind('"')+1:] + else: + title = normalizeTitle(title[1:title.rfind('"')]) + ':' + title[title.rfind('"')+1:] + return normalizeTitle(title) + + def parseYear(self): + year = '' + data = self.getPage() + soup = BeautifulSoup(data) + html_title = soup('div', {'id': 'tn15title'}) + if not html_title: + html_title = soup('title') + if html_title: + html_title = str(html_title[0]) + html_title = stripTags(html_title) + year = re.compile('\((\d\d\d\d)\)').findall(html_title) + if not year: + year = re.compile('\((\d\d\d\d)/').findall(html_title) + if year: + year = year[0] + else: year = '' + return year + + def parse(self): + data = self.getPage() + IMDbDict ={} + #Poster + IMDbDict['poster'] = _getTerm(data, 'name="poster".*?(.*?)/10', re.IGNORECASE).search(data) + if m: + IMDbDict['rating'] = int(float(m.group(1)) * 1000) + else: + IMDbDict['rating'] = -1 + #Votes + m = re.compile('\((.*?) votes\)', re.IGNORECASE).findall(data) + if m: + IMDbDict['votes'] = int(m[0].replace(',', '')) + else: + IMDbDict['votes'] = -1 + + data = data.replace('\n',' ') + #some values + keys = ('runtime', 'language', 'genre', 'country', 'tagline', 'plot_outline', 'tv_series', 'also_known_as') + for key in keys: + IMDbDict[key] = '' + IMDbDict['runtime'] = 0 + soup = BeautifulSoup(data) + for info in soup('div', {'class': 'info'}): + key = str(info).split('')[0].split('
') + if len(key) > 1: + raw_value = str(info).split('
')[1] + key = key[1][:-1].lower().replace(' ', '_') + if key in keys: + IMDbDict[key] = self.parse_raw_value(key, raw_value) + IMDbDict['title_english'] = IMDbDict.pop('also_known_as', IMDbDict['title']) + #is episode + IMDbDict['episode_of'] = IMDbDict.pop('tv_series', '') + + IMDbDict['episodes'] = self.parseEpisodes() + if IMDbDict['episodes']: + IMDbDict['tvshow'] = True + else: + IMDbDict['tvshow'] = False + IMDbDict['credits'] = self.parseCredits() + IMDbDict['plot'] = self.parsePlot() + IMDbDict['keywords'] = self.parseKeywords() + + IMDbDict['trivia'] = self.parseTrivia() + IMDbDict['connections'] = self.parseConnections() + IMDbDict['locations'] = self.parseLocations() + IMDbDict['release_date'] = self.parseReleaseinfo() + IMDbDict['business'] = self.parseBusiness() + IMDbDict['reviews'] = self.parseExternalreviews() + IMDbDict['stills'] = getMovieStills(self.imdb) + #IMDbDict['trailer'] = self.parseTrailer() + self.IMDbDict = IMDbDict + + if IMDbDict['episode_of']: + episode_of =IMDb(IMDbDict['episode_of']).parse() + for key in ('country', 'language'): + if not IMDbDict[key]: + IMDbDict[key] = episode_of[key] + return self.IMDbDict + + def getCredits(self, forcereload = False): + if forcereload or not self.creditsSource: + self.creditsSource = getUrlUnicode(self.creditsUrl) + return self.creditsSource + + def parseCredits(self): + data = self.getCredits() + credits = {} + credits['director'] = _castList(data, 'Directed by.*?(.*?)') + credits['writer'] = _castList(data, 'Writing credits.*?(.*?)') + credits['producer'] = _castList(data, 'Produced by.*?(.*?)') + #credits['cast'] = _castList(data, 'Cast.*?(') + credits['cast'] = [] + soup = re.compile('Cast.*?(').findall(data) + soup = BeautifulSoup(data) + cast = soup('table', {'class': 'cast'}) + if cast: + cast = str(cast[0]).replace(u'\xa0', ' ') + names = re.compile('(.*?).*?(.*?)').findall(cast) + for name in names: + real_name = name[0] + role_name = name[1] + if role_name: + role_name = role_name.split('(')[0].replace('/ ...','') + credits['cast'].append((stripTags(real_name), stripTags(role_name))) + self.credits = credits + return self.credits + + def getPlot(self, forcereload = False): + if forcereload or not self.plotSource: + self.plotSource = getUrlUnicode(self.plotUrl) + return self.plotSource + + def parsePlot(self): + soup = BeautifulSoup(self.getPlot()) + plot = soup('p', {'class':'plotpar'}) + if plot: + plot = unicode(plot[0]).split('')[0] + else: + plot = u'' + plot = stripTags(plot).strip() + self.plot = plot + return plot + + def getEpisodes(self, forcereload = False): + if forcereload or not self.episodesSource: + self.episodesSource = getUrlUnicode(self.episodesUrl) + return self.episodesSource + + def parseEpisodes(self): + episodes = {} + cdata = self.getEpisodes().replace('\r\n', ' ') + regexp = r'''

Season (.*?), Episode (.*?): (.*?)

(.*?)
(.*?)
''' + reg = re.compile(regexp, re.IGNORECASE) + m = reg.findall(cdata) + for match in m: + try: + episode = "S%02dE%02d" % (int(match[0]), int(match[1])) + episodes[episode] = {} + episodes[episode]['imdb'] = match[2] + episodes[episode]['title'] = match[3].strip() + if episodes[episode]['title'].startswith('Episode #%d'%int(match[0])): + episodes[episode]['title'] = u'' + description = htmldecode(match[5]) + description = stripTags(description.split('Next US airings:')[0]) + episodes[episode]['description'] = description + episodes[episode]['date'] = '' + try: + d = stripTags(match[4]) + d = d.replace('Original Air Date: ', '') + d = time.strftime("%Y-%m-%d", time.strptime(d, '%d %B %Y')) + episodes[episode]['date'] = d + except: + pass + except: + import traceback + print traceback.print_exc() + pass + self.episodes = episodes + return self.episodes + + def getLocations(self, forcereload = False): + if forcereload or not self.locationSource: + self.keywordSource = getUrlUnicode(self.locationUrl) + return self.keywordSource + + def parseLocations(self): + soup = BeautifulSoup(self.getLocations()) + locations = [] + for key in soup('a', {'href': re.compile('^/List')}): + locations.append(htmldecode(key.string)) + self.locations = locations + return self.locations + + def getKeywords(self, forcereload = False): + if forcereload or not self.keywordSource: + self.keywordSource = getUrlUnicode(self.keywordUrl) + return self.keywordSource + + def parseKeywords(self): + soup = BeautifulSoup(self.getKeywords()) + keywords = [] + for key in soup('a', {'href': re.compile('^/keyword/')}): + k = htmldecode(key.string) + k = k.replace(u'\xa0', ' ') + keywords.append(k) + self.keywords = keywords + return self.keywords + + def getTrivia(self, forcereload = False): + if forcereload or not self.triviaSource: + self.triviaSource = getUrlUnicode(self.triviaUrl) + return self.triviaSource + + def parseTrivia(self): + trivia = [] + soup = BeautifulSoup(self.getTrivia()) + triviaList = [] + for i in soup('ul', {'class': "trivia"}): + for t in i('li'): + t = str(t).replace('
', '').strip() if t.startswith('<li>') and t.endswith('</li>'): t = t[4:-5].strip() trivia.append(t) self.trivia = trivia return self.trivia def getConnections(self, forcereload = False): if forcereload or not self.connectionsSource: self.connectionsSource = getUrlUnicode(self.connectionsUrl) return self.connectionsSource def parseConnections(self): connections = {} soup = BeautifulSoup(self.getConnections()) content = soup('div', {'id': 'tn15content'})[0] blocks = str(content).split('
    ')[1:] + for c in blocks: + connection = c.split('
    ')[0] + cs = BeautifulSoup(c) + if connection: + #relation -> list of imdb ids + connections[connection] = [a.get('href')[-8:-1] for a in cs('a', {'href': re.compile('/title/tt')})] + return connections + + def getReleaseinfo(self, forcereload = False): + if forcereload or not self.releaseinfoSource: + self.releaseinfoSource = getUrlUnicode(self.releaseinfoUrl) + return self.releaseinfoSource + + def parseReleaseinfo(self): + soup = BeautifulSoup(self.getReleaseinfo()) + info = soup('table',{'border': '0', 'cellpadding':'2'}) + if info: + for row in info[0]('tr'): + d = row('td', {'align':'right'}) + if d: + try: + possible_date = stripTags(str(d[0])).strip() + rdate = time.strptime(possible_date, "%d %B %Y") + rdate = time.strftime('%Y-%m-%d', rdate) + return rdate + except: + pass + return None + + def getBusiness(self, forcereload = False): + if forcereload or not self.businessSource: + self.businessSource = getUrlUnicode(self.businessUrl) + return self.businessSource + + def parseBusiness(self): + soup = BeautifulSoup(self.getBusiness()) + business = {'budget': 0, 'gross': 0, 'profit': 0} + content = soup('div', {'id': 'tn15content'})[0] + blocks = str(content).split('
    ')[1:] + for c in blocks: + cs = BeautifulSoup(c) + line = c.split('
    ') + if line: + title = line[0] + line = line[1] + if title in ['Budget', 'Gross']: + values = re.compile('\$(.*?) ').findall(line) + values = [int(value.replace(',','')) for value in values] + if values: + business[title.lower()] = max(values) + if business['budget'] and business['gross']: + business['profit'] = business['gross'] - business['budget'] + return business + + def getExternalreviews(self, forcereload = False): + if forcereload or not self.externalreviewsSource: + self.externalreviewsSource = getUrlUnicode(self.externalreviewsUrl) + return self.externalreviewsSource + + def parseExternalreviews(self): + soup = BeautifulSoup(self.getExternalreviews()) + ol = soup('ol') + if ol: + ol = ol[0] + ret = {} + for li in ol('li'): + try: + a = li('a')[0] + href = a.get('href') + txt = a.contents[0] + ret[href] = txt + except: + pass + return ret + return {} + + def getTrailer(self, forcereload = False): + if forcereload or not self.trailerSource: + self.trailerSource = getUrlUnicode(self.trailerUrl) + return self.trailerSource + + def parseTrailer(self): + ret = {} + soup = BeautifulSoup(self.getTrailer()) + for p in soup('p'): + if p('a') and p.firstText(): + a = p('a')[0] + href = a['href'] + if href and href.startswith('http'): + title = a.string + title = title.replace('www.', '') + ret[href] = title + return ret + +def guess(title, director=''): + #FIXME: proper file -> title + title = title.split('-')[0] + title = title.split('(')[0] + title = title.split('.')[0] + title = title.strip() + imdb_url = 'http://www.imdb.com/find?q=%s' % quote(title.encode('utf-8')) + return_url = '' + + #lest first try google + #i.e. site:imdb.com Michael Stevens Sin + if director: + search = 'site:imdb.com %s "%s"' % (director, title) + else: + search = 'site:imdb.com "%s"' % title + for (name, url, desc) in google.find(search, 2): + if url.startswith('http://www.imdb.com/title/tt'): + return url[28:35] + + try: + req = urllib2.Request(imdb_url, None, oxutils.net.DEFAULT_HEADERS) + u = urllib2.urlopen(req) + data = u.read() + return_url = u.url + u.close() + except: + return None + if return_url.startswith('http://www.imdb.com/title/tt'): + return return_url[28:35] + if data: + imdb_id = _getTerm(data.replace('\n', ' '), 'Popular Results.*?
    1. .*? int(s[1]): + stills.append("http://i.imdb.com/Photos/Ss/%s/%s.jpg" % (id, s[2])) + if not stills: + s_ = re.compile(''' int(s[1]): + stills.append("http://%sf.jpg" % s[2]) + return stills + +if __name__ == '__main__': + import sys + #print parse(sys.argv[1]) + print "imdb:", guess(sys.argv[1]) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4877b36 --- /dev/null +++ b/setup.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# vi:si:et:sw=2:sts=2:ts=2 +# encoding: utf-8 +from setuptools import setup, find_packages + +import os + +setup( + name="ox", + version="0.1", + + # uncomment the following lines if you fill them out in release.py + description="collection of scrapers for various websites", + author="bot", + author_email="bot@0xdb.org", + url="http://ox.0xdb.org", + download_url="http://ox.0xdb.org/download", + license="GPL", + packages=find_packages(), + zip_safe=False, + keywords = [ + ], + classifiers = [ + 'Development Status :: 3 - Alpha', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Topic :: Software Development :: Libraries :: Python Modules', + ], + ) +
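
A few sketches of the techniques this commit relies on, for reference.

google.py fetches every result page through oxutils.cache.getUrl with google_timeout=24*60*60, so a page fetched once is reused for a day instead of hitting Google again. oxutils.cache itself is not part of this commit; a minimal in-memory stand-in for that behaviour might look like the following (cached_get and _page_cache are illustrative names only, and the data/headers parameters of the real call are omitted):

import time
try:
    from urllib.request import urlopen    # Python 3
except ImportError:
    from urllib2 import urlopen           # Python 2, as used in this commit

_page_cache = {}

def cached_get(url, timeout=24 * 60 * 60):
    # Return a copy of `url` fetched within the last `timeout` seconds,
    # otherwise fetch it again and remember when we did.
    now = time.time()
    cached = _page_cache.get(url)
    if cached is not None and now - cached[0] < timeout:
        return cached[1]
    data = urlopen(url).read()
    _page_cache[url] = (now, data)
    return data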
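The nonblocking() helper runs the blocking search generator in a daemon thread and hands results over through a Queue, so next() can return blocking_return instead of waiting on the network; a weakref to the consumer lets the thread stop once the iterator is garbage-collected. A stripped-down sketch of the same producer/consumer idea, without the weakref or error-propagation bookkeeping (poll_iter and slow_numbers are illustrative names, not from the commit):

import threading
import time
try:
    import queue               # Python 3
except ImportError:
    import Queue as queue      # Python 2, as used in this commit

def poll_iter(gen, blocking_return=None, sleep_time=0.01):
    # Drain the blocking generator `gen` in a daemon thread; yield
    # blocking_return whenever no result is ready yet, so the caller
    # never blocks on network I/O.
    results = queue.Queue()
    finished = []

    def pump():
        for item in gen:
            results.put(item)
        finished.append(True)

    worker = threading.Thread(target=pump)
    worker.daemon = True
    worker.start()

    while not (finished and results.empty()):
        try:
            yield results.get_nowait()
        except queue.Empty:
            yield blocking_return
            time.sleep(sleep_time)

# Example: the consumer keeps polling and only prints real results.
def slow_numbers():
    for i in range(3):
        time.sleep(0.1)
        yield i

for item in poll_iter(slow_numbers()):
    if item is not None:
        print(item)            # prints 0, 1 and 2 as they become available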
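The make_searcher docstring distinguishes four page_mode values that control what gets substituted for %(n)s in page_url from the second results page on. The arithmetic, isolated into a tiny helper (page_param is an illustrative name; the commit keeps this logic inline in search_blocking):

def page_param(page_mode, page_num, results_per_page):
    # Value substituted for %(n)s when requesting the 2nd and later pages.
    # page_num is 1 for the second page, 2 for the third, and so on,
    # matching the counter in search_blocking.
    if page_mode == 'page0':        # 0-based page index
        return page_num
    elif page_mode == 'page1':      # 1-based page index
        return page_num + 1
    elif page_mode == 'offset0':    # 0-based result offset
        return page_num * results_per_page
    elif page_mode == 'offset1':    # 1-based result offset
        return page_num * results_per_page + 1
    raise ValueError('unknown page mode')

# Google is registered with page_mode='offset0' and 10 results per page,
# so the second page of results is requested with start=10:
assert page_param('offset0', 1, 10) == 10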
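guess() in imdb.py locates a title by running a site:imdb.com Google query and slicing the first matching URL with url[28:35], which is exactly the seven digits after /title/tt. The same extraction with a regex, shown only as an illustrative alternative to the fixed slice (extract_imdb_id is not a function in the commit, and the \d{7} pattern assumes the seven-digit ids IMDb used at the time):

import re

# Illustrative alternative to the fixed slice; not part of the commit.
_IMDB_ID = re.compile(r'imdb\.com/title/tt(\d{7})')

def extract_imdb_id(url):
    match = _IMDB_ID.search(url)
    return match.group(1) if match else None

assert extract_imdb_id('http://www.imdb.com/title/tt0120737/') == '0120737'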