# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
"""Look up film articles on the English Wikipedia and read their infobox."""
from urllib import urlencode

import simplejson
from oxlib.cache import getUrl, getUrlUnicode
from oxlib import findRe, decodeHtml


def getMovieId(title, director='', year=''):
    """Return the article URL of the best search hit for a film.

    The search query is the quoted ``title`` followed by the word ``film``
    and the optional ``director`` and ``year``.  Despite the name, the value
    returned is the URL element of the first result from ``find``; an empty
    string is returned when the search yields nothing.
    """
    query = '"%s" film %s %s' % (title, director, year)
    result = find(query, 1)
    if result:
        return result[0][1]
    return ''


def getUrlByImdb(imdbId):
    """Return the URL of the article whose text contains ``imdb_id = <id>``.

    If nothing is found and the id starts with ``0``, the search is repeated
    with one leading zero removed at a time.  Returns an empty string when
    no variant of the id matches, consistent with the other lookups here.
    """
    while True:
        result = find('"imdb_id = %s"' % imdbId)
        if result:
            return result[0][1]
        if not str(imdbId).startswith('0'):
            return ''
        # Articles may store the id without zero padding; retry shorter.
        imdbId = imdbId[1:]


def getUrlByAmbId(amg_id):
    """Return the URL of the article whose text contains ``amg_id = <id>``.

    Returns an empty string when the search yields nothing.
    NOTE(review): the function name looks like a typo for "AmgId"; it is
    kept unchanged because callers may depend on it.
    """
    result = find('"amg_id = %s"' % amg_id)
    if result:
        return result[0][1]
    return ''


def getWikiData(wikipediaUrl):
    """Return the decoded wiki markup of an English Wikipedia article.

    ``wikipediaUrl`` is expected to be of the form
    ``http://en.wikipedia.org/wiki/<title>``; the prefix is stripped and the
    article's ``action=edit`` page is fetched to read the raw markup.
    """
    title = wikipediaUrl.replace('http://en.wikipedia.org/wiki/', '')
    url = "http://en.wikipedia.org/w/index.php?title=%s&action=edit" % title
    html = getUrlUnicode(url)
    # The edit page carries the markup inside its <textarea>.  A bare
    # "(.*?)" pattern can only ever capture the empty string.
    # NOTE(review): assumes findRe lets '.' match newlines - confirm.
    data = decodeHtml(findRe(html, "<textarea.*?>(.*?)</textarea>"))
    return data


def getMovieData(wikipediaUrl):
    """Return the article's ``Infobox Film`` fields as a dict.

    The text between ``{{Infobox Film`` and the first ``}}`` is split on
    ``|`` and each ``key = value`` row is stored with both sides stripped.
    Rows without ``=`` are ignored.  Splitting on ``|`` also splits piped
    wiki links, so such values may be truncated.
    """
    data = getWikiData(wikipediaUrl)
    filmbox_data = findRe(data, r'''\{\{Infobox.Film(.*?)\}\}''')
    filmbox = {}
    for row in filmbox_data.strip().split('|'):
        # Split on the first '=' only so values containing '=' are kept.
        d = row.split('=', 1)
        if len(d) == 2:
            filmbox[d[0].strip()] = d[1].strip()
    return filmbox


def getImageUrl(name):
    """Return the first upload.wikimedia.org URL found on an image page.

    ``name`` is appended to ``http://en.wikipedia.org/wiki/Image:`` to build
    the page address.
    """
    data = getUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name)
    return findRe(data, '="(http://upload.wikimedia.org/.*?)"')


def getMoviePoster(wikipediaUrl):
    """Return the URL of the infobox ``image`` of an article, or ''."""
    data = getMovieData(wikipediaUrl)
    if 'image' in data:
        return getImageUrl(data['image'])
    return ''


def getAmgId(wikipediaUrl):
    """Return the infobox ``amg_id`` value of an article, or ''."""
    return getMovieData(wikipediaUrl).get('amg_id', '')


def find(query, max_results=10):
    """Full-text search the English Wikipedia through its ``api.php``.

    Returns a list of ``(title, url, '')`` tuples, one per search hit, with
    at most ``max_results`` entries requested from the API.  The url is
    built from the title with spaces replaced by underscores.
    """
    # Encode unicode text only: calling .encode() on a Python 2 byte string
    # that holds non-ASCII bytes raises UnicodeDecodeError.
    if isinstance(query, type(u'')):
        query = query.encode('utf-8')
    params = {
        'action': 'query',
        'list': 'search',
        'format': 'json',
        'srlimit': max_results,
        'srwhat': 'text',
        'srsearch': query,
    }
    url = "http://en.wikipedia.org/w/api.php?" + urlencode(params)
    data = getUrl(url)
    if not data:
        # Empty response: ask again with timeout=0.
        # NOTE(review): presumably this bypasses oxlib's cache - confirm.
        data = getUrl(url, timeout=0)
    result = simplejson.loads(data)
    results = []
    for r in result['query']['search']:
        title = r['title']
        url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
        results.append((title, url, ''))
    return results