diff --git a/oxweb/criterion.py b/oxweb/criterion.py index ea983e4..a5074be 100644 --- a/oxweb/criterion.py +++ b/oxweb/criterion.py @@ -2,13 +2,18 @@ # vi:si:et:sw=4:sts=4:ts=4 import re +import oxlib.cache from oxlib.cache import getUrlUnicode from oxlib.html import stripTags -from oxlib.net import getUrl from oxlib.text import findRe, removeSpecialCharacters import imdb +def getId(url): + return url.split("/")[-1] + +def getUrl(id): + return "http://www.criterion.com/films/%s" % id def getData(id): ''' @@ -27,7 +32,7 @@ def getData(id): try: html = getUrlUnicode(data["url"]) except: - html = getUrl(data["url"]) + html = oxlib.cache.getUrl(data["url"]) data["number"] = findRe(html, "

(.*?)

") data["title"] = findRe(html, "

(.*?)

") data["director"] = findRe(html, "

(.*?)

") @@ -57,9 +62,6 @@ def getData(id): data['imdbId'] = imdb.getMovieId(data['title'], data['director'], data['year']) return data -def getId(url): - return url.split("/")[-1] - def getIds(): ids = [] html = getUrlUnicode("http://www.criterion.com/library/dvd") @@ -84,8 +86,5 @@ def getIdsByPage(page): ids.append(result) return set(ids) -def getUrl(id): - return "http://www.criterion.com/films/%s" % id - if __name__ == '__main__': - print getIds() \ No newline at end of file + print getIds() diff --git a/oxweb/wikipedia.py b/oxweb/wikipedia.py index 78ed016..d4c2e51 100644 --- a/oxweb/wikipedia.py +++ b/oxweb/wikipedia.py @@ -3,10 +3,16 @@ from urllib import urlencode import simplejson -from oxlib.cache import getUrl, getUrlUnicode +from oxlib.cache import getUrlUnicode from oxlib import findRe, decodeHtml +def getId(url): + return url.split("/")[-1] + +def getUrl(id): + return "http://en.wikipedia.org/wiki/%s" % id + def getMovieId(title, director='', year=''): query = '"%s" film %s %s' % (title, director, year) result = find(query, 1) @@ -43,6 +49,7 @@ def getWikiData(wikipediaUrl): return data def getMovieData(wikipediaUrl): + if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl) data = getWikiData(wikipediaUrl) filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''') filmbox = {} @@ -78,10 +85,11 @@ def getMovieData(wikipediaUrl): def getImageUrl(name): data = getUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name) - url = findRe(data, '="(http://upload.wikimedia.org/.*?)"') + url = findRe(data, 'href="(http://upload.wikimedia.org/.*?%s)"' % name) return url def getPosterUrl(wikipediaUrl): + if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl) data = getMovieData(wikipediaUrl) if 'image' in data: return getImageUrl(data['image']) @@ -96,6 +104,7 @@ def getAllmovieId(wikipediaUrl): return data.get('amg_id', '') def find(query, max_results=10): + from oxlib.cache import getUrl query = {'action': 'query', 'list':'search', 'format': 'json', 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')} url = "http://en.wikipedia.org/w/api.php?" + urlencode(query) @@ -111,9 +120,3 @@ def find(query, max_results=10): results.append((title, url, '')) return results -def getId(url): - return url.split("/")[-1] - -def getUrl(id): - return "http://en.wikipedia.org/wiki/%s" % id -