getUrl overlap

This commit is contained in:
j 2009-07-15 15:29:22 +02:00
parent 26aa58d3a2
commit 8e2565ccf8
2 changed files with 19 additions and 17 deletions

View file

@ -2,13 +2,18 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
import oxlib.cache
from oxlib.cache import getUrlUnicode from oxlib.cache import getUrlUnicode
from oxlib.html import stripTags from oxlib.html import stripTags
from oxlib.net import getUrl
from oxlib.text import findRe, removeSpecialCharacters from oxlib.text import findRe, removeSpecialCharacters
import imdb import imdb
def getId(url):
    '''
    Return the id of a film page url: its last "/"-separated segment.

    Trailing slashes are ignored, so ".../films/300/" yields "300"
    instead of an empty string. A value without any "/" is returned
    unchanged.
    '''
    # Drop trailing "/" first; otherwise the last segment would be "".
    return url.rstrip("/").split("/")[-1]
def getUrl(id):
    '''
    Return the film page url built by substituting id into
    "http://www.criterion.com/films/%s".
    '''
    template = "http://www.criterion.com/films/%s"
    return template % id
def getData(id): def getData(id):
''' '''
@ -27,7 +32,7 @@ def getData(id):
try: try:
html = getUrlUnicode(data["url"]) html = getUrlUnicode(data["url"])
except: except:
html = getUrl(data["url"]) html = oxlib.cache.getUrl(data["url"])
data["number"] = findRe(html, "<p class=\"spinenumber\">(.*?)</p>") data["number"] = findRe(html, "<p class=\"spinenumber\">(.*?)</p>")
data["title"] = findRe(html, "<h2 class=\"movietitle\">(.*?)</h2>") data["title"] = findRe(html, "<h2 class=\"movietitle\">(.*?)</h2>")
data["director"] = findRe(html, "<h2 class=\"director\">(.*?)</h2>") data["director"] = findRe(html, "<h2 class=\"director\">(.*?)</h2>")
@ -57,9 +62,6 @@ def getData(id):
data['imdbId'] = imdb.getMovieId(data['title'], data['director'], data['year']) data['imdbId'] = imdb.getMovieId(data['title'], data['director'], data['year'])
return data return data
def getId(url):
    '''
    Return the id of a film page url: its last "/"-separated segment.

    Trailing slashes are ignored, so ".../films/300/" yields "300"
    instead of an empty string. A value without any "/" is returned
    unchanged.
    '''
    # Drop trailing "/" first; otherwise the last segment would be "".
    return url.rstrip("/").split("/")[-1]
def getIds(): def getIds():
ids = [] ids = []
html = getUrlUnicode("http://www.criterion.com/library/dvd") html = getUrlUnicode("http://www.criterion.com/library/dvd")
@ -84,8 +86,5 @@ def getIdsByPage(page):
ids.append(result) ids.append(result)
return set(ids) return set(ids)
def getUrl(id):
    '''
    Return the film page url built by substituting id into
    "http://www.criterion.com/films/%s".
    '''
    template = "http://www.criterion.com/films/%s"
    return template % id
if __name__ == '__main__': if __name__ == '__main__':
print getIds() print getIds()

View file

@ -3,10 +3,16 @@
from urllib import urlencode from urllib import urlencode
import simplejson import simplejson
from oxlib.cache import getUrl, getUrlUnicode from oxlib.cache import getUrlUnicode
from oxlib import findRe, decodeHtml from oxlib import findRe, decodeHtml
def getId(url):
    '''
    Return the page id (title) contained in a wiki page url.

    Everything after the first "/wiki/" is returned, so titles that
    themselves contain "/" (e.g. ".../wiki/Face/Off") are kept whole.
    When url has no "/wiki/" part, the last "/"-separated segment is
    returned instead.
    '''
    marker = "/wiki/"
    if marker in url:
        # Split only once so that slashes inside the title survive.
        return url.split(marker, 1)[1]
    return url.split("/")[-1]
def getUrl(id):
    '''
    Return the page url built by substituting id into
    "http://en.wikipedia.org/wiki/%s".
    '''
    template = "http://en.wikipedia.org/wiki/%s"
    return template % id
def getMovieId(title, director='', year=''): def getMovieId(title, director='', year=''):
query = '"%s" film %s %s' % (title, director, year) query = '"%s" film %s %s' % (title, director, year)
result = find(query, 1) result = find(query, 1)
@ -43,6 +49,7 @@ def getWikiData(wikipediaUrl):
return data return data
def getMovieData(wikipediaUrl): def getMovieData(wikipediaUrl):
if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
data = getWikiData(wikipediaUrl) data = getWikiData(wikipediaUrl)
filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''') filmbox_data = findRe(data, '''\{\{Infobox.Film(.*?)\n\}\}''')
filmbox = {} filmbox = {}
@ -78,10 +85,11 @@ def getMovieData(wikipediaUrl):
def getImageUrl(name): def getImageUrl(name):
data = getUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name) data = getUrlUnicode('http://en.wikipedia.org/wiki/Image:' + name)
url = findRe(data, '="(http://upload.wikimedia.org/.*?)"') url = findRe(data, 'href="(http://upload.wikimedia.org/.*?%s)"' % name)
return url return url
def getPosterUrl(wikipediaUrl): def getPosterUrl(wikipediaUrl):
if not wikipediaUrl.startswith('http'): wikipediaUrl = getUrl(wikipediaUrl)
data = getMovieData(wikipediaUrl) data = getMovieData(wikipediaUrl)
if 'image' in data: if 'image' in data:
return getImageUrl(data['image']) return getImageUrl(data['image'])
@ -96,6 +104,7 @@ def getAllmovieId(wikipediaUrl):
return data.get('amg_id', '') return data.get('amg_id', '')
def find(query, max_results=10): def find(query, max_results=10):
from oxlib.cache import getUrl
query = {'action': 'query', 'list':'search', 'format': 'json', query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')} 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query) url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
@ -111,9 +120,3 @@ def find(query, max_results=10):
results.append((title, url, '')) results.append((title, url, ''))
return results return results
def getId(url):
    '''
    Return the page id (title) contained in a wiki page url.

    Everything after the first "/wiki/" is returned, so titles that
    themselves contain "/" (e.g. ".../wiki/Face/Off") are kept whole.
    When url has no "/wiki/" part, the last "/"-separated segment is
    returned instead.
    '''
    marker = "/wiki/"
    if marker in url:
        # Split only once so that slashes inside the title survive.
        return url.split(marker, 1)[1]
    return url.split("/")[-1]
def getUrl(id):
    '''
    Return the page url built by substituting id into
    "http://en.wikipedia.org/wiki/%s".
    '''
    template = "http://en.wikipedia.org/wiki/%s"
    return template % id