j 2018-08-01 11:13:13 +02:00
parent 7041d1b316
commit bf6c512d3e


@@ -3,11 +3,19 @@
 from __future__ import print_function
 import re
-from ox.cache import read_url
+import ox.cache
 from ox.html import strip_tags
 from ox.text import find_re
+def read_url(url, timeout=ox.cache.DEFAULT_TIMEOUT):
+    data = ox.cache.read_url(url, timeout=timeout)
+    try:
+        data = data.decode('utf-8')
+    except UnicodeDecodeError:
+        data = data.decode('latin-1')
+    return data
 def get_data(id):
     '''
     >>> str(get_data('1991/silence_of_the_lambs')['imdbId'])
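The new module-level read_url wrapper only adds a decode step around ox.cache.read_url: try UTF-8 first, then fall back to Latin-1 (which never raises, since every byte is a valid Latin-1 character). A minimal standalone sketch of that fallback, using hypothetical test bytes not taken from the commit:

    def decode_html(data):
        # mirrors the wrapper's try/except: UTF-8 first, Latin-1 as fallback
        try:
            return data.decode('utf-8')
        except UnicodeDecodeError:
            return data.decode('latin-1')

    print(decode_html(b'caf\xc3\xa9'))  # valid UTF-8 bytes -> 'café'
    print(decode_html(b'caf\xe9'))      # not valid UTF-8, Latin-1 fallback -> 'café'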
@@ -22,7 +30,7 @@ def get_data(id):
     data = {
         'url': get_url(id)
     }
-    html = read_url(data['url'], unicode=True)
+    html = read_url(data['url'])
     data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
     if not data['imdbId']:
         data['imdbId'] = _id_map.get(id, '')
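get_data pulls the IMDb id out of the raw poster-page HTML with find_re, capturing the seven digits after imdb.com/title/tt, and falls back to the static _id_map when the page has no IMDb link. A rough equivalent with re directly (find_re from ox.text behaves roughly like this: first captured group, or '' on no match), on a made-up HTML snippet:

    import re

    html = '<a href="http://www.imdb.com/title/tt0102926/">IMDb</a>'  # hypothetical snippet
    matches = re.compile(r'imdb.com/title/tt(\d{7})').findall(html)
    imdb_id = matches[0] if matches else ''
    print(imdb_id)  # '0102926'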
@@ -37,11 +45,11 @@ def get_data(id):
     for result in results:
         result = result.replace('_xlg.html', '.html')
         url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
-        html = read_url(url, unicode=True)
+        html = read_url(url)
         result = find_re(html, '<a href = (\w*?_xlg.html)')
         if result:
             url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
-            html = read_url(url, unicode=True)
+            html = read_url(url)
             poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
         else:
             poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
@@ -63,14 +71,14 @@ def get_id(url):
 def get_ids(page=None):
     ids = []
     if page:
-        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1, unicode=True)
+        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1)
         results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
         for result in results:
             url = 'http://impawards.com/%s' % result
             ids.append(get_id(url))
         return set(ids)
     # get all
-    html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60, unicode=True)
+    html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60)
     pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
     for page in range(pages, 0, -1):
         for id in get_ids(page):
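The wrapper passes timeout straight through to ox.cache.read_url, so get_ids keeps its per-request cache settings (-1 for archived pages, 60*60 for the frequently changing latest page). A minimal sketch of the same fetch without the wrapper, calling ox.cache directly as this hunk does:

    import ox.cache

    # identical fetch to the wrapper, minus the UTF-8/Latin-1 decode step;
    # the timeout value is handed to ox.cache.read_url exactly as in get_ids()
    raw = ox.cache.read_url('http://www.impawards.com/archives/latest.html', timeout=60*60)
    print(len(raw))  # byte length of the (cached) page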
@@ -81,7 +89,7 @@ def get_ids(page=None):
 def get_url(id):
     url = u"http://www.impawards.com/%s.html" % id
-    html = read_url(url, unicode=True)
+    html = read_url(url)
     if find_re(html, "No Movie Posters on This Page"):
         url = u"http://www.impawards.com/%s_ver1.html" % id
     return url
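A hedged usage sketch of the module after this change, assuming the file lives at ox/web/impawards.py in python-ox (the module path is an assumption; the call mirrors the doctest in get_data and fetches live pages through the ox cache):

    from ox.web import impawards

    data = impawards.get_data('1991/silence_of_the_lambs')
    print(data['url'])     # poster page URL resolved by get_url()
    print(data['imdbId'])  # seven-digit IMDb id scraped from that page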