fix allmovie parser to work a bit

This commit is contained in:
j 2012-07-08 14:16:57 +02:00
parent d860742aea
commit c374a8fec1
2 changed files with 40 additions and 29 deletions

View file

@ -8,7 +8,7 @@ from ox.cache import readUrlUnicode
def getId(url): def getId(url):
return url.split("/")[-2] return url.split("/")[-1]
def getData(id): def getData(id):
''' '''
@ -21,44 +21,52 @@ def getData(id):
>>> getData('129689')['rating'] >>> getData('129689')['rating']
u'4.5' u'4.5'
''' '''
if id.startswith('http'):
id = getId(id)
data = { data = {
"url": getUrl(id) "url": getUrl(id)
} }
html = readUrlUnicode(data["url"]) html = readUrlUnicode(data["url"])
data['aka'] = parseList(html, 'AKA') data['aka'] = parseList(html, 'AKA')
data['category'] = findRe(html, 'http://allmovie.com/explore/category/.*?">(.*?)</a>') data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
data['countries'] = parseList(html, 'Countries') data['countries'] = parseList(html, 'countries')
data['director'] = parseEntry(html, 'Director') data['director'] = parseEntry(html, 'directed by')
data['genres'] = parseList(html, 'Genres') data['genres'] = parseList(html, 'genres')
data['keywords'] = parseList(html, 'Keywords') data['keywords'] = parseList(html, 'keywords')
data['posters'] = [findRe(html, '<img src="(http://image\..*?)"')] data['posters'] = [findRe(html, '<img src="(http://cps-.*?)"')]
data['produced'] = parseList(html, 'Produced by') data['produced'] = parseList(html, 'produced by')
data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"') data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
data['released'] = parseEntry(html, 'Released by') data['released'] = parseEntry(html, 'released by')
data['releasedate'] = parseEntry(html, 'Release')[0:10].replace(' ', '-') data['releasedate'] = parseList(html, 'release date')
data['runtime'] = findRe(html, '<td class="formed-sub" style="width: 86px;">(\d+) min.</td>') data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
data['set'] = parseEntry(html, 'Set In') data['set'] = parseEntry(html, 'set in')
data['synopsis'] = parseText(html, 'Plot Synopsis') data['synopsis'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['themes'] = parseList(html, 'Themes') data['themes'] = parseList(html, 'themes')
data['types'] = parseList(html, 'Types') data['types'] = parseList(html, 'types')
data['year'] = findRe(html, '"http://allmovie.com/explore/year/(.*?)"') data['year'] = findRe(html, '<span class="year">.*?(\d+)')
html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id) #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
data['cast'] = parseTable(html) data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id) #html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
data['credits'] = parseTable(html) #data['cast'] = parseTable(html)
#html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
#data['credits'] = parseTable(html)
html = readUrlUnicode("http://allmovie.com/work/%s/review" % id) html = readUrlUnicode("http://allmovie.com/work/%s/review" % id)
data['review'] = parseText(html, 'Review') data['review'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
return data return data
def getUrl(id): def getUrl(id):
return "http://allmovie.com/work/%s/" % id return "http://allmovie.com/work/%s" % id
def parseEntry(html, title): def parseEntry(html, title):
return stripTags(findRe(html, '<span>%s</span>(.*?)</table>' % title)).strip() html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
return stripTags(html).strip()
def parseList(html, title): def parseList(html, title):
html = findRe(html, '<span>%s</span>(.*?)</table>' % title) html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
return map(lambda x: stripTags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html)) r = map(lambda x: stripTags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
if not r and html:
r = [stripTags(html)]
return r
def parseTable(html): def parseTable(html):
return map( return map(

View file

@ -56,9 +56,7 @@ def getMovieData(wikipediaUrl):
data = getWikiData(wikipediaUrl) data = getWikiData(wikipediaUrl)
filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''') filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
filmbox = {} filmbox = {}
_box = filmbox_data.strip().split('\n|') _box = filmbox_data.strip().split('|')
if len(_box) == 1:
_box = _box[0].split('|\n')
for row in _box: for row in _box:
d = row.split('=') d = row.split('=')
if len(d) == 2: if len(d) == 2:
@ -69,12 +67,17 @@ def getMovieData(wikipediaUrl):
if '<br>' in value: if '<br>' in value:
value = value.split('<br>') value = value.split('<br>')
filmbox[key.strip()] = value filmbox[key.strip()] = value
if not filmbox: if not filmbox_data:
return filmbox return filmbox
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit(): if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
del filmbox['amg_id'] del filmbox['amg_id']
if 'Allmovie movie' in data: if 'Allmovie movie' in data:
filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)') filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)')
elif 'Allmovie title' in data:
filmbox['amg_id'] = findRe(data, 'Allmovie title\|.*?(\d+)')
if 'Official website' in data:
filmbox['website'] = findRe(data, 'Official website\|(.*?)}').strip()
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data) r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
if r: if r: