Fix allmovie parser so it partially works again with the site's updated markup
This commit is contained in:
parent
d860742aea
commit
c374a8fec1
2 changed files with 40 additions and 29 deletions
|
@ -8,7 +8,7 @@ from ox.cache import readUrlUnicode
|
||||||
|
|
||||||
|
|
||||||
def getId(url):
|
def getId(url):
|
||||||
return url.split("/")[-2]
|
return url.split("/")[-1]
|
||||||
|
|
||||||
def getData(id):
|
def getData(id):
|
||||||
'''
|
'''
|
||||||
|
@ -21,44 +21,52 @@ def getData(id):
|
||||||
>>> getData('129689')['rating']
|
>>> getData('129689')['rating']
|
||||||
u'4.5'
|
u'4.5'
|
||||||
'''
|
'''
|
||||||
|
if id.startswith('http'):
|
||||||
|
id = getId(id)
|
||||||
data = {
|
data = {
|
||||||
"url": getUrl(id)
|
"url": getUrl(id)
|
||||||
}
|
}
|
||||||
html = readUrlUnicode(data["url"])
|
html = readUrlUnicode(data["url"])
|
||||||
data['aka'] = parseList(html, 'AKA')
|
data['aka'] = parseList(html, 'AKA')
|
||||||
data['category'] = findRe(html, 'http://allmovie.com/explore/category/.*?">(.*?)</a>')
|
data['category'] = findRe(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
|
||||||
data['countries'] = parseList(html, 'Countries')
|
data['countries'] = parseList(html, 'countries')
|
||||||
data['director'] = parseEntry(html, 'Director')
|
data['director'] = parseEntry(html, 'directed by')
|
||||||
data['genres'] = parseList(html, 'Genres')
|
data['genres'] = parseList(html, 'genres')
|
||||||
data['keywords'] = parseList(html, 'Keywords')
|
data['keywords'] = parseList(html, 'keywords')
|
||||||
data['posters'] = [findRe(html, '<img src="(http://image\..*?)"')]
|
data['posters'] = [findRe(html, '<img src="(http://cps-.*?)"')]
|
||||||
data['produced'] = parseList(html, 'Produced by')
|
data['produced'] = parseList(html, 'produced by')
|
||||||
data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
|
data['rating'] = findRe(html, 'Stars" title="(.*?) Stars"')
|
||||||
data['released'] = parseEntry(html, 'Released by')
|
data['released'] = parseEntry(html, 'released by')
|
||||||
data['releasedate'] = parseEntry(html, 'Release')[0:10].replace(' ', '-')
|
data['releasedate'] = parseList(html, 'release date')
|
||||||
data['runtime'] = findRe(html, '<td class="formed-sub" style="width: 86px;">(\d+) min.</td>')
|
data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
|
||||||
data['set'] = parseEntry(html, 'Set In')
|
data['set'] = parseEntry(html, 'set in')
|
||||||
data['synopsis'] = parseText(html, 'Plot Synopsis')
|
data['synopsis'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||||
data['themes'] = parseList(html, 'Themes')
|
data['themes'] = parseList(html, 'themes')
|
||||||
data['types'] = parseList(html, 'Types')
|
data['types'] = parseList(html, 'types')
|
||||||
data['year'] = findRe(html, '"http://allmovie.com/explore/year/(.*?)"')
|
data['year'] = findRe(html, '<span class="year">.*?(\d+)')
|
||||||
html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
|
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
|
||||||
data['cast'] = parseTable(html)
|
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
|
||||||
html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
|
#html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
|
||||||
data['credits'] = parseTable(html)
|
#data['cast'] = parseTable(html)
|
||||||
|
#html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
|
||||||
|
#data['credits'] = parseTable(html)
|
||||||
html = readUrlUnicode("http://allmovie.com/work/%s/review" % id)
|
html = readUrlUnicode("http://allmovie.com/work/%s/review" % id)
|
||||||
data['review'] = parseText(html, 'Review')
|
data['review'] = stripTags(findRe(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def getUrl(id):
|
def getUrl(id):
|
||||||
return "http://allmovie.com/work/%s/" % id
|
return "http://allmovie.com/work/%s" % id
|
||||||
|
|
||||||
def parseEntry(html, title):
|
def parseEntry(html, title):
|
||||||
return stripTags(findRe(html, '<span>%s</span>(.*?)</table>' % title)).strip()
|
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
|
||||||
|
return stripTags(html).strip()
|
||||||
|
|
||||||
def parseList(html, title):
|
def parseList(html, title):
|
||||||
html = findRe(html, '<span>%s</span>(.*?)</table>' % title)
|
html = findRe(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
|
||||||
return map(lambda x: stripTags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
|
r = map(lambda x: stripTags(x), re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
|
||||||
|
if not r and html:
|
||||||
|
r = [stripTags(html)]
|
||||||
|
return r
|
||||||
|
|
||||||
def parseTable(html):
|
def parseTable(html):
|
||||||
return map(
|
return map(
|
||||||
|
|
|
@ -56,9 +56,7 @@ def getMovieData(wikipediaUrl):
|
||||||
data = getWikiData(wikipediaUrl)
|
data = getWikiData(wikipediaUrl)
|
||||||
filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
|
filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
|
||||||
filmbox = {}
|
filmbox = {}
|
||||||
_box = filmbox_data.strip().split('\n|')
|
_box = filmbox_data.strip().split('|')
|
||||||
if len(_box) == 1:
|
|
||||||
_box = _box[0].split('|\n')
|
|
||||||
for row in _box:
|
for row in _box:
|
||||||
d = row.split('=')
|
d = row.split('=')
|
||||||
if len(d) == 2:
|
if len(d) == 2:
|
||||||
|
@ -69,12 +67,17 @@ def getMovieData(wikipediaUrl):
|
||||||
if '<br>' in value:
|
if '<br>' in value:
|
||||||
value = value.split('<br>')
|
value = value.split('<br>')
|
||||||
filmbox[key.strip()] = value
|
filmbox[key.strip()] = value
|
||||||
if not filmbox:
|
if not filmbox_data:
|
||||||
return filmbox
|
return filmbox
|
||||||
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
|
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
|
||||||
del filmbox['amg_id']
|
del filmbox['amg_id']
|
||||||
if 'Allmovie movie' in data:
|
if 'Allmovie movie' in data:
|
||||||
filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)')
|
filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)')
|
||||||
|
elif 'Allmovie title' in data:
|
||||||
|
filmbox['amg_id'] = findRe(data, 'Allmovie title\|.*?(\d+)')
|
||||||
|
|
||||||
|
if 'Official website' in data:
|
||||||
|
filmbox['website'] = findRe(data, 'Official website\|(.*?)}').strip()
|
||||||
|
|
||||||
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
|
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
|
||||||
if r:
|
if r:
|
||||||
|
|
Loading…
Reference in a new issue