diff --git a/ox/web/allmovie.py b/ox/web/allmovie.py index b189645..836ada9 100644 --- a/ox/web/allmovie.py +++ b/ox/web/allmovie.py @@ -8,7 +8,7 @@ from ox.cache import readUrlUnicode def getId(url): - return url.split("/")[-2] + return url.split("/")[-1] def getData(id): ''' @@ -21,44 +21,52 @@ def getData(id): >>> getData('129689')['rating'] u'4.5' ''' + if id.startswith('http'): + id = getId(id) data = { "url": getUrl(id) } html = readUrlUnicode(data["url"]) data['aka'] = parseList(html, 'AKA') - data['category'] = findRe(html, 'http://allmovie.com/explore/category/.*?">(.*?)') - data['countries'] = parseList(html, 'Countries') - data['director'] = parseEntry(html, 'Director') - data['genres'] = parseList(html, 'Genres') - data['keywords'] = parseList(html, 'Keywords') - data['posters'] = [findRe(html, 'category.*?
(.*?)
') + data['countries'] = parseList(html, 'countries') + data['director'] = parseEntry(html, 'directed by') + data['genres'] = parseList(html, 'genres') + data['keywords'] = parseList(html, 'keywords') + data['posters'] = [findRe(html, '(\d+) min.') - data['set'] = parseEntry(html, 'Set In') - data['synopsis'] = parseText(html, 'Plot Synopsis') - data['themes'] = parseList(html, 'Themes') - data['types'] = parseList(html, 'Types') - data['year'] = findRe(html, '"http://allmovie.com/explore/year/(.*?)"') - html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id) - data['cast'] = parseTable(html) - html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id) - data['credits'] = parseTable(html) + data['released'] = parseEntry(html, 'released by') + data['releasedate'] = parseList(html, 'release date') + data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip() + data['set'] = parseEntry(html, 'set in') + data['synopsis'] = stripTags(findRe(html, '
(.*?)
')).strip() + data['themes'] = parseList(html, 'themes') + data['types'] = parseList(html, 'types') + data['year'] = findRe(html, '.*?(\d+)') + #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('(.*?)')).strip() return data def getUrl(id): - return "http://allmovie.com/work/%s/" % id + return "http://allmovie.com/work/%s" % id def parseEntry(html, title): - return stripTags(findRe(html, '%s(.*?)' % title)).strip() + html = findRe(html, '
%s
.*?
(.*?)
' % title) + return stripTags(html).strip() def parseList(html, title): - html = findRe(html, '%s(.*?)' % title) - return map(lambda x: stripTags(x), re.compile('
  • (.*?)
  • ', re.DOTALL).findall(html)) + html = findRe(html, '
    %s
    .*?
    (.*?)
    ' % title.lower()) + r = map(lambda x: stripTags(x), re.compile('
  • (.*?)
  • ', re.DOTALL).findall(html)) + if not r and html: + r = [stripTags(html)] + return r def parseTable(html): return map( diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py index e7a5530..34509e4 100644 --- a/ox/web/wikipedia.py +++ b/ox/web/wikipedia.py @@ -56,9 +56,7 @@ def getMovieData(wikipediaUrl): data = getWikiData(wikipediaUrl) filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''') filmbox = {} - _box = filmbox_data.strip().split('\n|') - if len(_box) == 1: - _box = _box[0].split('|\n') + _box = filmbox_data.strip().split('|') for row in _box: d = row.split('=') if len(d) == 2: @@ -69,12 +67,17 @@ def getMovieData(wikipediaUrl): if '
    ' in value: value = value.split('
    ') filmbox[key.strip()] = value - if not filmbox: + if not filmbox_data: return filmbox if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit(): del filmbox['amg_id'] if 'Allmovie movie' in data: filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)') + elif 'Allmovie title' in data: + filmbox['amg_id'] = findRe(data, 'Allmovie title\|.*?(\d+)') + + if 'Official website' in data: + filmbox['website'] = findRe(data, 'Official website\|(.*?)}').strip() r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data) if r: