diff --git a/ox/web/allmovie.py b/ox/web/allmovie.py
index b189645..836ada9 100644
--- a/ox/web/allmovie.py
+++ b/ox/web/allmovie.py
@@ -8,7 +8,7 @@ from ox.cache import readUrlUnicode
def getId(url):
- return url.split("/")[-2]
+ return url.split("/")[-1]
def getData(id):
'''
@@ -21,44 +21,52 @@ def getData(id):
>>> getData('129689')['rating']
u'4.5'
'''
+ if id.startswith('http'):
+ id = getId(id)
data = {
"url": getUrl(id)
}
html = readUrlUnicode(data["url"])
data['aka'] = parseList(html, 'AKA')
- data['category'] = findRe(html, 'http://allmovie.com/explore/category/.*?">(.*?)')
- data['countries'] = parseList(html, 'Countries')
- data['director'] = parseEntry(html, 'Director')
- data['genres'] = parseList(html, 'Genres')
- data['keywords'] = parseList(html, 'Keywords')
- data['posters'] = [findRe(html, '
category.*?
(.*?)')
+ data['countries'] = parseList(html, 'countries')
+ data['director'] = parseEntry(html, 'directed by')
+ data['genres'] = parseList(html, 'genres')
+ data['keywords'] = parseList(html, 'keywords')
+ data['posters'] = [findRe(html, '
(\d+) min.')
- data['set'] = parseEntry(html, 'Set In')
- data['synopsis'] = parseText(html, 'Plot Synopsis')
- data['themes'] = parseList(html, 'Themes')
- data['types'] = parseList(html, 'Types')
- data['year'] = findRe(html, '"http://allmovie.com/explore/year/(.*?)"')
- html = readUrlUnicode("http://allmovie.com/work/%s/cast" % id)
- data['cast'] = parseTable(html)
- html = readUrlUnicode("http://allmovie.com/work/%s/credits" % id)
- data['credits'] = parseTable(html)
+ data['released'] = parseEntry(html, 'released by')
+ data['releasedate'] = parseList(html, 'release date')
+ data['runtime'] = parseEntry(html, 'run time').replace('min.', '').strip()
+ data['set'] = parseEntry(html, 'set in')
+ data['synopsis'] = stripTags(findRe(html, '(.*?)
')).strip()
+ data['themes'] = parseList(html, 'themes')
+ data['types'] = parseList(html, 'types')
+ data['year'] = findRe(html, '.*?(\d+)')
+ #data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('(.*?)')).strip()
return data
def getUrl(id):
- return "http://allmovie.com/work/%s/" % id
+ return "http://allmovie.com/work/%s" % id
def parseEntry(html, title):
- return stripTags(findRe(html, '%s(.*?)' % title)).strip()
+ html = findRe(html, '%s.*?(.*?)' % title)
+ return stripTags(html).strip()
def parseList(html, title):
- html = findRe(html, '%s(.*?)' % title)
- return map(lambda x: stripTags(x), re.compile('(.*?)', re.DOTALL).findall(html))
+ html = findRe(html, '%s.*?(.*?)' % title.lower())
+ r = map(lambda x: stripTags(x), re.compile('(.*?)', re.DOTALL).findall(html))
+ if not r and html:
+ r = [stripTags(html)]
+ return r
def parseTable(html):
return map(
diff --git a/ox/web/wikipedia.py b/ox/web/wikipedia.py
index e7a5530..34509e4 100644
--- a/ox/web/wikipedia.py
+++ b/ox/web/wikipedia.py
@@ -56,9 +56,7 @@ def getMovieData(wikipediaUrl):
data = getWikiData(wikipediaUrl)
filmbox_data = findRe(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
filmbox = {}
- _box = filmbox_data.strip().split('\n|')
- if len(_box) == 1:
- _box = _box[0].split('|\n')
+ _box = filmbox_data.strip().split('|')
for row in _box:
d = row.split('=')
if len(d) == 2:
@@ -69,12 +67,17 @@ def getMovieData(wikipediaUrl):
if '
' in value:
value = value.split('
')
filmbox[key.strip()] = value
- if not filmbox:
+ if not filmbox_data:
return filmbox
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
del filmbox['amg_id']
if 'Allmovie movie' in data:
filmbox['amg_id'] = findRe(data, 'Allmovie movie\|.*?(\d+)')
+ elif 'Allmovie title' in data:
+ filmbox['amg_id'] = findRe(data, 'Allmovie title\|.*?(\d+)')
+
+ if 'Official website' in data:
+ filmbox['website'] = findRe(data, 'Official website\|(.*?)}').strip()
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
if r: