python-oxweb/oxweb/allmovie.py

# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time

from oxlib import stripTags, findRe
from oxlib.cache import getUrlUnicode

def _parseResultsTable(html, rowSeparator, cellSeparator):
    """Split the first 'results-table' block of html into rows of clean cells.

    The markup captured between '<div id="results-table">' and '</table>'
    is split on rowSeparator and the fragment after the last separator is
    dropped. Every remaining row is split on cellSeparator, and each cell
    is passed through stripTags() and .strip().

    Returns a list of lists of strings.
    """
    table = findRe(html, '<div id="results-table">(.*?)</table>')
    return [
        [stripTags(cell).strip() for cell in row.split(cellSeparator)]
        for row in table.split(rowSeparator)[:-1]
    ]


def getMovieData(allmovieId):
    """Scrape the allmovie.com pages of one movie into a dict.

    Four pages are requested through getUrlUnicode(), in this order:
    ~T0 (overview), ~T1 (review), ~T2 (cast) and ~T3 (credits).

    allmovieId -- the bare id used in the URL, e.g. '177524'

    Returns a dict with the keys poster, year, runtime, rating, country,
    director, genre, keywords, themes, boxoffice, produced, releasedate,
    released, synopsis, review, cast and credits.  'director' is the
    matched text split on ', '; 'genre', 'keywords' and 'themes' come
    from parseList(); 'cast' and 'credits' are lists of rows, each row
    being a list of cleaned cell strings.  The remaining values are
    strings.
    """
    data = {}

    # Overview page. Patterns containing backslashes are raw strings so
    # that '\d', '\?' and '\|' reach the regex engine untouched.
    html = getUrlUnicode("http://www.allmovie.com/cg/avg.dll?p=avg&sql=1:%s~T0" % allmovieId)
    data['poster'] = findRe(html, '<img src="(http://image.*?)"')
    data['year'] = findRe(html, r'<a href="/cg/avg.dll\?p=avg&amp;sql=24:\d{4}">(\d{4})</a>')
    data['runtime'] = findRe(html, r'sql=24:\d{4}">.*?">(.*?)min.</td></tr></table>').strip()
    data['rating'] = findRe(html, r'" alt="(\d+?) Stars"')
    data['country'] = findRe(html, r'<a href="/cg/avg.dll\?p=avg&sql=24:D\|\|\|206">(.*?)</')
    directors = findRe(html, r'<td class="formed-sub"><a href="/cg/avg.dll\?p=avg&sql=2:\d+">(.*?)</td>')
    data['director'] = stripTags(directors).split(', ')
    data['genre'] = parseList(html, 'Genre / Type')
    data['keywords'] = parseList(html, 'Keywords')
    data['themes'] = parseList(html, 'Themes')
    # Drops the first character and the thousands separators
    # (presumably a leading currency sign -- confirm against the page).
    data['boxoffice'] = parseEntry(html, 'Box office')[1:].replace(',', '')
    data['produced'] = parseEntry(html, 'Produced by')
    # Keeps the first 10 characters of the entry and replaces spaces
    # with dashes.
    data['releasedate'] = parseEntry(html, 'Release')[0:10].replace(' ', '-')
    data['released'] = parseEntry(html, 'Released by')
    data['synopsis'] = stripTags(findRe(html, 'Plot Synopsis</td>.*?<td colspan="2"><p>(.*?)</td>'))

    # Review page.
    html = getUrlUnicode("http://www.allmovie.com/cg/avg.dll?p=avg&sql=1:%s~T1" % allmovieId)
    data['review'] = stripTags(findRe(html, 'Review</td>.*?<td colspan="2"><p>(.*?)</td>'))

    # Cast page: this table is matched with lower-case tag separators.
    html = getUrlUnicode("http://www.allmovie.com/cg/avg.dll?p=avg&sql=1:%s~T2" % allmovieId)
    data['cast'] = _parseResultsTable(
        html, '</td></tr>', '&nbsp;</td><td width="305"><i>-')

    # Credits page: same layout, but matched with upper-case tags and an
    # unquoted WIDTH attribute.
    html = getUrlUnicode("http://www.allmovie.com/cg/avg.dll?p=avg&sql=1:%s~T3" % allmovieId)
    data['credits'] = _parseResultsTable(
        html, '</TD></TR>', '&nbsp;</TD><TD WIDTH=305><I>-')

    return data

def getMoviePoster(allmovieId):
    """Return the 'poster' value scraped by getMovieData() for allmovieId.

    An empty string is returned when getMovieData() yields a falsy result.
    """
    movie = getMovieData(allmovieId)
    if not movie:
        return ''
    return movie['poster']

def parseEntry(html, title):
    """Return the tag-stripped text that follows '<span>title</span>'.

    The text is whatever findRe() captures between the title span and the
    next '</table>'.  title is interpolated into the pattern as-is, so it
    is treated as regular-expression source.
    """
    pattern = '<span>%s</span>(.*?)</table>' % title
    fragment = findRe(html, pattern)
    return stripTags(fragment)

def parseList(html, title):
    """Return the tag-stripped list items that follow '<span>title</span>'.

    The markup findRe() captures between the title span and the next
    '</table>' is split on '</li><li>' and each piece is passed through
    stripTags().  title is interpolated into the pattern as-is.
    """
    pattern = '<span>%s</span>(.*?)</table>' % title
    items = findRe(html, pattern).split('</li><li>')
    return map(stripTags, items)

if __name__ == '__main__':
    # Manual smoke test: scrape one known movie id and dump the result.
    # The parenthesised form prints the same on Python 2 and also parses
    # on Python 3, where the print statement is a syntax error.
    print(getMovieData('177524'))
allmovie 2008-07-04 13:56:02 +00:00			`# -*- coding: utf-8 -*-`
			`# vi:si:et:sw=4:sts=4:ts=4`
			`import re`
			`import time`

			`from oxlib import stripTags, findRe`
			`from oxlib.cache import getUrlUnicode`

more allmovie data; allmovie id is 123456, not 1:123456 2008-07-25 14:03:04 +00:00			`def getMovieData(allmovieId):`
better var names 2008-07-25 11:54:20 +00:00			`data = {}`
minor cleanup 2008-07-25 14:51:39 +00:00			`html = getUrlUnicode("http://www.allmovie.com/cg/avg.dll?p=avg&sql=1:%s~T0" % allmovieId)`
better var names 2008-07-25 11:54:20 +00:00			`data['poster'] = findRe(html, '<img src="(http://image.*?)"')`
			`data['year'] = findRe(html, '<a href="/cg/avg.dll\?p=avg&amp;sql=24:\d{4}">(\d{4})</a>')`
allmovie: parse runtime 2008-07-25 14:43:45 +00:00			`data['runtime'] = findRe(html, 'sql=24:\d{4}">.*?">(.*?)min.</td></tr></table>').strip()`
better var names 2008-07-25 11:54:20 +00:00			`data['rating'] = findRe(html, '" alt="(\d+?) Stars"')`
			`data['country'] = findRe(html, '<a href="/cg/avg.dll\?p=avg&sql=24:D\|\|\|206">(.*?)</')`
more allmovie data; allmovie id is 123456, not 1:123456 2008-07-25 14:03:04 +00:00			`data['director'] = stripTags(findRe(html, '<td class="formed-sub"><a href="/cg/avg.dll\?p=avg&sql=2:\d+">(.*?)</td>')).split(', ')`
allmovie: parseList function 2008-07-25 14:06:50 +00:00			`data['genre'] = parseList(html, 'Genre / Type')`
			`data['keywords'] = parseList(html, 'Keywords')`
			`data['themes'] = parseList(html, 'Themes')`
allmovie: more data 2008-07-25 14:35:12 +00:00			`data['boxoffice'] = parseEntry(html, 'Box office')[1:].replace(',', '')`
			`data['produced'] = parseEntry(html, 'Produced by')`
			`data['releasedate'] = parseEntry(html, 'Release')[0:10].replace(' ', '-')`
			`data['released'] = parseEntry(html, 'Released by')`
			`data['synopsis'] = stripTags(findRe(html, 'Plot Synopsis</td>.*?<td colspan="2"><p>(.*?)</td>'))`
more allmovie data; allmovie id is 123456, not 1:123456 2008-07-25 14:03:04 +00:00			`html = getUrlUnicode("http://www.allmovie.com/cg/avg.dll?p=avg&sql=1:%s~T1" % allmovieId)`
better var names 2008-07-25 11:54:20 +00:00			`data['review'] = stripTags(findRe(html, 'Review</td>.*?<td colspan="2"><p>(.*?)</td>'))`
allmovie: parse cast 2008-07-25 14:22:28 +00:00			`html = getUrlUnicode("http://www.allmovie.com/cg/avg.dll?p=avg&sql=1:%s~T2" % allmovieId)`
			`data['cast'] = map(`
			`lambda x: map(`
			`lambda x: stripTags(x).strip(),`
			`x.split('&nbsp;</td><td width="305"><i>-')`
			`),`
			`findRe(html, '<div id="results-table">(.*?)</table>').split('</td></tr>')[:-1]`
			`)`
allmovie: parse credits 2008-07-25 14:26:46 +00:00			`html = getUrlUnicode("http://www.allmovie.com/cg/avg.dll?p=avg&sql=1:%s~T3" % allmovieId)`
			`data['credits'] = map(`
			`lambda x: map(`
			`lambda x: stripTags(x).strip(),`
			`x.split('&nbsp;</TD><TD WIDTH=305><I>-')`
			`),`
			`findRe(html, '<div id="results-table">(.*?)</table>').split('</TD></TR>')[:-1]`
			`)`
better var names 2008-07-25 11:54:20 +00:00			`return data`
allmovie 2008-07-04 13:56:02 +00:00
hm... or rather not 2008-07-25 14:58:54 +00:00			`def getMoviePoster(allmovieId):`
more allmovie data; allmovie id is 123456, not 1:123456 2008-07-25 14:03:04 +00:00			`data = getMovieData(allmovieId)`
allmovie 2008-07-04 13:56:02 +00:00			`if data:`
			`return data['poster']`
			`return ''`

allmovie: more data 2008-07-25 14:35:12 +00:00			`def parseEntry(html, title):`
			`return stripTags(findRe(html, '<span>%s</span>(.*?)</table>' % title))`

allmovie: parseList function 2008-07-25 14:06:50 +00:00			`def parseList(html, title):`
			`return map(lambda x: stripTags(x), findRe(html, '<span>%s</span>(.*?)</table>' % title).split('</li><li>'))`

more allmovie data; allmovie id is 123456, not 1:123456 2008-07-25 14:03:04 +00:00			`if __name__ == '__main__':`
allmovie: parse runtime 2008-07-25 14:44:15 +00:00			`print getMovieData('177524')`
more allmovie data; allmovie id is 123456, not 1:123456 2008-07-25 14:03:04 +00:00