python-oxweb/oxweb/impawards.py

# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re

from oxlib.cache import getUrlUnicode
from oxlib.html import stripTags
from oxlib.text import findRe

import imdb

def getData(id):
    '''
    Scrape the impawards.com pages for one movie and return what was found.

    id is the impawards id in the form "<year>/<title>"; everything after
    the first five characters is used to recognise links to the poster
    pages of that movie.

    Returns a dict with the keys 'url', 'imdbId', 'title', 'year' and
    'posters' (a list of poster image urls, one per linked poster page).

    >>> getData('1991/silence_of_the_lambs')['imdbId']
    u'0102926'

    >>> getData('1991/silence_of_the_lambs')['posters'][0]
    u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1_xlg.jpg'

    >>> getData('1991/silence_of_the_lambs')['url']
    u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
    '''
    # Every page and image of a movie lives below its year directory.
    location = 'http://www.impawards.com/%s/%s'

    movie_url = getUrl(id)
    data = {'url': movie_url}
    movie_page = getUrlUnicode(movie_url)
    data['imdbId'] = findRe(movie_page, 'imdb.com/title/tt(.*?) ')
    data['title'] = stripTags(
        findRe(movie_page, r'<p class="name white">(.*?) \(<a href="alpha1.html">'))
    data['year'] = findRe(movie_page, r'\(<a href="alpha1.html">(.*?)</a>\)')
    data['posters'] = []

    # id[5:] skips the "<year>/" prefix and leaves the title part of the id.
    link_pattern = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL)
    for link in link_pattern.findall(movie_page):
        # Always start from the regular version page, even for xlg links.
        version_url = location % (data['year'], link.replace('_xlg.html', '.html'))
        version_page = getUrlUnicode(version_url)
        xlg_link = findRe(version_page, r'<a href = (\w*?_xlg.html)')
        if not xlg_link:
            # No extra large version linked: take the poster shown on this page.
            image = findRe(version_page, '<img src="(posters.*?)" alt=')
        else:
            # Follow the link to the extra large page and take its image.
            xlg_page = getUrlUnicode(location % (data['year'], xlg_link))
            image = findRe(xlg_page, '<img SRC="(.*?)"')
        data['posters'].append(location % (data['year'], image))
    return data

def getId(url):
    '''
    Derive the impawards id ("<year>/<title>") from a poster page url.

    The url is split on "/": element 3 is taken as the year and element 4
    as the file name, whose last five characters (".html") are dropped.
    A trailing "_xlg" marker and a trailing "_ver<N>" marker are removed
    from the title, in that order.

    Only a final word that is exactly "ver" followed by digits counts as a
    version marker, so titles whose last word merely ends that way (e.g.
    "batman_forever2") are kept intact. The last remaining word of a title
    is never removed.

    >>> getId('http://www.impawards.com/1991/silence_of_the_lambs_ver1_xlg.html')
    '1991/silence_of_the_lambs'
    '''
    segments = url.split('/')
    year = segments[3]
    # Strip the 5-character '.html' extension, then split the title into words.
    words = segments[4][:-5].split('_')
    if len(words) > 1 and words[-1] == 'xlg':
        words.pop()
    # re.match anchors at the start of the word; '$' anchors at the end.
    if len(words) > 1 and re.match(r'ver\d+$', words[-1]):
        words.pop()
    return '%s/%s' % (year, '_'.join(words))

def getIds():
    '''
    Collect the ids of all posters listed in the impawards.com archive.

    The number of archive pages is read from archives/latest.html; the
    pages are then visited from the highest number down to 1 and the ids
    returned by getIdsByPage() are gathered.

    Returns a list of ids in which every id appears only once.

    NOTE(review): timeout = 0 is handed straight to getUrlUnicode;
    presumably it forces latest.html to be downloaded again instead of
    being served from the cache -- confirm against oxlib.cache.
    NOTE(review): int() raises ValueError if findRe does not return a
    numeric string for the page link.
    '''
    ids = []
    # Membership is tested against a set; scanning the growing result list
    # for every id would be quadratic over the whole archive.
    seen = set()
    html = getUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 0)
    pages = int(findRe(html, '<a href= page(.*?).html>')) + 1
    for page in range(pages, 0, -1):
        for poster_id in getIdsByPage(page):
            if poster_id not in seen:
                seen.add(poster_id)
                ids.append(poster_id)
    return ids

def getIdsByPage(page):
    '''
    Return the set of ids linked from the archive page with number `page`.

    Every "../<path>" link found on the page is turned into an
    impawards.com url and reduced to its id with getId().

    NOTE(review): timeout = -1 is handed straight to getUrlUnicode;
    presumably it means the cached page never expires -- confirm against
    oxlib.cache.
    '''
    page_url = 'http://www.impawards.com/archives/page%s.html' % page
    html = getUrlUnicode(page_url, timeout = -1)
    link_pattern = re.compile(r'<a href = \.\./(.*?)>', re.DOTALL)
    return set(
        getId('http://impawards.com/%s' % link)
        for link in link_pattern.findall(html)
    )

def getUrl(id):
    '''
    Return the url of the impawards.com page for the given id.

    The page "<id>.html" is fetched first; if it contains the text
    "No Movie Posters on This Page", the url of "<id>_ver1.html" is
    returned instead.
    '''
    plain_url = "http://www.impawards.com/%s.html" % id
    if not findRe(getUrlUnicode(plain_url), "No Movie Posters on This Page"):
        return plain_url
    return "http://www.impawards.com/%s_ver1.html" % id

if __name__ == '__main__':
    # When run as a script: print all archive ids, sorted, followed by
    # their count. A single parenthesised argument prints the same text
    # whether print is a statement or a function.
    found = getIds()
    print('%s %s' % (sorted(found), len(found)))
vi:si:et:sw=4:sts=4:ts=4 2008-06-19 09:47:02 +00:00			`# vi:si:et:sw=4:sts=4:ts=4`
			`# encoding: utf-8`
adding impawards module 2008-05-09 11:21:42 +00:00			`import re`

rename oxutils -> oxlib 2008-07-03 09:24:49 +00:00			`from oxlib.cache import getUrlUnicode`
			`from oxlib.html import stripTags`
			`from oxlib.text import findRe`
adding impawards module 2008-05-09 11:21:42 +00:00
rename ox -> oxweb 2008-07-03 09:21:18 +00:00			`import imdb`

cleaning up impawards module 2009-07-13 17:55:28 +00:00			`def getData(id):`
			`'''`
			`>>> getData('1991/silence_of_the_lambs')['imdbId']`
			`u'0102926'`
adding impawards module 2008-05-09 11:21:42 +00:00
cleaning up impawards module 2009-07-13 17:55:28 +00:00			`>>> getData('1991/silence_of_the_lambs')['posters'][0]`
			`u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1_xlg.jpg'`
adding impawards module 2008-05-09 11:21:42 +00:00
cleaning up impawards module 2009-07-13 17:55:28 +00:00			`>>> getData('1991/silence_of_the_lambs')['url']`
			`u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'`
			`'''`
			`data = {`
			`'url': getUrl(id)`
			`}`
			`html = getUrlUnicode(data['url'])`
adding impawards module 2008-05-09 11:21:42 +00:00			`data['imdbId'] = findRe(html, 'imdb.com/title/tt(.*?) ')`
new criterion module 2009-07-04 10:25:24 +00:00			`data['title'] = stripTags(findRe(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))`
			`data['year'] = findRe(html, '\(<a href="alpha1.html">(.*?)</a>\)')`
cleaning up impawards module 2009-07-13 17:55:28 +00:00			`data['posters'] = []`
			`results = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL).findall(html)`
			`for result in results:`
			`result = result.replace('_xlg.html', '.html')`
			`url = 'http://www.impawards.com/%s/%s' % (data['year'], result)`
			`html = getUrlUnicode(url)`
			`result = findRe(html, '<a href = (\w*?_xlg.html)')`
			`if result:`
			`url = 'http://www.impawards.com/%s/%s' % (data['year'], result)`
			`html = getUrlUnicode(url)`
			`poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img SRC="(.*?)"'))`
			`else:`
			`poster = 'http://www.impawards.com/%s/%s' % (data['year'], findRe(html, '<img src="(posters.*?)" alt='))`
			`data['posters'].append(poster)`
adding impawards module 2008-05-09 11:21:42 +00:00			`return data`

cleaning up impawards module 2009-07-13 17:55:28 +00:00			`def getId(url):`
			`split = url.split('/')`
			`year = split[3]`
			`split = split[4][:-5].split('_')`
			`if split[-1] == 'xlg':`
			`split.pop()`
			`if findRe(split[-1], 'ver\d+$'):`
			`split.pop()`
			`id = '%s/%s' % (year, '_'.join(split))`
			`return id`

			`def getIds():`
			`ids = []`
			`html = getUrlUnicode('http://www.impawards.com/archives/latest.html', timeout = 0)`
updating criterion 2009-07-13 18:40:59 +00:00			`pages = int(findRe(html, '<a href= page(.*?).html>')) + 1`
			`for page in range(pages, 0, -1):`
cleaning up impawards module 2009-07-13 17:55:28 +00:00			`for id in getIdsByPage(page):`
			`if not id in ids:`
			`ids.append(id)`
			`return ids`

			`def getIdsByPage(page):`
			`ids = []`
			`html = getUrlUnicode('http://www.impawards.com/archives/page%s.html' % page, timeout = -1)`
			`results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)`
			`for result in results:`
			`url = 'http://impawards.com/%s' % result`
			`ids.append(getId(url))`
			`return set(ids)`
new criterion module 2009-07-04 10:25:24 +00:00
cleaning up impawards module 2009-07-13 17:55:28 +00:00			`def getUrl(id):`
			`url = "http://www.impawards.com/%s.html" % id`
			`html = getUrlUnicode(url)`
			`if findRe(html, "No Movie Posters on This Page"):`
			`url = "http://www.impawards.com/%s_ver1.html" % id`
updating criterion 2009-07-13 18:40:59 +00:00			`return url`

			`if __name__ == '__main__':`
adding kg module 2009-07-13 20:08:23 +00:00			`ids = getIds()`
			`print sorted(ids), len(ids)`