python-ox/ox/web/flixter.py

# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

import re
from lxml.html import document_fromstring

from ox.cache import read_url
from ox import find_re, strip_tags
from ox.web.imdb import ImdbCombined


def get_data(id, timeout=-1):
    '''
    Fetch metadata for a movie from its flixster.com page.

    id is either a Flixster movie slug (e.g. 'the-matrix') or a 7-character
    numeric IMDb id (e.g. '0133093'). A numeric id is first translated to a
    slug with get_id(imdb=id); if that lookup fails, id is used unchanged.
    timeout is passed through to read_url.

    Returns a dict containing 'url' plus whichever of 'title', 'poster' and
    'rottentomatoes_id' (read from the og:title, og:image and og:url meta
    tags) and 'synopsis' were found on the page. Returns None when the page
    carries no og:title.

    >>> get_data('the-matrix')['poster']
    'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'

    >>> get_data('0133093')['poster']
    'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'

    >>> get_data('2-or-3-things-i-know-about-her')['poster']
    'http://content6.flixster.com/movie/10/95/43/10954392_gal.jpg'

    >>> get_data('0078875')['rottentomatoes_id']
    'http://www.rottentomatoes.com/m/the-tin-drum/'
    '''
    if len(id) == 7:
        try:
            int(id)
            id = get_id(imdb=id)
        except Exception:
            # Best effort: id is not numeric or the IMDb lookup failed, so
            # fall through and treat it as a slug. Catching Exception rather
            # than everything lets KeyboardInterrupt/SystemExit propagate.
            pass
    data = {
        "url": get_url(id),
    }
    html = read_url(data['url'], timeout=timeout, unicode=True)
    doc = document_fromstring(html)

    # Open Graph property -> key used in the returned dict.
    props = {
        'og:title': 'title',
        'og:image': 'poster',
        'og:url': 'rottentomatoes_id',
    }
    # Query by path instead of going through doc.head, so a page without a
    # <head> simply yields no properties instead of raising.
    for meta in doc.xpath('//head/meta'):
        prop = meta.attrib.get('property')
        content = meta.attrib.get('content', '')
        if prop in props and content:
            data[props[prop]] = content

    # Same reasoning for the body: tolerate documents that have none.
    bodies = doc.xpath('//body|//frameset')
    if bodies:
        for p in bodies[0].find_class('synopsis'):
            # .text is None when the element starts with a child tag.
            if p.text is not None:
                data['synopsis'] = p.text.strip()

    if data.get('poster'):
        # Swap the '_pro' image variant for the '_gal' one.
        data['poster'] = data['poster'].replace('_pro.jpg', '_gal.jpg')
    if 'title' not in data:
        return None
    return data

def get_id(url=None, imdb=None):
    '''
    Return the Flixster movie slug for an IMDb id or a movie URL.

    If imdb is given, the slug is derived from the title ImdbCombined
    reports for it: spaces become dashes, the result is lower-cased and
    apostrophes are removed. Otherwise the last path segment of url is
    returned, ignoring any trailing slashes. url must be a string when
    imdb is not given.

    >>> get_id(imdb='0133093')
    u'the-matrix'

    #>>> get_id(imdb='0060304')
    #u'2-or-3-things-i-know-about-her'
    '''
    if imdb:
        title = ImdbCombined(imdb)['title']
        return title.replace(' ', '-').lower().replace("'", '')
    # Strip trailing slashes first; otherwise '.../the-matrix/' would
    # split into a final empty segment and yield ''.
    return url.rstrip('/').split('/')[-1]

def get_url(id):
    '''
    Return the flixster.com movie page URL for the given id.
    '''
    template = "http://www.flixster.com/movie/%s"
    return template % id
add flixter 2010-09-04 10:42:37 +00:00			`# -- coding: UTF-8 --`
			`# vi:si:et:sw=4:sts=4:ts=4`

			`import re`
			`from lxml.html import document_fromstring`

net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`from ox.cache import read_url`
replace all CammelCase with under_score in ox 2012-08-14 14:12:43 +00:00			`from ox import find_re, strip_tags`
add flixter 2010-09-04 10:42:37 +00:00			`from ox.web.imdb import ImdbCombined`


ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`def get_data(id, timeout=-1):`
add flixter 2010-09-04 10:42:37 +00:00			`'''`
ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`>>> get_data('the-matrix')['poster']`
add flixter 2010-09-04 10:42:37 +00:00			`'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'`

ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`>>> get_data('0133093')['poster']`
add flixter 2010-09-04 10:42:37 +00:00			`'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'`

ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`>>> get_data('2-or-3-things-i-know-about-her')['poster']`
add flixter 2010-09-04 10:42:37 +00:00			`'http://content6.flixster.com/movie/10/95/43/10954392_gal.jpg'`

ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`>>> get_data('0078875')['rottentomatoes_id']`
add flixter 2010-09-04 10:42:37 +00:00			`'http://www.rottentomatoes.com/m/the-tin-drum/'`
			`'''`
			`if len(id) == 7:`
			`try:`
			`int(id)`
ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`id = get_id(imdb=id)`
add flixter 2010-09-04 10:42:37 +00:00			`except:`
			`pass`
			`data = {`
ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`"url": get_url(id),`
add flixter 2010-09-04 10:42:37 +00:00			`}`
fix import 2012-09-09 16:48:40 +00:00			`html = read_url(data['url'], timeout=timeout, unicode=True)`
add flixter 2010-09-04 10:42:37 +00:00			`doc = document_fromstring(html)`

			`props = {`
			`'og:title': 'title',`
			`'og:image': 'poster',`
			`'og:url': 'rottentomatoes_id',`
			`}`
			`for meta in doc.head.findall('meta'):`
			`prop = meta.attrib.get('property', None)`
			`content = meta.attrib.get('content', '')`
			`if prop in props and content:`
			`data[props[prop]] = content`

			`for p in doc.body.find_class('synopsis'):`
			`data['synopsis'] = p.text.strip()`

			`if 'poster' in data and data['poster']:`
			`data['poster'] = data['poster'].replace('_pro.jpg', '_gal.jpg')`
			`if not 'title' in data:`
			`return None`
			`return data`

ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`def get_id(url=None, imdb=None):`
add flixter 2010-09-04 10:42:37 +00:00			`'''`
ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`>>> get_id(imdb='0133093')`
add flixter 2010-09-04 10:42:37 +00:00			`u'the-matrix'`

ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`#>>> get_id(imdb='0060304')`
add flixter 2010-09-04 10:42:37 +00:00			`#u'2-or-3-things-i-know-about-her'`
			`'''`
ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`if imdb:`
			`i = ImdbCombined(imdb)`
			`title = i['title']`
			`return title.replace(' ', '-').lower().replace("'", '')`
add flixter 2010-09-04 10:42:37 +00:00			`return url.split('/')[-1]`

ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`def get_url(id):`
add flixter 2010-09-04 10:42:37 +00:00			`return "http://www.flixster.com/movie/%s"%id`