python-ox/ox/web/apple.py

from __future__ import print_function
import json
import re

from ox.cache import read_url

# Request headers passed to read_url for the iTunes Store search and movie
# pages fetched in get_movie_data. The User-Agent identifies the client as
# the iTunes desktop application.
HEADERS = {
    'User-Agent': 'iTunes/10.4 (Macintosh; Intel Mac OS X 10.7) AppleWebKit/534.48.3',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us, en;q=0.50',
    # NOTE(review): presumably selects the store front (region) to query -- confirm.
    'X-Apple-Store-Front': '143441-1,12',
    # NOTE(review): presumably a timezone offset in seconds -- confirm.
    'X-Apple-Tz': '7200',
    'Accept-Encoding': 'gzip, deflate',
}

# Browser (Safari) User-Agent sent with the trailers.apple.com page requests
# in get_movie_data.
USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
    'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
)

def _as_native_str(value):
    """Return *value* as the interpreter's native ``str`` type, using UTF-8."""
    if isinstance(value, str):
        return value
    if str is bytes:
        # Python 2: a non-``str`` text value is ``unicode``; encode it.
        return value.encode('utf-8')
    # Python 3: decode ``bytes`` so it can be concatenated with ``str``.
    return value.decode('utf-8')


def get_movie_data(title, director):
    """Look up link, poster and trailer URLs for a movie on Apple's sites.

    Two sources are queried through ``read_url``:

    1. the iTunes Store advanced search (preferred source for ``link``);
    2. the trailers.apple.com quick-find search (preferred source for
       ``poster`` and ``trailer``; its values overwrite those from iTunes).

    Args:
        title: Movie title (text, or UTF-8 encoded bytes).
        director: Director name (text, or UTF-8 encoded bytes); only used
            for the iTunes search.

    Returns:
        A dict that may contain the keys ``'link'``, ``'poster'`` and
        ``'trailer'``. A key is absent when neither source yielded a value,
        so the dict is empty if nothing was found.

    Raises:
        KeyError: if the trailers search response has no ``'results'`` key.
        ValueError: if the trailers search response is not valid JSON after
            the callback wrapper is stripped.
        Any exception raised by ``read_url`` is propagated unchanged.
    """
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote  # Python 2

    # Percent-encode the search terms so that spaces, '&', '#', ':' or
    # non-ASCII characters cannot corrupt the query string.
    title = quote(_as_native_str(title))
    director = quote(_as_native_str(director))
    data = {}

    # itunes section (preferred source for link)
    url = 'http://ax.search.itunes.apple.com/WebObjects/MZSearch.woa/wa/advancedSearch'
    url += '?media=movie&movieTerm=' + title
    url += '&actorNames=&directorProducerName=' + director
    url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'
    # Work on a per-call copy: writing the Referer into the shared HEADERS
    # dict would leak a stale value into every later call.
    itunes_headers = dict(HEADERS)
    itunes_headers['Referer'] = url
    html = read_url(url, headers=itunes_headers, unicode=True)
    regexp = r'<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'
    regexp += r'<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'
    results = re.findall(regexp, html)
    if results:
        # Only the first search hit is used.
        data['link'] = results[0][0]
        # Ask for the larger artwork rendition instead of the thumbnail.
        data['poster'] = results[0][1].replace('140x140', '600x600')
        html = read_url(data['link'], headers=itunes_headers, unicode=True)
        results = re.findall(r'video-preview-url="(.*?)"', html)
        if results:
            data['trailer'] = results[0]

    # trailers section (preferred source for poster and trailer)
    host = 'http://trailers.apple.com'
    host_pattern = re.escape(host)  # match the dots in the host literally
    url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title
    # NOTE(review): the [16:-4] slice presumably strips the searchCallback(...)
    # wrapper around the JSON payload; offsets kept from the original -- confirm.
    js = json.loads(read_url(url, unicode=True)[16:-4])
    results = js['results']
    if results:
        url = host + results[0]['location']
        if 'link' not in data:
            data['link'] = url
        headers = {
            'User-Agent': USER_AGENT
        }
        html = read_url(url, headers=headers, unicode=True)
        results = re.findall('"(' + host_pattern + r'.*?poster\.jpg)"', html)
        if results:
            data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
        # NOTE(review): assumes the 'location' value ends with '/' -- confirm.
        html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
        results = re.findall('"(' + host_pattern + r'\S+\.mov)"', html)
        if results:
            # The last .mov URL found on the playlist page is the one used.
            data['trailer'] = results[-1]
    return data

if __name__ == '__main__':
    # Manual smoke test: look up a handful of sample movies and print the
    # dict returned by get_movie_data for each of them, in order.
    for sample_title, sample_director in (
        ('Alphaville', 'Jean-Luc Godard'),
        ('Sin City', 'Roberto Rodriguez'),
        ('Breathless', 'Jean-Luc Godard'),
        ('Capitalism: A Love Story', 'Michael Moore'),
        ('Film Socialisme', 'Jean-Luc Godard'),
    ):
        print(get_movie_data(sample_title, sample_director))
from __futre__ import print_function 2014-09-30 19:27:26 +00:00			`from __future__ import print_function`
add parser for trailers.apple.com 2011-07-30 08:12:24 +00:00			`import json`
add apple module (movie link/poster/trailer) 2011-07-30 06:42:44 +00:00			`import re`

net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`from ox.cache import read_url`
add apple module (movie link/poster/trailer) 2011-07-30 06:42:44 +00:00
			`HEADERS = {`
			`'User-Agent': 'iTunes/10.4 (Macintosh; Intel Mac OS X 10.7) AppleWebKit/534.48.3',`
			`'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',`
			`'Accept-Language': 'en-us, en;q=0.50',`
			`'X-Apple-Store-Front': '143441-1,12',`
			`'X-Apple-Tz': '7200',`
			`'Accept-Encoding': 'gzip, deflate'`
			`}`
add parser for trailers.apple.com 2011-07-30 08:12:24 +00:00			`USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '`
			`USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'`
add apple module (movie link/poster/trailer) 2011-07-30 06:42:44 +00:00
ox.web under_score api rewrite 2012-08-15 15:15:40 +00:00			`def get_movie_data(title, director):`
make utf-8 url 2011-07-30 11:17:25 +00:00			`if isinstance(title, unicode):`
			`title = title.encode('utf-8')`
			`if isinstance(director, unicode):`
			`director = director.encode('utf-8')`
add parser for trailers.apple.com 2011-07-30 08:12:24 +00:00			`data = {}`
			`# itunes section (preferred source for link)`
add apple module (movie link/poster/trailer) 2011-07-30 06:42:44 +00:00			`url = 'http://ax.search.itunes.apple.com/WebObjects/MZSearch.woa/wa/advancedSearch'`
			`url += '?media=movie&movieTerm=' + title`
			`url += '&actorNames=&directorProducerName=' + director`
			`url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'`
			`HEADERS['Referer'] = url`
net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`html = read_url(url, headers=HEADERS, unicode=True)`
add apple module (movie link/poster/trailer) 2011-07-30 06:42:44 +00:00			`regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'`
			`regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'`
			`results = re.compile(regexp).findall(html)`
			`if results:`
add parser for trailers.apple.com 2011-07-30 08:12:24 +00:00			`data['link'] = results[0][0]`
			`data['poster'] = results[0][1].replace('140x140', '600x600')`
net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`html = read_url(data['link'], headers=HEADERS, unicode=True)`
add parser for trailers.apple.com 2011-07-30 08:12:24 +00:00			`results = re.compile('video-preview-url="(.*?)"').findall(html)`
add apple module (movie link/poster/trailer) 2011-07-30 06:42:44 +00:00			`if results:`
			`data['trailer'] = results[0]`
add parser for trailers.apple.com 2011-07-30 08:12:24 +00:00			`# trailers section (preferred source for poster and trailer)`
			`host = 'http://trailers.apple.com'`
			`url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title`
net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`js = json.loads(read_url(url, unicode=True)[16:-4])`
add parser for trailers.apple.com 2011-07-30 08:12:24 +00:00			`results = js['results']`
			`if results:`
			`url = host + results[0]['location']`
			`if not 'link' in data:`
			`data['link'] = url`
			`headers = {`
			`'User-Agent': USER_AGENT`
			`}`
net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`html = read_url(url, headers=headers, unicode=True)`
add parser for trailers.apple.com 2011-07-30 08:12:24 +00:00			`results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)`
			`if results:`
			`data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')`
net/cache readUrl->read_url / Unicode -> unicode=True format replace all CammelCase with under_score 2012-08-14 13:58:05 +00:00			`html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)`
add parser for trailers.apple.com 2011-07-30 08:12:24 +00:00			`results = re.compile('"(' + host + '\S+\.mov)"').findall(html)`
			`if results:`
			`data['trailer'] = results[-1]`
add apple module (movie link/poster/trailer) 2011-07-30 06:42:44 +00:00			`return data`

			`if __name__ == '__main__':`
from __futre__ import print_function 2014-09-30 19:27:26 +00:00			`print(get_movie_data('Alphaville', 'Jean-Luc Godard'))`
			`print(get_movie_data('Sin City', 'Roberto Rodriguez'))`
			`print(get_movie_data('Breathless', 'Jean-Luc Godard'))`
			`print(get_movie_data('Capitalism: A Love Story', 'Michael Moore'))`
			`print(get_movie_data('Film Socialisme', 'Jean-Luc Godard'))`