python-ox/ox/web/itunes.py

189 lines
7.7 KiB
Python
Raw Normal View History

2010-07-07 23:25:57 +00:00
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
2014-09-30 19:27:26 +00:00
from __future__ import print_function
2010-07-07 23:25:57 +00:00
import re
2023-07-27 11:07:13 +00:00
from urllib.parse import urlencode
2010-07-07 23:25:57 +00:00
from ox.cache import read_url
from ox.html import decode_html, strip_tags
from ox.text import find_re
from ox.text import find_string
2010-07-07 23:25:57 +00:00
# to sniff itunes traffic, use something like
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=music&songTerm=&genreIndex=1&flavor=0&mediaType=2&composerTerm=&allArtistNames=Arcadia&ringtone=0&searchButton=submit
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=movie&movieTerm=The%20Matrix&descriptionTerm=&ratingIndex=1&mediaType=3&directorProducerName=Andy%20Wachowski&flavor=0&releaseYearTerm=1999&closedCaption=0&actorTerm=&searchButton=submit
# HTTP request headers passed to read_url() for every iTunes Store request
# made in this module. The User-Agent identifies the client as iTunes 7.6.2.
# NOTE(review): '143441-1' presumably selects a specific storefront
# (country/language) -- confirm before changing.
ITUNES_HEADERS = {
    'X-Apple-Tz': '0',
    'X-Apple-Storefront': '143441-1',
    'User-Agent': 'iTunes/7.6.2 (Macintosh; U; Intel Mac OS X 10.5.2)',
    'Accept-Language': 'en-us, en;q=0.50',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
}
2012-08-15 15:15:40 +00:00
def compose_url(request, parameters):
    """Build the iTunes Store URL for a search or detail-page request.

    request -- one of 'advancedSearch', 'viewAlbum' or 'viewMovie'.
    parameters -- dict of request-specific values:
        'advancedSearch': 'media' ('music' or 'movie'), plus 'title' and
            'artist' for music, or 'title' and 'director' for movies;
        'viewAlbum' / 'viewMovie': 'id'.

    Returns the URL as a string. For 'advancedSearch' with a 'media' value
    other than 'music' or 'movie', the bare search endpoint (ending in '?')
    is returned without any query parameters.

    Raises ValueError if request is not one of the supported names, and
    KeyError if a required entry is missing from parameters.
    """
    if request == 'advancedSearch':
        url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
        media = parameters['media']
        if media == 'music':
            url += urlencode({
                'albumTerm': parameters['title'],
                'allArtistNames': parameters['artist'],
                'composerTerm': '',
                'flavor': 0,
                'genreIndex': 1,
                'media': 'music',
                'mediaType': 2,
                'ringtone': 0,
                'searchButton': 'submit',
                'songTerm': ''
            })
        elif media == 'movie':
            url += urlencode({
                'actorTerm': '',
                'closedCaption': 0,
                'descriptionTerm': '',
                'directorProducerName': parameters['director'],
                'flavor': 0,
                'media': 'movie',
                'mediaType': 3,
                'movieTerm': parameters['title'],
                'ratingIndex': 1,
                'releaseYearTerm': '',
                'searchButton': 'submit'
            })
        return url
    if request == 'viewAlbum':
        return 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
    if request == 'viewMovie':
        return 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
    # Fail with a clear message instead of an UnboundLocalError on `url`.
    raise ValueError('unknown request: %r' % (request,))
2012-08-15 15:15:40 +00:00
def parse_xml_dict(xml):
    """Parse the <key>...</key><type>...</type> pairs of a plist-style fragment.

    The fragment is split on '<key>'; for every piece that contains
    '</key>', the key name and the tag that directly follows '</key>' are
    extracted with find_re(). The tag must follow '</key>' with no
    whitespace in between, because the pattern is '</key><(.*?)>'.

    Value conversion by tag:
        'true/'   -> True
        'false/'  -> False
        'integer' -> int(...)
        'string'  -> decode_html(...)
        other     -> the raw text between <tag> and </tag>

    Returns a dict mapping key names to converted values. Nested
    structures are not handled; the input is treated as a flat list.

    Raises ValueError if an <integer> value is not a valid integer literal.
    """
    values = {}
    for chunk in xml.split('<key>'):
        if '</key>' not in chunk:
            continue
        key = find_re(chunk, '(.*?)</key>')
        tag = find_re(chunk, '</key><(.*?)>')
        if tag == 'true/':
            value = True
        elif tag == 'false/':
            # Self-closing boolean: there is no closing tag to search for.
            value = False
        else:
            value = find_re(chunk, '<%s>(.*?)</%s>' % (tag, tag))
            if tag == 'integer':
                value = int(value)
            elif tag == 'string':
                value = decode_html(value)
        values[key] = value
    return values
2012-08-15 15:15:40 +00:00
def parse_cast(xml, title):
    """Extract the names listed under a cast/crew heading of a movie page.

    xml -- the page markup to search.
    title -- heading name such as 'actors' or 'directors'. Its last
        character is dropped and the rest upper-cased before matching
        (presumably so singular and plural headings both match -- confirm).

    The section between the heading and the next '</VBoxView>' is split on
    '</GotoURL>'; the trailing remainder after the last '</GotoURL>' is
    discarded and one name is read from each remaining piece.

    Returns a list of names. Parsing is best-effort: if an error occurs,
    whatever has been collected so far (possibly an empty list) is returned.
    """
    names = []
    try:
        pattern = '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()
        entries = find_re(xml, pattern).split('</GotoURL>')
        entries.pop()
        for entry in entries:
            names.append(find_re(entry, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
    except Exception:
        # Deliberately best-effort, but unlike a bare `except:` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return names
    return names
2012-08-15 15:15:40 +00:00
def parse_movies(xml, title):
    """Extract the movies listed under a heading of a movie page.

    xml -- the page markup to search.
    title -- heading name such as 'related movies'. Its last character is
        dropped and the rest upper-cased before matching.

    The section between the heading and the next '</Test>' is split on
    '</GotoURL>'; the trailing remainder after the last '</GotoURL>' is
    discarded and one entry is read from each remaining piece.

    Returns a list of dicts with the keys 'id' (taken from the
    'viewMovie?id=' link) and 'title'. Parsing is best-effort: if an error
    occurs, whatever has been collected so far is returned.
    """
    movies = []
    try:
        pattern = '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()
        entries = find_re(xml, pattern).split('</GotoURL>')
        entries.pop()
        for entry in entries:
            movies.append({
                'id': find_re(entry, r'viewMovie\?id=(.*?)&'),
                'title': find_re(entry, r'<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
            })
    except Exception:
        # Deliberately best-effort, but unlike a bare `except:` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return movies
    return movies
class ItunesAlbum:
    """An album in the iTunes Store, addressed by id or by title and artist.

    When no id is given, the constructor runs an advanced search for the
    title/artist pair and stores the first album id found in the result.
    """

    def __init__(self, id = '', title = '', artist = ''):
        self.id = id
        self.title = title
        self.artist = artist
        # Resolve the id through a search only when the caller gave none.
        if not self.id:
            self.id = self.get_id()

    def get_id(self):
        """Search by title and artist; return the first 'viewAlbum' id found."""
        query = {'media': 'music', 'title': self.title, 'artist': self.artist}
        response = read_url(compose_url('advancedSearch', query), headers = ITUNES_HEADERS)
        return find_re(response, r'viewAlbum\?id=(.*?)&')

    def get_data(self):
        """Fetch the album page and return its scraped fields as a dict.

        Keys: 'id', 'albumName', 'artistName', 'coverUrl', 'genre',
        'releaseDate', 'review', 'tracks' (one parsed dict per '<dict>'
        piece of the items section) and 'type'.
        """
        page = read_url(compose_url('viewAlbum', {'id': self.id}), None, ITUNES_HEADERS)
        review_markup = find_re(page, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>')
        items_markup = find_re(page, '<key>items</key>.*?<dict>(.*?)$')
        data = {
            'id': self.id,
            'albumName': find_re(page, '<B>(.*?)</B>'),
            'artistName': find_re(page, '<b>(.*?)</b>'),
            'coverUrl': find_re(page, 'reflection="." url="(.*?)"'),
            'genre': find_re(page, 'Genre:(.*?)<'),
            'releaseDate': find_re(page, 'Released(.*?)<'),
            'review': strip_tags(review_markup),
            'tracks': [parse_xml_dict(piece) for piece in items_markup.split('<dict>')],
        }
        data['type'] = find_re(page, '<key>listType</key><string>(.*?)<')
        return data
class ItunesMovie:
    """A movie in the iTunes Store, addressed by id or by title and director.

    When no id is given, the constructor runs an advanced search for the
    title/director pair and stores the first movie id found in the result.
    """

    def __init__(self, id = '', title = '', director = ''):
        self.id = id
        self.title = title
        self.director = director
        # Resolve the id through a search only when the caller gave none.
        if not id:
            self.id = self.get_id()

    def get_id(self):
        """Search by title and director; return the first 'viewMovie' id found."""
        url = compose_url('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
        xml = read_url(url, headers = ITUNES_HEADERS)
        return find_re(xml, r'viewMovie\?id=(.*?)&')

    def get_data(self):
        """Fetch the movie page and return its scraped fields as a dict.

        Keys: 'id', 'actors', 'averageRating', 'directors', 'format',
        'genre', 'plotSummary', 'posterUrl', 'producers', 'rated',
        'relatedMovies', 'releaseDate', 'runTime', 'screenwriters',
        'soundtrackId' and 'trailerUrl'.

        The page is only parsed, never written to disk, so the method does
        not depend on any local filesystem path.
        """
        data = {'id': self.id}
        url = compose_url('viewMovie', {'id': self.id})
        xml = read_url(url, None, ITUNES_HEADERS)
        data['actors'] = parse_cast(xml, 'actors')
        rating_markup = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
        # One point per full-star image, half a point per '&#189;' entity.
        data['averageRating'] = rating_markup.count('rating_star_000033.png') + rating_markup.count('&#189;') * 0.5
        data['directors'] = parse_cast(xml, 'directors')
        data['format'] = find_re(xml, 'Format:(.*?)<')
        data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
        data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
        data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
        data['producers'] = parse_cast(xml, 'producers')
        data['rated'] = find_re(xml, 'Rated(.*?)<')
        data['relatedMovies'] = parse_movies(xml, 'related movies')
        data['releaseDate'] = find_re(xml, 'Released(.*?)<')
        data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
        data['screenwriters'] = parse_cast(xml, 'screenwriters')
        data['soundtrackId'] = find_re(xml, r'viewAlbum\?id=(.*?)&')
        data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
        return data
if __name__ == '__main__':
    # Manual smoke test: look up one album and one movie, then follow the
    # movie's related titles and finally fetch a movie by a fixed id.
    from ox.utils import json

    def _show(record):
        # Pretty-print one result dict as sorted, indented JSON.
        print(json.dumps(record, sort_keys = True, indent = 4))

    album = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').get_data()
    _show(album)

    movie = ItunesMovie(title = 'The Matrix', director = 'Wachowski').get_data()
    _show(movie)

    for related in movie['relatedMovies']:
        _show(ItunesMovie(id = related['id']).get_data())

    _show(ItunesMovie(id='272960052').get_data())
2010-07-07 23:25:57 +00:00