Switch to python3

This commit is contained in:
j 2014-09-30 18:15:32 +02:00
commit 9ba4b6a91a
5286 changed files with 677347 additions and 576888 deletions

View file

@@ -0,0 +1,9 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
__version__ = '1.0.0'
from . import imdb
from . import wikipedia
from . import google
from . import piratecinema
from . import oxdb

View file

@@ -0,0 +1,20 @@
from ox.cache import read_url
import re
import lxml.html
def get_data(id):
info = {}
base = 'http://www.abebooks.com'
url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
    data = read_url(url, unicode=True)
urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
if urls:
details = '%s%s' % (base, urls[0])
data = read_url(details)
doc = lxml.html.document_fromstring(data)
for e in doc.xpath("//*[contains(@id, 'biblio')]"):
key = e.attrib['id'].replace('biblio-', '')
value = e.text_content()
if value and key not in ('bookcondition', 'binding'):
info[key] = value
return info
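A minimal usage sketch for the module above, assuming it is importable as ox.web.abebooks; the ISBN is only an illustrative placeholder:

from ox.web import abebooks  # assumed module path

info = abebooks.get_data('0140449132')  # placeholder ISBN
print(info.get('title'), info.get('publisher'))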

View file

@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox import strip_tags, find_re
from ox.cache import read_url
def get_id(url):
return url.split("/")[-1]
def get_data(id):
'''
>>> get_data('129689')['cast'][1][1]
u'Marianne'
>>> get_data('129689')['credits'][0][0]
u'Jean-Luc Godard'
>>> get_data('129689')['posters'][0]
u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
>>> get_data('129689')['rating']
u'4.5'
'''
if id.startswith('http'):
id = get_id(id)
data = {
"url": get_url(id)
}
html = read_url(data["url"], unicode=True)
data['aka'] = parse_list(html, 'AKA')
data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
data['countries'] = parse_list(html, 'countries')
data['director'] = parse_entry(html, 'directed by')
data['genres'] = parse_list(html, 'genres')
data['keywords'] = parse_list(html, 'keywords')
data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
data['produced'] = parse_list(html, 'produced by')
data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
data['released'] = parse_entry(html, 'released by')
data['releasedate'] = parse_list(html, 'release date')
data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
data['set'] = parse_entry(html, 'set in')
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['themes'] = parse_list(html, 'themes')
data['types'] = parse_list(html, 'types')
data['year'] = find_re(html, '<span class="year">.*?(\d+)')
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
#data['cast'] = parse_table(html)
#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
#data['credits'] = parse_table(html)
html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
return data
def get_url(id):
return "http://allmovie.com/work/%s" % id
def parse_entry(html, title):
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
return strip_tags(html).strip()
def parse_list(html, title):
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
    r = list(map(strip_tags, re.compile('<li>(.*?)</li>', re.DOTALL).findall(html)))
if not r and html:
r = [strip_tags(html)]
return r
def parse_table(html):
return [
[
strip_tags(r).strip().replace('&nbsp;', '')
for r in x.split('<td width="305">-')
]
for x in find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
]
def parse_text(html, title):
return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
if __name__ == '__main__':
    print(get_data('129689'))
    # print(get_data('177524'))

View file

@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves.urllib.parse import quote
from ox import find_re, strip_tags, decode_html
from ox.cache import read_url
import lxml
def findISBN(title, author):
q = '%s %s' % (title, author)
url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
data = read_url(url, unicode=True)
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
    id = find_re(links[0], '/dp/(.*?)/')
data = get_data(id)
if author in data['authors']:
return data
return {}
def get_data(id):
url = "http://www.amazon.com/title/dp/%s/" % id
data = read_url(url, unicode=True)
def find_data(key):
return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
r = {}
r['amazon'] = url
r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
r['authors'] = []
doc = lxml.html.document_fromstring(data)
for e in doc.xpath("//span[contains(@class, 'author')]"):
        print(e)
for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
if 'Author' in secondary.text:
author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
if author:
r['authors'].append(author[0].text.strip())
else:
r['authors'].append(e.xpath('.//a')[0].text.strip())
break
elif 'Translator' in secondary.text:
r['translator'] = [e.xpath('.//a')[0].text]
break
r['publisher'] = find_data('Publisher')
r['language'] = find_data('Language')
r['isbn-10'] = find_data('ISBN-10')
r['isbn-13'] = find_data('ISBN-13').replace('-', '')
r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
r['pages'] = find_data('Paperback')
if not r['pages']:
r['pages'] = find_data('Hardcover')
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
for e in doc.xpath('//noscript'):
for c in e.getchildren():
if c.tag == 'div':
r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
break
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
if r['cover']:
r['cover'] = r['cover'][0].split('._BO2')[0]
if not r['cover'].endswith('.jpg'):
r['cover'] = r['cover'] + '.jpg'
if 'no-image-avail-img' in r['cover']:
del r['cover']
else:
del r['cover']
return r
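A usage sketch for findISBN/get_data above, assuming the module path ox.web.amazon; the title and author are placeholder queries:

from ox.web import amazon  # assumed module path

book = amazon.findISBN('Ulysses', 'James Joyce')  # placeholder query
if book:
    print(book['title'], book['isbn-10'])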

View file

@@ -0,0 +1,67 @@
import json
import re
from ox.cache import read_url
HEADERS = {
'User-Agent': 'iTunes/10.4 (Macintosh; Intel Mac OS X 10.7) AppleWebKit/534.48.3',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-us, en;q=0.50',
'X-Apple-Store-Front': '143441-1,12',
'X-Apple-Tz': '7200',
'Accept-Encoding': 'gzip, deflate'
}
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
def get_movie_data(title, director):
    if isinstance(title, bytes):
        title = title.decode('utf-8')
    if isinstance(director, bytes):
        director = director.decode('utf-8')
data = {}
# itunes section (preferred source for link)
url = 'http://ax.search.itunes.apple.com/WebObjects/MZSearch.woa/wa/advancedSearch'
url += '?media=movie&movieTerm=' + title
url += '&actorNames=&directorProducerName=' + director
url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'
HEADERS['Referer'] = url
html = read_url(url, headers=HEADERS, unicode=True)
regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'
regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'
results = re.compile(regexp).findall(html)
if results:
data['link'] = results[0][0]
data['poster'] = results[0][1].replace('140x140', '600x600')
html = read_url(data['link'], headers=HEADERS, unicode=True)
results = re.compile('video-preview-url="(.*?)"').findall(html)
if results:
data['trailer'] = results[0]
# trailers section (preferred source for poster and trailer)
host = 'http://trailers.apple.com'
url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title
js = json.loads(read_url(url, unicode=True)[16:-4])
results = js['results']
if results:
url = host + results[0]['location']
if not 'link' in data:
data['link'] = url
headers = {
'User-Agent': USER_AGENT
}
html = read_url(url, headers=headers, unicode=True)
results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
if results:
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
if results:
data['trailer'] = results[-1]
return data
if __name__ == '__main__':
    print(get_movie_data('Alphaville', 'Jean-Luc Godard'))
    print(get_movie_data('Sin City', 'Robert Rodriguez'))
    print(get_movie_data('Breathless', 'Jean-Luc Godard'))
    print(get_movie_data('Capitalism: A Love Story', 'Michael Moore'))
    print(get_movie_data('Film Socialisme', 'Jean-Luc Godard'))

View file

@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from .. import cache
from ..utils import json
def get_id(url):
return url.split("/")[-1]
def get_url(id):
return "http://www.archive.org/details/%s" % id
def get_data(id):
data = {}
url = get_url(id)
    details = cache.read_url('%s?output=json' % url, unicode=True)
details = json.loads(details)
for key in ('title', 'description', 'runtime'):
data[key] = details['metadata'][key]
if isinstance(data[key], list):
data[key] = data[key][0]
data['url'] = url
data['image'] = 'http://archive.org/download/%s/format=thumbnail' % id
data['ogg'] = 'http://archive.org/download/%s/format=Ogg+video' % id
data['mp4'] = 'http://archive.org/download/%s/format=512Kb+MPEG4' % id
return data
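A usage sketch, assuming the module path ox.web.archive; the item id is a placeholder:

from ox.web import archive  # assumed module path

info = archive.get_data('night_of_the_living_dead')  # placeholder item id
print(info['title'], info['mp4'])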

View file

@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import json
import os
import re
from ox import find_re, strip_tags
from ox.cache import read_url
def get_data(id, language='en'):
if language == 'de':
url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d/lang/de_DE' % id
else:
url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d' % id
html = read_url(url, unicode=True)
if 'ID does not exist' in html:
return None
if 'Willkommen in der Datenbank des Arsenal' in html:
return None
data = {}
data[u'id'] = id
data[u'url'] = url
m = re.compile('<h1>(.*?)</h1>').findall(html)
if m:
data[u'title'] = m[0]
m = re.compile("<b>Director: </b><a href='.*?'>(.*?)</a>").findall(html)
if m:
data[u'director'] = m[0]
m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
if m:
data[u'image'] = m[0]
units = re.compile("<div class='unit'>(.*?)</div>", re.DOTALL).findall(html)
for x in map(re.compile('<b>(.*?)</b>: (.*)', re.DOTALL).findall, units):
if x:
#data[x[0][0].lower()] = strip_tags(x[0][1])
key = x[0][0].lower()
data[key] = x[0][1]
if key == "forum catalogue pdf":
data[key] = find_re(data[key], '"(http:.*?)"')
else:
data[key] = strip_tags(data[key])
if "running time (minutes)" in data:
data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60
for key in ('year', 'length in metres', 'forum participation year', 'number of reels'):
if key in data and data[key].isdigit():
data[key] = int(data[key])
return data
def backup(filename):
if os.path.exists(filename):
with open(filename) as f:
data = json.load(f)
else:
data = {}
    start = max(map(int, data)) if data else 1
for i in range(start, 11872):
info = get_data(i)
if info:
data[i] = info
if len(data) % 10 == 0:
                print('save', filename, len(data))
with open(filename, 'w') as f:
json.dump(data, f)
else:
            print('ignore', i)
with open(filename, 'w') as f:
json.dump(data, f)
return data
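A usage sketch, assuming the module path ox.web.arsenalberlin; the object id and filename below are placeholders:

from ox.web import arsenalberlin  # assumed module path

info = arsenalberlin.get_data(123)  # placeholder object id
if info:
    print(info['title'], info.get('runtime'))
arsenalberlin.backup('arsenal.json')  # crawls ids up to 11871, resuming from the highest saved id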

View file

@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2009
import os
from ox.utils import json
def get(key):
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
auth = {}
if os.path.exists(user_auth):
f = open(user_auth, "r")
data = f.read()
f.close()
auth = json.loads(data)
if key in auth:
return auth[key]
print "please add key %s to json file '%s'" % (key, user_auth)
raise Exception,"no key %s found" % key
def update(key, value):
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
auth = {}
if os.path.exists(user_auth):
f = open(user_auth, "r")
data = f.read()
f.close()
auth = json.loads(data)
auth[key] = value
f = open(user_auth, "w")
f.write(json.dumps(auth, indent=2))
f.close()
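A usage sketch, assuming the module path ox.web.auth; the key name and value are illustrative:

from ox.web import auth  # assumed module path

auth.update('example.username', 'me')  # placeholder key/value
print(auth.get('example.username'))    # raises if the key is missing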

View file

@@ -0,0 +1,100 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import ox.cache
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import find_re
import imdb
def get_id(url):
return url.split("/")[-1]
def get_url(id):
return "http://www.criterion.com/films/%s" % id
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
'''
>>> get_data('1333').get('imdbId')
u'0060304'
>>> get_data('236')['posters'][0]
u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'
>>> get_data('786')['posters'][0]
u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
'''
data = {
"url": get_url(id)
}
try:
html = read_url(data["url"], timeout=timeout, unicode=True)
except:
html = ox.cache.read_url(data["url"], timeout=timeout)
data["number"] = find_re(html, "<li>Spine #(\d+)")
data["title"] = find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>")
data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
results = find_re(html, '<div class="left_column">(.*?)</div>')
results = re.compile("<li>(.*?)</li>").findall(results)
data["country"] = results[0]
data["year"] = results[1]
data["synopsis"] = strip_tags(find_re(html, "<div class=\"content_block last\">.*?<p>(.*?)</p>"))
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
if r:
result = r[0]
result = find_re(result, "<a href=\"(.*?)\"")
if not "/boxsets/" in result:
data["posters"] = [result]
else:
html_ = read_url(result, unicode=True)
result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = find_re(result, "src=\"(.*?)\"")
if result:
data["posters"] = [result.replace("_w100", "")]
else:
data["posters"] = []
data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result:
data["stills"] = [result]
data["trailers"] = []
else:
data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
if timeout == ox.cache.cache_timeout:
timeout = -1
if get_imdb:
# removed year, as "title (year)" may fail to match
data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
return data
def get_ids(page=None):
ids = []
if page:
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
        html = read_url(url, unicode=True)
results = re.compile("films/(\d+)").findall(html)
ids += results
results = re.compile("boxsets/(.*?)\"").findall(html)
for result in results:
html = read_url("http://www.criterion.com/boxsets/" + result)
results = re.compile("films/(\d+)").findall(html)
ids += results
return set(ids)
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
results = re.compile("\&amp;p=(\d+)\&").findall(html)
pages = max(map(int, results))
for page in range(1, pages):
ids += get_ids(page)
return sorted(set(ids), key=int)
if __name__ == '__main__':
    print(get_ids())

View file

@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves.urllib.parse import unquote
from ox.cache import read_url
def get_video_url(url):
'''
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
'''
    data = read_url(url, unicode=True)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
v = unquote(v).split('@@')[0]
return v
return ''

View file

@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves import urllib
import ox
from ox import strip_tags, decode_html
from ox.cache import read_url
def find(query, timeout=ox.cache.cache_timeout):
if not isinstance(query, bytes):
query = query.encode('utf-8')
params = urllib.parse.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params
data = read_url(url, timeout=timeout).decode('utf-8')
results = []
regex = '<a .*?class="large" href="(.+?)">(.*?)</a>.*?<div class="snippet">(.*?)</div>'
for r in re.compile(regex, re.DOTALL).findall(data):
results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
return results
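A usage sketch, assuming the module path ox.web.duckduckgo:

from ox.web import duckduckgo  # assumed module path

for title, url, snippet in duckduckgo.find('The Matrix site:imdb.com'):
    print(title, url)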

View file

@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from ox import strip_tags, find_re
from ox.cache import read_url
import google
def get_show_url(title):
    '''
    Search for the epguides.com URL of a show via its title.
    Uses Google to find the URL; epguides itself searches the same way.
    '''
for (name, url, desc) in google.find('allintitle: site:epguides.com %s' % title, 1):
if url.startswith('http://epguides.com'):
if re.search(title, name):
return url
return None
def get_show_data(url):
data = read_url(url, unicode=True)
r = {}
r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
r['episodes'] = {}
#1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
air_date = episode[3].strip()
#'22 Sep 04' -> 2004-09-22
try:
air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
except:
pass
s = episode[1].split('-')[0].strip()
e = episode[1].split('-')[-1].strip()
try:
r['episodes']['S%02dE%02d' % (int(s), int(e))] = {
'prod code': episode[2],
'air date': air_date,
'url': episode[4],
'title':episode[5],
}
except:
print "oxweb.epguides failed,", url
return r
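A usage sketch chaining the two functions above, assuming the module path ox.web.epguides; the show title is a placeholder:

from ox.web import epguides  # assumed module path

url = epguides.get_show_url('The Wire')  # placeholder title
if url:
    show = epguides.get_show_data(url)
    print(show['title'], show['imdb'], len(show['episodes']))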

View file

@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import string
import subprocess
import ox
import os
def get_ids():
result = []
for i in string.ascii_uppercase:
url = "http://www.filmsdivision.org/search.php?title=%s" % i
        data = ox.cache.read_url(url, unicode=True)
links = re.compile('view_video.php\?movId=(.*?)[\'"]', re.DOTALL).findall(data)
result += links
return list(set(result))
def get_data(id):
result = {}
url = "http://www.filmsdivision.org/view_video.php?movId=%s" % id
    data = ox.cache.read_url(url, unicode=True)
result['title'] = re.compile('<td.*?class="vdoheadtxt".*?>(.*?)</td>').findall(data)[0]
result['year'] = re.compile('Release: (\d{4})').findall(data)[0]
result['duration'] = int(re.compile('Duration: (\d+)mins').findall(data)[0]) * 60
result['producer'] = re.compile('Producer: (.*?)\t').findall(data)[0].strip()
if 'Director:' in data:
result['director'] = re.compile('Director: (.*?)\t').findall(data)[0].strip()
else:
result['director'] = "Unknown Director"
result['url'] = re.compile('value="(.*?.wmv)"').findall(data)[0]
return result
def download_video(url, filename):
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.makedirs(dirname)
    p = subprocess.Popen(['gst-launch', 'mmssrc', 'location=%s' % url, '!', 'filesink', 'location=%s' % filename])
p.wait()
return p.returncode == 0
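A usage sketch, assuming the module path ox.web.filmsdivision and that gst-launch is installed for download_video; the output path is a placeholder:

from ox.web import filmsdivision  # assumed module path

ids = filmsdivision.get_ids()
info = filmsdivision.get_data(ids[0])
filmsdivision.download_video(info['url'], '/tmp/%s.wmv' % ids[0])  # requires gst-launch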

View file

@ -0,0 +1,74 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from lxml.html import document_fromstring
from ox.cache import read_url
from ox import find_re, strip_tags
from ox.web.imdb import ImdbCombined
def get_data(id, timeout=-1):
'''
>>> get_data('the-matrix')['poster']
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
>>> get_data('0133093')['poster']
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
>>> get_data('2-or-3-things-i-know-about-her')['poster']
'http://content6.flixster.com/movie/10/95/43/10954392_gal.jpg'
>>> get_data('0078875')['rottentomatoes_id']
'http://www.rottentomatoes.com/m/the-tin-drum/'
'''
if len(id) == 7:
try:
int(id)
id = get_id(imdb=id)
except:
pass
data = {
"url": get_url(id),
}
html = read_url(data['url'], timeout=timeout, unicode=True)
doc = document_fromstring(html)
props = {
'og:title': 'title',
'og:image': 'poster',
'og:url': 'rottentomatoes_id',
}
for meta in doc.head.findall('meta'):
prop = meta.attrib.get('property', None)
content = meta.attrib.get('content', '')
if prop in props and content:
data[props[prop]] = content
for p in doc.body.find_class('synopsis'):
data['synopsis'] = p.text.strip()
if 'poster' in data and data['poster']:
data['poster'] = data['poster'].replace('_pro.jpg', '_gal.jpg')
if not 'title' in data:
return None
return data
def get_id(url=None, imdb=None):
'''
>>> get_id(imdb='0133093')
u'the-matrix'
#>>> get_id(imdb='0060304')
#u'2-or-3-things-i-know-about-her'
'''
if imdb:
i = ImdbCombined(imdb)
title = i['title']
return title.replace(' ', '-').lower().replace("'", '')
return url.split('/')[-1]
def get_url(id):
return "http://www.flixster.com/movie/%s"%id

View file

@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import json
from ox.cache import read_url
from ox import find_re
class Freebase(dict):
def __init__(self, id, timeout=-1):
url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id
'''
"http://graph.freebase.com/imdb.title.tt%s" % id
        might also be of interest at some point; right now it returns little info
'''
data = read_url(url, unicode=True)
try:
data = json.loads(data)
except ValueError:
return
'''
for key in data:
self[key] = data[key]
'''
for key in ('id', 'guid', 'name'):
self[key] = data[key]
keys = {
'wikipedia': '/wikipedia/en',
'netflix': '/authority/netflix/movie',
'nytimes': '/source/nytimes/movie',
'metacritic': '/source/metacritic/movie',
}
for key in keys:
            links = [x for x in data['ids'] if x['namespace'] == keys[key]]
if links:
self[key] = links[0]['uri']
if 'nytimes' in self:
self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')
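A usage sketch, assuming the module path ox.web.freebase; '0133093' is the IMDb id for The Matrix used elsewhere in this commit:

from ox.web.freebase import Freebase  # assumed module path

m = Freebase('0133093')  # The Matrix
print(m.get('wikipedia'), m.get('netflix'))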

View file

@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves import urllib
import ox
from ox import strip_tags, decode_html
DEFAULT_MAX_RESULTS = 10
DEFAULT_TIMEOUT = 24*60*60
def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
return ox.cache.read_url(url, data, headers, timeout, unicode=True)
def quote_plus(s):
if not isinstance(s, bytes):
s = s.encode('utf-8')
return urllib.parse.quote_plus(s)
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
"""
Return max_results tuples with title, url, description
>>> find("The Matrix site:imdb.com", 1)[0][0]
u'The Matrix (1999) - IMDb'
>>> find("The Matrix site:imdb.com", 1)[0][1]
u'http://www.imdb.com/title/tt0133093/'
"""
results = []
offset = 0
while len(results) < max_results:
url = 'http://google.com/search?q=%s' % quote_plus(query)
if offset:
url += '&start=%d' % offset
data = read_url(url, timeout=timeout)
data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
if len(results) >= max_results:
break
offset += 10
return results

View file

@@ -0,0 +1,821 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
import time
import unicodedata
from six.moves import urllib
from six import string_types
from .. import find_re, strip_tags, decode_html
from .. import cache
from . siteparser import SiteParser
from . import duckduckgo
from ..utils import datetime
from ..geo import normalize_country_name
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
return cache.read_url(url, data, headers, timeout, unicode=unicode)
def get_url(id):
return "http://www.imdb.com/title/tt%s/" % id
class Imdb(SiteParser):
'''
>>> Imdb('0068646')['title']
u'The Godfather'
>>> Imdb('0133093')['title']
u'The Matrix'
'''
regex = {
'alternativeTitles': {
'page': 'releaseinfo',
're': [
'name="akas".*?<table.*?>(.*?)</table>',
"td>(.*?)</td>.*?<td>(.*?)</td>"
],
'type': 'list'
},
'aspectratio': {
'page': 'combined',
're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
'type': 'float',
},
'budget': {
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
],
'type': 'int'
},
'cast': {
'page': 'combined',
're': [
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
lambda ll: [strip_tags(l) for l in ll]
],
'type': 'list'
},
'cinematographer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Cinematography by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'connections': {
'page': 'trivia?tab=mc',
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
'type': 'list'
},
'country': {
'page': 'combined',
're': [
'<div class="info"><h5>Country:</h5>.*?<div class="info">',
#'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
'<a.*?>(.*?)</a>',
],
'type': 'list'
},
'creator': {
'page': 'combined',
're': [
'<h5>Creator.?:</h5>.*?<div class="info-content">(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'director': {
'page': 'combined',
're': [
lambda data: data.split('<b>Series Crew</b>')[0],
'Directed by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'_director': {
'page': 'combined',
're': [
'<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'editor': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Film Editing by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'composer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Original Music by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'episodeTitle': {
'page': 'combined',
're': '<div id="tn15title">.*?<em>(.*?)</em>',
'type': 'string'
},
'filmingLocations': {
'page': 'locations',
're': [
'<a href="/search/title\?locations=.*?".*?>(.*?)</a>',
lambda data: data.strip(),
],
'type': 'list'
},
'genre': {
'page': 'combined',
're': [
'<h5>Genre:</h5>(.*?)<hr',
'<a href="/Sections/Genres/.*?/">(.*?)</a>'
],
'type': 'list'
},
'gross': {
'page': 'business',
're': [
'<h5>Gross</h5>\s*?\$(.*?)<br',
lambda data: find_re(data.replace(',', ''), '\d+')
],
'type': 'int'
},
'keyword': {
'page': 'keywords',
're': '<a href="/keyword/.*?>(.*?)</a>',
'type': 'list'
},
'language': {
'page': 'combined',
're': [
'<div class="info"><h5>Language:</h5>.*?<div class="info">',
#'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
'<a.*?>(.*?)</a>',
],
'type': 'list'
},
'summary': {
'page': 'plotsummary',
're': '<p class="plotSummary">(.*?)<\/p>',
'type': 'string'
},
'posterId': {
'page': 'combined',
're': '/primary-photo/media/rm(.*?)/tt',
'type': 'string'
},
'posterIds': {
'page': 'posters',
're': '/unknown-thumbnail/media/rm(.*?)/tt',
'type': 'list'
},
'producer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Produced by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'productionCompany': {
'page': 'combined',
're': [
'Production Companies</b><ul>(.*?)</ul>',
'<a href="/company/.*?/">(.*?)</a>'
],
'type': 'list'
},
'rating': {
'page': 'combined',
're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
'type': 'float'
},
'releasedate': {
'page': 'releaseinfo',
're': [
'<td class="release_date">(.*?)</td>',
strip_tags,
],
'type': 'list'
},
'reviews': {
'page': 'externalreviews',
're': [
'<ol>(.*?)</ol>',
'<li><a href="(http.*?)".*?>(.*?)</a></li>'
],
'type': 'list'
},
'runtime': {
'page': 'combined',
're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
'type': 'string'
},
'color': {
'page': 'combined',
're': [
'<h5>Color:</h5><div class="info-content">(.*?)</div>',
'<a.*?>(.*?)</a>'
],
'type': 'list'
},
'sound': {
'page': 'combined',
're': [
'<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
'<a.*?>(.*?)</a>'
],
'type': 'list'
},
'season': {
'page': 'combined',
're': [
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
'\(Season (\d+), Episode \d+\)',
],
'type': 'int'
},
'episode': {
'page': 'combined',
're': [
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
'\(Season \d+, Episode (\d+)\)',
],
'type': 'int'
},
'series': {
'page': 'combined',
're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
'type': 'string'
},
'isSeries': {
'page': 'combined',
're': '<span class="tv-extra">(TV series|TV mini-series) ',
'type': 'string'
},
'title': {
'page': 'combined',
're': '<h1>(.*?) <span>',
'type': 'string'
},
'trivia': {
'page': 'trivia',
're': [
'<div class="sodatext">(.*?)<(br|/div)',
lambda data: data[0]
],
'type': 'list',
},
'votes': {
'page': 'combined',
're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
'type': 'string'
},
'writer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Writing credits</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'year': {
'page': 'combined',
're': '="og:title" content="[^"]*?\((\d{4}).*?"',
'type': 'int'
}
}
def read_url(self, url, timeout):
if not url in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url]
def __init__(self, id, timeout=-1):
#use akas.imdb.com to always get original title:
#http://www.imdb.com/help/show_leaf?titlelanguagedisplay
self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
super(Imdb, self).__init__(timeout)
url = self.baseUrl + 'combined'
page = self.read_url(url, timeout=-1)
if '<title>IMDb: Page not found</title>' in page \
or 'The requested URL was not found on our server.' in page:
return
if "<p>We're sorry, something went wrong.</p>" in page:
time.sleep(1)
super(Imdb, self).__init__(0)
if 'alternativeTitles' in self:
if len(self['alternativeTitles']) == 2 and \
isinstance(self['alternativeTitles'][0], string_types):
self['alternativeTitles'] = [self['alternativeTitles']]
#normalize country names
if 'country' in self:
self['country'] = [normalize_country_name(c) or c for c in self['country']]
if 'sound' in self:
self['sound'] = list(set(self['sound']))
types = {}
stop_words = [
'alternative spelling',
'alternative title',
'alternative transliteration',
'closing credits title',
'complete title',
'IMAX version',
'informal short title',
'International (Spanish title)',
'Japan (imdb display title)',
'longer version',
'new title',
'original subtitled version',
'pre-release title',
'promotional abbreviation',
'recut version',
'reissue title',
'restored version',
'script title',
'short title',
'(subtitle)',
'TV title',
'working title',
'World-wide (Spanish title)',
]
        #ignore 'Japan (English title)' entries
        #for movies that are not exclusively from Japan
if ['Japan'] != self.get('country', []):
stop_words += [
'Japan (English title)'
]
for t in self.get('alternativeTitles', []):
for type in t[0].split('/'):
type = type.strip()
stop_word = False
for key in stop_words:
if key in type:
stop_word = True
break
if not stop_word:
if not type in types:
types[type] = []
types[type].append(t[1])
titles = {}
for type in types:
for title in types[type]:
if not title in titles:
titles[title] = []
titles[title].append(type)
def select_title(type):
title = types[type][0]
count = 0
if len(types[type]) > 1:
for t in types[type]:
if len(titles[t]) > count:
count = len(titles[t])
title = t
return title
        types = {type: select_title(type) for type in types}
regexps = [
"^.+ \(imdb display title\) \(English title\)$",
"^USA \(imdb display title\)$",
"^International \(English title\)$",
"^International \(English title\)$",
"^UK \(imdb display title\)$",
"^International \(.+\) \(English title\)$",
"^World-wide \(English title\)$",
]
if 'Hong Kong' in self.get('country', []):
regexps += [
"Hong Kong \(English title\)"
]
english_countries = (
'USA', 'UK', 'United States', 'United Kingdom',
'Australia', 'New Zealand'
)
        if not any(c in english_countries for c in self.get('country', [])):
regexps += [
"^[^(]+ \(English title\)$",
"^.+ \(.+\) \(English title\)$",
"^USA$",
"^UK$",
"^USA \(.+\)$",
"^UK \(.+\)$",
"^Australia \(.+\)$",
"World-wide \(English title\)",
"\(literal English title\)",
"^International \(.+ title\)$",
"^International \(.+\) \(.+ title\)$",
]
for regexp in regexps:
for type in types:
if re.compile(regexp).findall(type):
#print types[type], type
self['internationalTitle'] = types[type]
break
if 'internationalTitle' in self:
break
def cleanup_title(title):
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
if title.startswith("'") and title.endswith("'"):
title = title[1:-1]
title = re.sub('\(\#[.\d]+\)', '', title)
return title.strip()
for t in ('title', 'internationalTitle'):
if t in self:
self[t] = cleanup_title(self[t])
if 'internationalTitle' in self and \
self.get('title', '').lower() == self['internationalTitle'].lower():
del self['internationalTitle']
if 'alternativeTitles' in self:
alt = {}
for t in self['alternativeTitles']:
title = cleanup_title(t[1])
if title not in (self.get('title'), self.get('internationalTitle')):
if title not in alt:
alt[title] = []
for c in t[0].split('/'):
if not '(working title)' in c:
c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
if c:
alt[title].append(c)
self['alternativeTitles'] = []
for t in sorted(alt, key=lambda a: sorted(alt[a])):
if alt[t]:
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
self['alternativeTitles'].append((t, countries))
if not self['alternativeTitles']:
del self['alternativeTitles']
if 'internationalTitle' in self:
self['originalTitle'] = self['title']
self['title'] = self.pop('internationalTitle')
if 'runtime' in self and self['runtime']:
if 'min' in self['runtime']: base=60
else: base=1
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
if 'runtime' in self and not self['runtime']:
del self['runtime']
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'cast' in self:
if isinstance(self['cast'][0], string_types):
self['cast'] = [self['cast']]
self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c):
c = c.replace('(uncredited)', '').strip()
return c
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
for x in self['cast']]
if 'connections' in self:
cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']]
for rel, data, _ in self['connections']:
if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
def get_conn(c):
r = {
'id': c[0],
'title': cleanup_title(c[1]),
}
description = c[2].split('<br />')
if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip()
return r
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
for key in ('country', 'genre'):
if key in self:
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999
if '_director' in self:
if 'series' in self or 'isSeries' in self:
self['creator'] = self.pop('_director')
else:
del self['_director']
if 'isSeries' in self:
del self['isSeries']
self['isSeries'] = True
if 'episodeTitle' in self:
self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle'])
if 'series' in self:
series = Imdb(self['series'], timeout=timeout)
self['seriesTitle'] = series['title']
if 'episodeTitle' in self:
self['seriesTitle'] = series['title']
if 'season' in self and 'episode' in self:
self['title'] = "%s (S%02dE%02d) %s" % (
self['seriesTitle'], self['season'], self['episode'], self['episodeTitle'])
else:
self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle'])
self['season'] = 1
self['title'] = self['title'].strip()
if 'director' in self:
self['episodeDirector'] = self['director']
if not 'creator' in series and 'director' in series:
series['creator'] = series['director']
if len(series['creator']) > 10:
series['creator'] = series['director'][:1]
for key in ['creator', 'country']:
if key in series:
self[key] = series[key]
if 'year' in series:
self['seriesYear'] = series['year']
if not 'year' in self:
self['year'] = series['year']
if 'year' in self:
self['episodeYear'] = self['year']
if 'creator' in self:
self['seriesDirector'] = self['creator']
if 'originalTitle' in self:
del self['originalTitle']
else:
for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'):
if key in self:
del self[key]
if 'creator' in self:
if 'director' in self:
self['episodeDirector'] = self['director']
self['director'] = self['creator']
#make lists unique but keep order
for key in ('director', 'language'):
if key in self:
self[key] = [x for i,x in enumerate(self[key])
if x not in self[key][i+1:]]
for key in ('actor', 'writer', 'producer', 'editor', 'composer'):
if key in self:
if isinstance(self[key][0], list):
self[key] = [i[0] for i in self[key] if i]
self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget']
if 'releasedate' in self:
def parse_date(d):
try:
d = datetime.strptime(d, '%d %B %Y')
except:
try:
d = datetime.strptime(d, '%B %Y')
except:
return 'x'
return '%d-%02d-%02d' % (d.year, d.month, d.day)
self['releasedate'] = min([
parse_date(d) for d in self['releasedate']
])
if self['releasedate'] == 'x':
del self['releasedate']
if 'summary' in self:
if isinstance(self['summary'], list):
self['summary'] = self['summary'][0]
self['summary'] = self['summary'].split('</p')[0].strip()
class ImdbCombined(Imdb):
def __init__(self, id, timeout=-1):
_regex = {}
for key in self.regex:
if self.regex[key]['page'] in ('combined', 'releaseinfo'):
_regex[key] = self.regex[key]
self.regex = _regex
super(ImdbCombined, self).__init__(id, timeout)
def get_movie_by_title(title, timeout=-1):
'''
    This only works for exact title matches from the data dump.
    Titles are usually in one of these formats:
        Title (Year)
        "Series Title" (Year) {(#Season.Episode)}
        "Series Title" (Year) {Episode Title (#Season.Episode)}
    If there is more than one film with that title for the year:
        Title (Year/I)
>>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')
u'1602860'
>>> get_movie_by_title(u'The Matrix (1999)')
u'0133093'
>>> get_movie_by_title(u'Little Egypt (1951)')
u'0043748'
>>> get_movie_by_title(u'Little Egypt (1897/I)')
u'0214882'
>>> get_movie_by_title(u'Little Egypt')
None
>>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
u'0866567'
'''
params = {'s':'tt','q': title}
if not isinstance(title, bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
params['q'] = params['q'].encode('utf-8')
    params = urllib.parse.urlencode(params)
url = "http://akas.imdb.com/find?" + params
data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
results = re.compile(r).findall(data)
if results:
return results[0]
return None
def get_movie_id(title, director='', year='', timeout=-1):
'''
>>> get_movie_id('The Matrix')
u'0133093'
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
u'0060304'
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
u'0060304'
>>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
u'0179214'
>>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
u'0179214'
'''
imdbId = {
(u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
(u'Wings', u'Larisa Shepitko'): '0061196',
(u'The Ascent', u'Larisa Shepitko'): '0075404',
(u'Fanny and Alexander', u'Ingmar Bergman'): '0083922',
(u'Torment', u'Alf Sj\xf6berg'): '0036914',
(u'Crisis', u'Ingmar Bergman'): '0038675',
(u'To Joy', u'Ingmar Bergman'): '0043048',
(u'Humain, trop humain', u'Louis Malle'): '0071635',
(u'Place de la R\xe9publique', u'Louis Malle'): '0071999',
(u'God\u2019s Country', u'Louis Malle'): '0091125',
(u'Flunky, Work Hard', u'Mikio Naruse'): '0022036',
(u'The Courtesans of Bombay', u'Richard Robbins') : '0163591',
(u'Je tu il elle', u'Chantal Akerman') : '0071690',
(u'Hotel Monterey', u'Chantal Akerman') : '0068725',
        (u'No Blood Relation', u'Mikio Naruse') : '0023261',
(u'Apart from You', u'Mikio Naruse') : '0024214',
(u'Every-Night Dreams', u'Mikio Naruse') : '0024793',
(u'Street Without End', u'Mikio Naruse') : '0025338',
(u'Sisters of the Gion', u'Kenji Mizoguchi') : '0027672',
(u'Osaka Elegy', u'Kenji Mizoguchi') : '0028021',
(u'Blaise Pascal', u'Roberto Rossellini') : '0066839',
(u'Japanese Girls at the Harbor', u'Hiroshi Shimizu') : '0160535',
(u'The Private Life of Don Juan', u'Alexander Korda') : '0025681',
(u'Last Holiday', u'Henry Cass') : '0042665',
(u'A Colt Is My Passport', u'Takashi Nomura') : '0330536',
(u'Androcles and the Lion', u'Chester Erskine') : '0044355',
(u'Major Barbara', u'Gabriel Pascal') : '0033868',
(u'Come On Children', u'Allan King') : '0269104',
(u'Jimi Plays Monterey & Shake! Otis at Monterey', u'D. A. Pennebaker and Chris Hegedus') : '',
(u'Martha Graham: Dance on Film', u'Nathan Kroll') : '',
(u'Carmen', u'Carlos Saura'): '0085297',
(u'The Story of a Cheat', u'Sacha Guitry'): '0028201',
(u'Weekend', 'Andrew Haigh'): '1714210',
}.get((title, director), None)
if imdbId:
return imdbId
params = {'s':'tt','q': title}
if director:
params['q'] = u'"%s" %s' % (title, director)
if year:
params['q'] = u'"%s (%s)" %s' % (title, year, director)
google_query = "site:imdb.com %s" % params['q']
if not isinstance(params['q'], bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
params['q'] = params['q'].encode('utf-8')
    params = urllib.parse.urlencode(params)
url = "http://akas.imdb.com/find?" + params
#print url
data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
results = re.compile(r).findall(data)
if results:
return results[0]
#otherwise get first result
r = '<td valign="top">.*?<a href="/title/tt(\d{7})/"'
results = re.compile(r).findall(data)
if results:
return results[0]
#print (title, director), ": '',"
#print google_query
#results = google.find(google_query, timeout=timeout)
results = duckduckgo.find(google_query, timeout=timeout)
if results:
for r in results[:2]:
imdbId = find_re(r[1], 'title/tt(\d{7})')
if imdbId:
return imdbId
#or nothing
return ''
def get_movie_poster(imdbId):
'''
>>> get_movie_poster('0133093')
'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
>>> get_movie_poster('0994352')
'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
'''
info = ImdbCombined(imdbId)
if 'posterId' in info:
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url).decode('utf-8', 'ignore')
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
return poster
elif 'series' in info:
return get_movie_poster(info['series'])
return ''
def get_episodes(imdbId, season=None):
episodes = {}
url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
if season:
url += '?season=%d' % season
        data = cache.read_url(url, unicode=True)
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
else:
        data = cache.read_url(url, unicode=True)
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
if match:
for season in range(1, int(match[0]) + 1):
episodes.update(get_episodes(imdbId, season))
return episodes
def max_votes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
    data = cache.read_url(url, unicode=True)
votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
return votes
def guess(title, director='', timeout=-1):
return get_movie_id(title, director, timeout=timeout)
if __name__ == "__main__":
import json
print(json.dumps(Imdb('0306414'), indent=2))
    #print(json.dumps(Imdb('0133093'), indent=2))

View file

@@ -0,0 +1,300 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import find_re
def get_data(id):
'''
>>> get_data('1991/silence_of_the_lambs')['imdbId']
u'0102926'
>>> get_data('1991/silence_of_the_lambs')['posters'][0]
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
>>> get_data('1991/silence_of_the_lambs')['url']
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
'''
data = {
'url': get_url(id)
}
html = read_url(data['url'], unicode=True)
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
if not data['imdbId']:
data['imdbId'] = _id_map.get(id, '')
data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
data['posters'] = []
poster = find_re(html, '<img src="(posters.*?)"')
if poster:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
data['posters'].append(poster)
results = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL).findall(html)
for result in results:
result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
result = find_re(html, '<a href = (\w*?_xlg.html)')
if result:
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
data['posters'].append(poster)
return data
def get_id(url):
split = url.split('/')
year = split[3]
split = split[4][:-5].split('_')
if split[-1] == 'xlg':
split.pop()
if find_re(split[-1], 'ver\d+$'):
split.pop()
id = '%s/%s' % (year, '_'.join(split))
return id
def get_ids(page=None):
ids = []
if page:
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
for result in results:
url = 'http://impawards.com/%s' % result
ids.append(get_id(url))
return set(ids)
#get all
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
for page in range(pages, 0, -1):
for id in get_ids(page):
if not id in ids:
ids.append(id)
return ids
def get_url(id):
url = u"http://www.impawards.com/%s.html" % id
html = read_url(url, unicode=True)
if find_re(html, "No Movie Posters on This Page"):
url = u"http://www.impawards.com/%s_ver1.html" % id
return url
_id_map = {
'1933/forty_second_street': '0024034',
'1933/tarzan_the_fearless': '0024645',
'1935/informer': '0026529',
    '1935/thirty_nine_steps': '0026029',
'1935/top_hat': '0027125',
'1938/charlie_chaplin_cavalcade': '0284687',
    '1943/falcon_and_the_co-eds': '0035855',
'1969/angel_angel_down_we_go': '0065602',
'1970/crimson_altar': '0062833',
'1975/man_who_would_be_king_ver1': '0073341',
'1975/picnic_at_hanging_rock_ver1': '0073540',
'1979/electric_horseman_ver1': '0079100',
'1980/caligula_ver1': '0080491',
'1980/hollywood_knights_ver1': '0080881',
'1981/history_of_the_world_part_i': '0082517',
'1981/sea_wolves': '0081470',
'1983/krull_ver1': '0085811',
'1985/warriors_of_the_wind': '0087544',
'1989/friday_the_thirteenth_part_viii_ver1': '0097388',
'1989/high_hopes': '0095302',
'1989/millenium': '0097883',
'1989/story_of_women': '0096336',
'1990/edward_scissorhands_ver1': '0099487',
'1991/freddys_dead_ver1': '0101917',
'1993/robocop_three_ver1': '0107978',
'1993/waynes_world_two_ver1': '0108525',
'1994/above_the_rim_ver1': '0109035',
'1994/helas_pour_moi': '0107175',
'1994/house_of_the_spirits_ver1': '0107151',
'1994/i_dont_want_to_talk_about_it': '0106678',
'1994/in_custody': '0107199',
'1994/ladybird_ladybird': '0110296',
'1994/leon_the_pig_farmer': '0104710',
'1994/love_after_love': '0103710',
'1994/l_six_two_seven': '0104658',
'1994/martin_lawrence_you_so_crazy_ver1': '0111804',
'1994/savage_nights': '0105032',
'1994/sex_drugs_and_democracy': '0111135',
'1995/bye_bye_love': '0112606',
'1995/cold_comfort_farm': '0112701',
'1995/gumby_the_movie': '0113234',
'1995/les_miserables': '0113828',
'1995/mystery_of_rampo': '0110943',
'1995/pharaohs_army': '0114122',
'1995/pure_formality': '0110917',
'1995/quick_and_the_dead_ver1': '0114214',
'1995/reflections_in_the_dark': '0110956',
'1995/safe_ver1': '0114323',
'1995/search_and_destroy': '0114371',
'1995/secret_of_roan_inish_ver1': '0111112',
'1995/underneath': '0114788',
'1996/ghost_in_the_shell': '0113568',
'1996/hate': '0113247',
'1996/horseman_on_the_roof': '0113362',
'1996/kids_in_the_hall_brain_candy': '0116768',
'1996/maybe_maybe_not': '0109255',
'1996/prisoner_of_the_mountains': '0116754',
'1997/fifth_element_ver1': '0119116',
'1997/fools_rush_in_ver1': '0119141',
'1997/gi_jane_ver1': '0119173',
'1997/happy_together_ver1': '0118845',
'1997/lilies': '0116882',
'1997/mouth_to_mouth': '0112546',
'1997/mr_nice_guy': '0117786',
'1997/nenette_and_boni': '0117221',
'1997/paperback_romance': '0110405',
'1997/second_jungle_book': '0120087',
'1997/single_girl': '0113057',
'1997/super_speedway': '0120245',
'1997/temptress_moon': '0116295',
'1998/alarmist': '0119534',
'1998/barneys_great_adventure_the_movie': '0120598',
'1998/bulworth_ver1': '0118798',
'1998/celebration': '0154420',
'1998/east_palace_west_palace': '0119007',
'1998/hurricane_streets': '0119338',
'1998/i_married_a_strange_person': '0119346',
'1998/inheritors': '0141824',
'1998/killing_time': '0140312',
'1998/live_flesh': '0118819',
'1998/music_from_another_room': '0119734',
'1998/post_coitum_ver1': '0119923',
'1998/steam_the_turkish_bath': '0119248',
'1998/velocity_of_gary': '0120878',
'1999/after_life': '0165078',
'1999/emperor_and_the_assassin': '0162866',
'1999/fantasia_two_thousand': '0120910',
'1999/get_bruce': '0184510',
'1999/god_said_ha': '0119207',
'1999/jawbreaker': '0155776',
'1999/jeanne_and_the_perfect_guy': '0123923',
'1999/king_and_i': '0160429',
'1999/lovers_of_the_arctic_circle': '0133363',
'1999/plunkett_and_macleane': '0134033',
'1999/pokemon_the_first_movie': '0190641',
'1999/school_of_flesh': '0157208',
'1999/splendor': '0127296',
'1999/stranger_in_the_kingdom': '0126680',
'1999/train_of_life': '0170705',
'1999/twice_upon_a_yesterday': '0138590',
'1999/whiteboys': '0178988',
'1999/wildfire': '0194544',
'1999/windhorse': '0169388',
'2000/claim': '0218378',
'2000/color_of_paradise': '0191043',
'2000/criminal_lovers': '0205735',
'2000/everlasting_piece': '0218182',
'2000/girl_on_the_bridge_ver1': '0144201',
'2000/godzilla_two_thousand': '0188640',
'2000/goya_in_bordeaux': '0210717',
'2000/mad_about_mambo': '0156757',
'2000/picking_up_the_pieces': '0192455',
'2000/pokemon_the_movie_2000': '0257001',
'2000/seven_days_to_live': '0221928',
'2000/south_of_heaven_west_of_hell': '0179473',
'2000/suzhou_river': '0234837',
'2000/time_for_drunken_horses': '0259072',
'2000/venus_beauty_institute': '0174330',
'2001/circle': '0368646',
'2001/devils_backbone': '0256009',
'2001/kill_me_later': '0243595',
'2001/king_is_dancing': '0244173',
'2001/learning_curve': '0219126',
'2001/marco_polo__return_to_xanadu_ver1': '0296074',
'2001/me_you_them': '0244504',
'2001/our_lady_of_the_assassins': '0250809',
'2001/pinero': '0261066',
'2001/pokemon_three_the_movie_ver1': '0266860',
'2001/scratch': '0143861',
'2001/vampire_hunter_d_bloodlust_ver1': '0216651',
'2002/el_bosque_animado': '0310790',
'2002/fifty_first_state': '0227984',
'2002/les_destinees': '0216689',
'2002/sons_room': '0208990',
'2003/open_hearts': '0315543',
'2003/tulse_luper_suitcases': '0307596',
'2003/valentin': '0296915',
'2004/if_only_ver1': '0332136',
'2004/wondrous_oblivion': '0334725',
'2005/wu_ji': '0417976',
'2006/golden_door': '0465188',
'2006/kin': '1091189',
'2007/revenge_of_the_nerds': '0088000',
'2008/bad_batch': '1605644',
'2008/mercedes': '1368083',
'2008/spirit': '0831887',
'2009/dead_air': '0993841',
'2009/edge_of_love': '0819714',
    '2009/fuel': '1072437',
'2009/one_good_man': '1239357',
'2009/st_trinians': '1210106',
'2009/surveillance': '0409345',
'2009/taken': '0936501',
'2009/vaml': '1610453',
'2010/adopting_haiti': '1764164',
'2010/afterlife': '0838247',
'2010/agora': '1186830',
'2010/athlete': '1356996',
'2010/beneath_the_blue': '1222698',
'2010/bitch_slap': '1212974',
'2010/black_waters_of_echos_pond': '0960066',
'2010/case_thirty_nine': '0795351',
'2010/finite_and_infinite_games': '1772268',
'2010/hole': '1085779',
'2010/jolene': '0867334',
'2010/lake_mungo': '0816556',
'2010/last_day_of_summer': '1242544',
'2010/leaves_of_grass': '1151359',
'2010/life_of_lemon': '1466057',
'2010/man_in_the_maze': '1721692',
'2010/mr_immortality_the_life_and_times_of_twista': '1711017',
'2010/paper_man': '0437405',
'2010/perfect_game': '0473102',
'2010/red_baron': '0365675',
'2010/satin': '0433397',
'2010/shutter_island': '1130884',
'2010/strange_powers': '1534075',
'2010/suicidegirls_must_die': '1584733',
'2010/veronika_decides_to_die': '1068678',
'2010/witchblade': '0494292',
'2010/youth_in_revolt': '0403702',
'2011/beastly': '1152398',
'2011/burning_palms': '1283887',
'2011/cabin_in_the_woods': '1259521',
'2011/conan': '0816462',
'2011/courageous': '1630036',
'2011/cruces_divided_two': '1698645',
'2011/green_with_envy': '1204342',
'2011/happythankyoumoreplease': '1481572',
'2011/homework': '1645080',
'2011/i_got_next': '1915570',
'2011/lebanon_pa': '1290082',
'2011/money_pet': '1965198',
'2011/my_suicide': '0492896',
'2011/priest': '0822847',
'2011/prowl': '1559033',
'2011/red_sonja': '0800175',
'2011/season_of_the_witch': '0479997',
'2011/stay_cool': '1235807',
'2011/sympathy_for_delicious': '1270277',
'2011/trust': '1529572',
'2011/undefeated': '1961604',
'2011/vanishing_on_seventh_street': '1452628',
'2011/where_is_robert_fisher': '2042712',
'2011/yellowbrickroad': '1398428',
'2012/haywire': '1506999',
'2012/last_call_at_the_oasis': '2043900',
}
if __name__ == '__main__':
ids = get_ids()
    print(sorted(ids), len(ids))

View file

@@ -0,0 +1,187 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re
from six.moves import urllib
from ox.cache import read_url
from ox.html import decode_html, strip_tags
from ox.text import find_re
from ox.text import find_string
# to sniff itunes traffic, use something like
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=music&songTerm=&genreIndex=1&flavor=0&mediaType=2&composerTerm=&allArtistNames=Arcadia&ringtone=0&searchButton=submit
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=movie&movieTerm=The%20Matrix&descriptionTerm=&ratingIndex=1&mediaType=3&directorProducerName=Andy%20Wachowski&flavor=0&releaseYearTerm=1999&closedCaption=0&actorTerm=&searchButton=submit
ITUNES_HEADERS = {
'X-Apple-Tz': '0',
'X-Apple-Storefront': '143441-1',
'User-Agent': 'iTunes/7.6.2 (Macintosh; U; Intel Mac OS X 10.5.2)',
'Accept-Language': 'en-us, en;q=0.50',
'Accept-Encoding': 'gzip',
'Connection': 'close',
}
def compose_url(request, parameters):
if request == 'advancedSearch':
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
if parameters['media'] == 'music':
        url += urlencode({
'albumTerm': parameters['title'],
'allArtistNames': parameters['artist'],
'composerTerm': '',
'flavor': 0,
'genreIndex': 1,
'media': 'music',
'mediaType': 2,
'ringtone': 0,
'searchButton': 'submit',
'songTerm': ''
})
elif parameters['media'] == 'movie':
        url += urlencode({
'actorTerm': '',
'closedCaption': 0,
'descriptionTerm': '',
'directorProducerName': parameters['director'],
'flavor': 0,
'media': 'movie',
'mediaType': 3,
'movieTerm': parameters['title'],
'ratingIndex': 1,
'releaseYearTerm': '',
'searchButton': 'submit'
})
elif request == 'viewAlbum':
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
elif request == 'viewMovie':
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
return url
def parse_xml_dict(xml):
values = {}
strings = xml.split('<key>')
for string in strings:
if string.find('</key>') != -1:
key = find_re(string, '(.*?)</key>')
type = find_re(string, '</key><(.*?)>')
if type == 'true/':
value = True
else:
value = find_re(string, '<%s>(.*?)</%s>' % (type, type))
if type == 'integer':
value = int(value)
elif type == 'string':
value = decode_html(value)
values[key] = value
return values
def parse_cast(xml, title):
    cast = []
    try:
        strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
        strings.pop()
        for string in strings:
            cast.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
        return cast
    except:
        return cast
def parse_movies(xml, title):
    movies = []
    try:
        strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
        strings.pop()
        for string in strings:
            movies.append({
                'id': find_re(string, 'viewMovie\?id=(.*?)&'),
                'title': find_re(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
            })
        return movies
    except:
        return movies
class ItunesAlbum:
def __init__(self, id = '', title = '', artist = ''):
self.id = id
self.title = title
self.artist = artist
if not id:
self.id = self.get_id()
def get_id(self):
url = compose_url('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
xml = read_url(url, headers = ITUNES_HEADERS)
id = find_re(xml, 'viewAlbum\?id=(.*?)&')
return id
def get_data(self):
data = {'id': self.id}
url = compose_url('viewAlbum', {'id': self.id})
xml = read_url(url, None, ITUNES_HEADERS)
data['albumName'] = find_re(xml, '<B>(.*?)</B>')
data['artistName'] = find_re(xml, '<b>(.*?)</b>')
data['coverUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
data['genre'] = find_re(xml, 'Genre:(.*?)<')
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
data['review'] = strip_tags(find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['tracks'] = []
strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
for string in strings:
data['tracks'].append(parse_xml_dict(string))
data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
return data
class ItunesMovie:
def __init__(self, id = '', title = '', director = ''):
self.id = id
self.title = title
self.director = director
if not id:
self.id = self.get_id()
def get_id(self):
url = compose_url('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
xml = read_url(url, headers = ITUNES_HEADERS)
id = find_re(xml, 'viewMovie\?id=(.*?)&')
return id
def get_data(self):
data = {'id': self.id}
url = compose_url('viewMovie', {'id': self.id})
xml = read_url(url, None, ITUNES_HEADERS)
data['actors'] = parse_cast(xml, 'actors')
string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
data['averageRating'] = string.count('rating_star_000033.png') + string.count('&#189;') * 0.5
data['directors'] = parse_cast(xml, 'directors')
data['format'] = find_re(xml, 'Format:(.*?)<')
data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
data['producers'] = parse_cast(xml, 'producers')
data['rated'] = find_re(xml, 'Rated(.*?)<')
data['relatedMovies'] = parse_movies(xml, 'related movies')
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
data['screenwriters'] = parse_cast(xml, 'screenwriters')
data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
return data
if __name__ == '__main__':
from ox.utils import json
    data = ItunesAlbum(title='So Red the Rose', artist='Arcadia').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))
    data = ItunesMovie(title='The Matrix', director='Wachowski').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))
    for v in data['relatedMovies']:
        data = ItunesMovie(id=v['id']).get_data()
        print(json.dumps(data, sort_keys=True, indent=4))
    data = ItunesMovie(id='272960052').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))

View file

@ -0,0 +1,42 @@
from ox.cache import read_url
from ox import find_re, strip_tags
import re
base = 'http://www.lookupbyisbn.com'
def get_data(isbn):
r = {}
url = '%s/Search/Book/%s/1' % (base, isbn)
data = read_url(url).decode('utf-8')
m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
if m:
ids = m[0].split('/')
r['isbn'] = ids[-2]
r['asin'] = ids[-3]
url = '%s%s' % (base, m[0])
data = read_url(url).decode('utf-8')
r["title"] = find_re(data, "<h2>(.*?)</h2>")
keys = {
'author': 'Author(s)',
'publisher': 'Publisher',
'date': 'Publication date',
'edition': 'Edition',
'binding': 'Binding',
'volume': 'Volume(s)',
'pages': 'Pages',
}
for key in keys:
r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>'% re.escape(keys[key]))
if r[key] == '--':
r[key] = ''
if key == 'pages' and r[key]:
r[key] = int(r[key])
desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
r['description'] = strip_tags(desc).strip()
if r['description'] == u'Description of this item is not available at this time.':
r['description'] = ''
r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
return r
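# A minimal usage sketch; the ISBN below is an arbitrary example, and the
# fields returned depend on what lookupbyisbn.com knows about that edition.
if __name__ == '__main__':
    from ox.utils import json
    print(json.dumps(get_data('0262681374'), sort_keys=True, indent=4))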

View file

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox.cache import read_url
from ox.html import decode_html
from ox.text import find_re
def get_lyrics(title, artist):
html = read_url('http://lyricsfly.com/api/')
key = find_re(html, '<font color=green><b>(.*?)</b></font>')
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
xml = read_url(url)
lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip()
    lyrics = lyrics.replace('\n\n\n', '\n\n')
lyrics = decode_html(lyrics.replace('&amp;', '&'))
return lyrics
if __name__ == '__main__':
    print(get_lyrics('Election Day', 'Arcadia'))

View file

@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib.parse import quote
from lxml.html import document_fromstring
from ox.cache import read_url
from ox import find_re, strip_tags
def get_url(id=None, imdb=None):
if imdb:
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
data = read_url(url)
metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
return metacritic_url or None
return 'http://www.metacritic.com/movie/%s' % id
def get_id(url):
return url.split('/')[-1]
def get_show_url(title):
title = quote(title)
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
data = read_url(url)
return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
def get_data(url):
data = read_url(url, unicode=True)
doc = document_fromstring(data)
    score = [s for s in doc.xpath('//span[@class="score_value"]')
             if s.attrib.get('property') == 'v:average']
    if score:
        score = int(score[0].text)
    else:
        score = -1
authors = [a.text
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')]
sources = [d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')]
reviews = [d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')]
scores = [int(d.text.strip())
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')]
urls = [a.attrib['href']
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')]
metacritics = []
for i in range(len(authors)):
metacritics.append({
'critic': authors[i],
'url': urls[i],
'source': sources[i],
'quote': strip_tags(reviews[i]).strip(),
'score': scores[i],
})
return {
'critics': metacritics,
'id': get_id(url),
'score': score,
'url': url,
}
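if __name__ == '__main__':
    # usage sketch: resolve a Metacritic page from an IMDb id (an arbitrary
    # example) and scrape its critic reviews
    url = get_url(imdb='0133093')
    if url:
        print(get_data(url))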

View file

@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from urllib.parse import quote
from ox.cache import read_url
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, int_value, normalize_newlines
from ox.normalize import normalize_imdbid
import ox
from .torrent import Torrent
def _parse_results_page(data, max_results=10):
results=[]
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentDate = row[0]
torrentExtra = row[1]
torrentId = row[2]
torrentTitle = decode_html(row[3]).strip()
torrentLink = "http://www.mininova.org/tor/" + torrentId
privateTracker = 'priv.gif' in torrentExtra
if not privateTracker:
results.append((torrentTitle, torrentLink, ''))
return results
def find_movie(query=None, imdb=None, max_results=10):
'''search for torrents on mininova
'''
if imdb:
url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
else:
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
data = read_url(url, unicode=True)
return _parse_results_page(data, max_results)
def get_id(mininovaId):
    mininovaId = str(mininovaId)
d = find_re(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
if len(mininovaId) == 1:
return mininovaId[0]
else:
return mininovaId[-1]
def exists(mininovaId):
mininovaId = get_id(mininovaId)
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
if not data or 'Torrent not found...' in data:
return False
if 'tracker</a> of this torrent requires registration.' in data:
return False
return True
def get_data(mininovaId):
_key_map = {
'by': u'uploader',
}
mininovaId = get_id(mininovaId)
torrent = dict()
torrent[u'id'] = mininovaId
torrent[u'domain'] = 'mininova.org'
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
if '<h1>Torrent not found...</h1>' in data:
return None
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = get_torrent_info(t)
return torrent
class Mininova(Torrent):
'''
>>> Mininova('123')
{}
>>> Mininova('1072195')['infohash']
'72dfa59d2338e4a48c78cec9de25964cddb64104'
'''
def __init__(self, mininovaId):
self.data = get_data(mininovaId)
if not self.data:
return
Torrent.__init__(self)
ratio = self.data['share ratio'].split(',')
self['seeder'] = -1
self['leecher'] = -1
if len(ratio) == 2:
val = int_value(ratio[0].replace(',','').strip())
if val:
self['seeder'] = int(val)
val = int_value(ratio[1].replace(',','').strip())
if val:
self['leecher'] = int(val)
val = int_value(self.data['downloads'].replace(',','').strip())
if val:
self['downloaded'] = int(val)
else:
self['downloaded'] = -1
published = self.data['added on']
published = published.split(' +')[0]
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")

View file

@ -0,0 +1,44 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox.cache import read_url
from ox import find_re
def get_data(id):
'''
>>> get_data('0060304')['posters'][0]
u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
>>> get_data('0123456')['posters']
[]
'''
data = {
"url": get_url(id)
}
data["posters"] = get_posters(data["url"])
return data
def get_id(url):
return url.split("/")[-2]
def get_posters(url, group=True, timeout=-1):
posters = []
html = read_url(url, timeout=timeout, unicode=True)
if url in html:
if group:
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
for result in results:
posters += get_posters(result, False)
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
for result in results:
html = read_url(result, timeout=timeout, unicode=True)
posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
return posters
def get_url(id):
return "http://www.movieposterdb.com/movie/%s/" % id
if __name__ == '__main__':
    print(get_data('0060304'))
    print(get_data('0133093'))

View file

@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import feedparser
from ox.cache import read_url
from ox import find_re, strip_tags
from ox.iso import langCode2To3, langTo3Code
def find_subtitles(imdb, parts = 1, language = "eng"):
if len(language) == 2:
language = langCode2To3(language)
elif len(language) != 3:
language = langTo3Code(language)
url = "http://www.opensubtitles.org/en/search/"
if language:
url += "sublanguageid-%s/" % language
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
data = read_url(url)
if "title>opensubtitles.com - search results</title" in data:
fd = feedparser.parse(data)
opensubtitleId = None
if fd.entries:
link = fd.entries[0]['links'][0]['href']
opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
if opensubtitleId:
opensubtitleId = opensubtitleId[0]
else:
opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
return opensubtitleId
def download_subtitle(opensubtitle_id):
srts = {}
data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
for f in re.compile(reg_exp, re.DOTALL).findall(data):
name = strip_tags(f[1]).split('\n')[0]
url = "http://www.opensubtitles.com%s" % f[0]
srts[name] = read_url(url, unicode=True)
return srts
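if __name__ == '__main__':
    # usage sketch: find English subtitles for an IMDb id (an arbitrary
    # example), then fetch the srt files, keyed by release name
    opensubtitle_id = find_subtitles('0133093')
    if opensubtitle_id:
        srts = download_subtitle(opensubtitle_id)
        print(list(srts.keys()))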

View file

@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import ox.cache
def get_poster_url(id):
url = "http://0xdb.org/%s/poster.0xdb.jpg" % id
if ox.cache.exists(url):
return url
return ''

View file

@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
from ox.net import read_url
def get_poster_url(id):
url = 'http://piratecinema.org/posters/'
html = read_url(url, unicode=True)
results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
for result in results:
if result[1] == id:
return url + result[0]
return ''
if __name__ == '__main__':
print(get_poster_url('0749451'))

View file

@ -0,0 +1,54 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox.cache import read_url
from ox import find_re, strip_tags
def get_url(id=None, imdb=None):
    # this would also work but does not cache:
    '''
    from urllib.request import urlopen
    u = urlopen(url)
    return u.url
    '''
if imdb:
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
data = read_url(url)
if "movie_title" in data:
movies = re.compile('(/m/.*?/)').findall(data)
if movies:
return "http://www.rottentomatoes.com" + movies[0]
return None
def get_og(data, key):
return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
def get_data(url):
data = read_url(url)
r = {}
r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
if '(' in r['title']:
r['year'] = find_re(r['title'], '\((\d*?)\)')
r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')
if not r['summary']:
r['summary'] = get_og(data, 'description')
meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
    meter = [m for m in meter if m[1].isdigit()]
if meter:
r['tomatometer'] = meter[0][1]
r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
poster = get_og(data, 'image')
if poster and not 'poster_default.gif' in poster:
r['posters'] = [poster]
    for key in list(r.keys()):
        if not r[key]:
            del r[key]
return r
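if __name__ == '__main__':
    # usage sketch: resolve the Rotten Tomatoes page for an IMDb id (an
    # arbitrary example); empty fields are stripped from the result above
    url = get_url(imdb='0133093')
    if url:
        print(get_data(url))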

View file

@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six import string_types
from ..cache import read_url
from .. import decode_html
from ..utils import datetime
def cleanup(key, data, data_type):
if data:
if isinstance(data[0], string_types):
#FIXME: some types need strip_tags
#data = [strip_tags(decode_html(p)).strip() for p in data]
data = [decode_html(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1 and not isinstance(data, string_types):
data = data[0]
if data_type == 'list' and isinstance(data, string_types):
data = [data, ]
elif data_type != 'list':
data = ''
return data
class SiteParser(dict):
baseUrl = ''
regex = {}
def get_url(self, page):
return "%s%s" % (self.baseUrl, page)
def read_url(self, url, timeout):
if not url in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url]
def __init__(self, timeout=-1):
self._cache = {}
for key in self.regex:
url = self.get_url(self.regex[key]['page'])
data = self.read_url(url, timeout)
if isinstance(self.regex[key]['re'], string_types):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type'])
elif callable(self.regex[key]['re']):
data = self.regex[key]['re'](data)
else:
for r in self.regex[key]['re']:
if callable(r):
f = r
else:
f = re.compile(r, re.DOTALL).findall
if isinstance(data, string_types):
data = f(data)
else:
data = [f(d) for d in data]
data = cleanup(key, data, self.regex[key]['type'])
def apply_f(f, data):
if data and isinstance(data[0], list):
data = [f(d) for d in data]
else:
data = f(data)
return data
if self.regex[key]['type'] == 'float' and data:
data = apply_f(float, data)
elif self.regex[key]['type'] == 'int' and data:
data = apply_f(int, data)
elif self.regex[key]['type'] == 'date':
parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
data = apply_f(parse_date, data)
if data:
self[key] = data
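# A hypothetical subclass, to sketch how the regex spec drives SiteParser:
# for each key, the page at baseUrl + 'page' is fetched (and cached), 're'
# is applied (a single pattern, a callable, or a chain of both), and the
# result is coerced to 'type'. The site and patterns below are invented.
class ExampleParser(SiteParser):
    baseUrl = 'http://example.com/title/tt0133093/'
    regex = {
        'title': {
            'page': 'combined',
            're': '<h1>(.*?)</h1>',
            'type': 'string'
        },
        'genres': {
            'page': 'combined',
            're': '<a href="/genre/.*?">(.*?)</a>',
            'type': 'list'
        },
    }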

View file

@ -0,0 +1,287 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import time
import ox.cache
from ox.html import decode_html, strip_tags
import ox.net
def get_news(year, month, day):
sections = [
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
]
dt = datetime(year, month, day)
day = int(dt.strftime('%j'))
date = dt.strftime('%d.%m.%Y')
news = []
for section in sections:
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
if date == time.strftime('%d.%m.%Y', time.localtime()):
html = ox.net.read_url(url)
else:
html = ox.cache.read_url(url)
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
try:
description = format_string(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
except:
description = ''
try:
imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
except:
imageUrl = ''
try:
title = format_string(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
except:
title = ''
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
new = {}
if len(dateString) == 10:
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
else:
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
# fix decode_html
# new['description'] = format_string(decode_html(description))
new['description'] = format_string(description)
new['imageUrl'] = imageUrl
new['section'] = format_section(section)
new['title'] = format_string(title)
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(format_string(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
if new['title1'][-1:] == ':':
new['title1'] = new['title1'][0:-1]
new['title2'] = new['title'][len(new['title1']) + 2:]
new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
if new['url'][:1] == '/':
new['url'] = 'http://www.spiegel.de' + new['url']
news.append(new)
# print '%s, %s' % (new['section'], dateString)
'''
elif dateString[:10] == date and not description:
print dateString + ' - no description'
elif dateString[:10] == date and not imageUrl:
print dateString + ' - no image'
'''
return news
def split_title(title):
title1 = re.compile('(.*?): ').findall(title)[0]
title2 = re.compile(': (.*?)$').findall(title)[0]
return [title1, title2]
def format_string(string):
string = string.replace('<span class="spOptiBreak"> </span>', '')
string = string.replace('\n', ' ').replace(' ', ' ').strip()
string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
return string
def format_section(string):
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
def format_subsection(string):
# SPIEGEL, SPIEGEL special
subsection = {
'abi': 'Abi - und dann?',
'formel1': 'Formel 1',
'jobundberuf': 'Job & Beruf',
'leben': 'Leben U21',
'mensch': 'Mensch & Technik',
'sonst': '',
        'staedte': u'Städte',
'ussports': 'US-Sports',
'wunderbar': 'wunderBAR'
}
    if string in subsection:
        return subsection[string].replace(u'ä', 'ae')
return string[:1].upper() + string[1:]
def get_issue(year, week):
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
if not ox.net.exists(coverUrl):
return None
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
contents = []
data = ox.cache.read_url(url)
items = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
for item in items:
item = item[1]
page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
title = strip_tags(item).strip()
contents.append({'title': title, 'page': page})
pageUrl = {}
pages = page + 2
for page in range(1, pages + 10):
url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
if ox.cache.exists(url):
pageUrl[page] = url
else:
pageUrl[page] = ''
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
def archive_issues():
'''
this is just an example of an archiving application
'''
p = {}
import os
from ox.utils import json
import time
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
localtime = time.localtime()
year = int(time.strftime('%Y', localtime))
week = int(time.strftime('%W', localtime))
for y in range(year, 1993, -1):
if y == year:
wMax = week + 1
else:
wMax = 53
for w in range(wMax, 0, -1):
            print('get_issue(%d, %d)' % (y, w))
issue = get_issue(y, w)
if issue:
dirname = '%s/%d/%02d' % (archivePath, y, w)
if not os.path.exists(dirname):
os.makedirs(dirname)
filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
if not os.path.exists(filename):
data = json.dumps(issue, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
if not os.path.exists(filename):
data = []
for item in issue['contents']:
data.append('%3d %s' % (item['page'], item['title']))
data = '\n'.join(data)
f = open(filename, 'w')
f.write(data)
f.close()
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
if not os.path.exists(filename):
data = ox.cache.read_url(issue['coverUrl'])
                    f = open(filename, 'wb')
f.write(data)
f.close()
for page in issue['pageUrl']:
url = issue['pageUrl'][page]
if url:
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
if not os.path.exists(filename):
data = ox.cache.read_url(url)
                            f = open(filename, 'wb')
f.write(data)
f.close()
if not p:
p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
else:
p['num'] += 1
p['sum'] += issue['pages']
if issue['pages'] < p['min']:
p['min'] = issue['pages']
if issue['pages'] > p['max']:
p['max'] = issue['pages']
    print(p['min'], p['sum'] // p['num'], p['max'])
def archive_news():
'''
this is just an example of an archiving application
'''
import os
from ox.utils import json
import time
count = {}
colon = []
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
localtime = time.localtime()
year = int(time.strftime('%Y', localtime))
month = int(time.strftime('%m', localtime))
day = int(time.strftime('%d', localtime)) - 1
for y in range(year, 1999, -1):
if y == year:
mMax = month
else:
mMax = 12
for m in range(mMax, 0, -1):
if y == year and m == month:
dMax = day
            elif m == 2 and y % 4 == 0 and (y % 100 != 0 or y % 400 == 0):
dMax = days[m] + 1
else:
dMax = days[m]
for d in range(dMax, 0, -1):
                print('get_news(%d, %d, %d)' % (y, m, d))
                news = get_news(y, m, d)
for new in news:
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
if not os.path.exists(dirname):
os.makedirs(dirname)
if new['url'][-5:] == '.html':
filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
else:
filename = dirname + '/' + new['url'] + '.json'
if not os.path.exists(filename) or True:
data = json.dumps(new, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
filename = filename[:-5] + '.txt'
if not os.path.exists(filename) or True:
data = split_title(new['title'])
data.append(new['description'])
data = '\n'.join(data)
f = open(filename, 'w')
f.write(data)
f.close()
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
if not os.path.exists(filename):
data = ox.cache.read_url(new['imageUrl'])
                        f = open(filename, 'wb')
f.write(data)
f.close()
strings = new['url'].split('/')
string = strings[3]
if len(strings) == 6:
string += '/' + strings[4]
                    if string not in count:
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
else:
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
strings = split_title(new['title'])
if strings[0] != new['title1'] or strings[1] != new['title2']:
colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
    for key in sorted(count):
        print('%6d %-24s %s' % (count[key]['count'], key, count[key]['string']))
    for value in colon:
        print(value)
if __name__ == '__main__':
# spiegel = Spiegel(2008, 8)
# print spiegel.getContents()
# news = News(2001, 9, 10)
# output(news.getNews())
'''
    x = []
    for d in range(10, 30):
        print('2/%d' % d)
        news = get_news(2008, 2, d)
        for new in news:
            strings = new['url'].split('/')
            string = format_section(strings[3])
            if len(strings) == 6:
                string += '/' + format_subsection(strings[4])
            if not string in x:
                x.append(string)
    print(x)
'''
# archive_issues()
archive_news()

View file

@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from urllib.parse import quote, urlencode
from urllib.error import URLError
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normalize_newlines
from ox.normalize import normalize_imdbid
import ox
from .torrent import Torrent
cache_timeout = 24*60*60 # cache search only for 24 hours
season_episode = re.compile("S..E..", re.IGNORECASE)
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
headers['Cookie'] = 'language=en_EN'
return cache.read_url(url, data, headers, timeout, unicode=unicode)
def find_movies(query=None, imdb=None, max_results=10):
if imdb:
query = "tt" + normalize_imdbid(imdb)
results = []
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
page_count = 1
while next and page_count < 4:
page_count += 1
url = next[0]
if not url.startswith('http'):
if not url.startswith('/'):
url = "/" + url
url = "http://thepiratebay.org" + url
data = read_url(url, timeout=cache_timeout, unicode=True)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
torrentLink = "http://thepiratebay.org" + row[1]
torrentTitle = decode_html(row[2])
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
if torrentType in ['201']:
results.append((torrentTitle, torrentLink, ''))
if len(results) >= max_results:
return results
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
return results
def get_id(piratebayId):
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
piratebayId = piratebayId.split('org/')[1]
d = find_re(piratebayId, "tor/(\d+)")
if d:
piratebayId = d
d = find_re(piratebayId, "torrent/(\d+)")
if d:
piratebayId = d
return piratebayId
def exists(piratebayId):
piratebayId = get_id(piratebayId)
return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
def get_data(piratebayId):
_key_map = {
'spoken language(s)': u'language',
'texted language(s)': u'subtitle language',
'by': u'uploader',
'leechers': 'leecher',
'seeders': 'seeder',
}
piratebayId = get_id(piratebayId)
torrent = dict()
torrent[u'id'] = piratebayId
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
data = read_url(torrent['comment_link'], unicode=True)
torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
torrent[u'title'] = decode_html(torrent[u'title']).strip()
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
title = quote(torrent['title'].encode('utf-8'))
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = get_torrent_info(t)
return torrent
class Thepiratebay(Torrent):
'''
>>> Thepiratebay('123')
{}
>>> Thepiratebay('3951349')['infohash']
'4e84415d36ed7b54066160c05a0b0f061898d12b'
'''
def __init__(self, piratebayId):
self.data = get_data(piratebayId)
if not self.data:
return
Torrent.__init__(self)
published = self.data['uploaded']
published = published.replace(' GMT', '').split(' +')[0]
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
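if __name__ == '__main__':
    # usage sketch: search by IMDb id (an arbitrary example) and parse the
    # first few hits into torrent dicts
    for title, link, _ in find_movies(imdb='0133093', max_results=3):
        print(title)
        print(Thepiratebay(link).get('published'))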

View file

@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox import int_value
class Torrent(dict):
'''
>>> Torrent()
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
'''
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
_dict_keys = ('torrent_info', )
_list_keys = ()
data = {'torrent_info': {}}
def __init__(self):
for key in self._string_keys:
self[key] = self.data.get(key, u'')
for key in self._dict_keys:
self[key] = self.data.get(key, {})
for key in self._list_keys:
self[key] = self.data.get(key, [])
for key in self._int_keys:
value = self.data.get(key, -1)
if not isinstance(value, int):
value = int(int_value(value))
self[key] = value
self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1)
self['announce'] = self.data['torrent_info'].get('announce', '')
if 'files' in self.data['torrent_info']:
self['files'] = len(self.data['torrent_info']['files'])
else:
self['files'] = 1

View file

@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from ox import strip_tags, find_re
from ox.cache import read_url
def get_episode_data(url):
'''
    parses information on tv.com episode pages
returns dict with title, show, description, score
example:
get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
'''
data = read_url(url, unicode=True)
r = {}
r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = find_re(data, '<h1>(.*?)</h1>')
r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
#episode score
r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
match = re.compile('Episode Number: (\d*?) &nbsp;&nbsp; Season Num: (\d*?) &nbsp;&nbsp; First Aired: (.*?) &nbsp').findall(data)
if match:
r['season'] = int(match[0][1])
r['episode'] = int(match[0][0])
#'Wednesday September 29, 2004' -> 2004-09-29
r['air date'] = time.strftime('%Y-%m-%d', time.strptime(match[0][2], '%A %B %d, %Y'))
return r
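if __name__ == '__main__':
    # usage sketch, reusing the episode page cited in the docstring above
    print(get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html'))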

View file

@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from datetime import datetime
from urllib.parse import quote
import lxml.html
import ox
from ox.cache import read_url
def find(query=None, user=None, timeout=60):
if user:
url = 'https://twitter.com/' + quote(user)
else:
url = 'https://twitter.com/search/' + quote(query)
data = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
doc = lxml.html.document_fromstring(data)
tweets = []
for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
t = lxml.html.tostring(e)
text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
html = lxml.html.tostring(text, encoding='unicode').strip()
text = ox.decode_html(ox.strip_tags(html)).strip()
user = re.compile('data-name="(.*?)"').findall(t)[0]
user = ox.decode_html(ox.strip_tags(user)).strip()
tweets.append({
'id': re.compile('data-tweet-id="(\d+)"').findall(t)[0],
'user-id': re.compile('data-user-id="(\d+)"').findall(t)[0],
'name': re.compile('data-screen-name="(.*?)"').findall(t)[0],
'time': datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])),
'user': user,
'text': text,
'html': html,
})
return tweets
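if __name__ == '__main__':
    # usage sketch: both code paths scrape twitter.com's HTML, so the fields
    # only survive as long as the markup does; the query is an example
    for tweet in find(query='python'):
        print(tweet['name'], tweet['text'])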

View file

@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox import find_re, strip_tags, decode_html
from ox.cache import read_url
def get_id(url):
return url.replace('http://www.ubu.com/', '').split('.html')[0]
def get_url(id):
return 'http://www.ubu.com/%s.html' % id
def get_data(url):
if not url.startswith('http:'):
url = get_url(url)
data = read_url(url, unicode=True)
m = {
'id': get_id(url),
'url': url,
'type': re.compile('ubu.com/(.*?)/').findall(url)[0]
}
for videourl, title in re.compile('<a href="(http://ubumexico.centro.org.mx/.*?)">(.*?)</a>').findall(data):
if videourl.endswith('.srt'):
m['srt'] = videourl
elif not 'video' in m:
m['video'] = videourl
m['video'] = m['video'].replace('/video/ ', '/video/').replace(' ', '%20')
if m['video'] == 'http://ubumexico.centro.org.mx/video/':
del m['video']
m['title'] = strip_tags(decode_html(title)).strip()
if not 'url' in m:
        print(url, 'missing')
if 'title' in m:
m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])
match = re.compile("flashvars','file=(.*?.flv)'").findall(data)
if match:
m['flv'] = match[0]
m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20')
y = re.compile('\((\d{4})\)').findall(data)
if y:
m['year'] = int(y[0])
d = re.compile('Director: (.+)').findall(data)
if d:
m['director'] = strip_tags(decode_html(d[0])).strip()
a = re.compile('<a href="(.*?)">Back to (.*?)</a>', re.DOTALL).findall(data)
if a:
m['artist'] = strip_tags(decode_html(a[0][1])).strip()
else:
a = re.compile('<a href="(.*?)">(.*?) in UbuWeb Film').findall(data)
if a:
m['artist'] = strip_tags(decode_html(a[0][1])).strip()
else:
a = re.compile('<b>(.*?)\(b\..*?\d{4}\)').findall(data)
if a:
m['artist'] = strip_tags(decode_html(a[0])).strip()
elif m['id'] == 'film/lawder_color':
m['artist'] = 'Standish Lawder'
if 'artist' in m:
m['artist'] = m['artist'].replace('in UbuWeb Film', '')
m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()
if m['id'] == 'film/coulibeuf':
m['title'] = 'Balkan Baroque'
m['year'] = 1999
return m
def get_films():
ids = get_ids()
films = []
for id in ids:
info = get_data(id)
if info['type'] == 'film' and ('flv' in info or 'video' in info):
films.append(info)
return films
def get_ids():
data = read_url('http://www.ubu.com/film/')
ids = []
author_urls = []
for url, author in re.compile('<a href="(\./.*?)">(.*?)</a>').findall(data):
url = 'http://www.ubu.com/film' + url[1:]
data = read_url(url)
author_urls.append(url)
for u, title in re.compile('<a href="(.*?)">(.*?)</a>').findall(data):
if not u.startswith('http'):
if u == '../../sound/burroughs.html':
u = 'http://www.ubu.com/sound/burroughs.html'
elif u.startswith('../'):
u = 'http://www.ubu.com/' + u[3:]
else:
u = 'http://www.ubu.com/film/' + u
if u not in author_urls and u.endswith('.html'):
ids.append(u)
ids = [get_id(url) for url in list(set(ids))]
return ids
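if __name__ == '__main__':
    # usage sketch: enumerate the film pages, then parse a single record
    ids = get_ids()
    print(len(ids), 'films')
    if ids:
        print(get_data(ids[0]))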

View file

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import xml.etree.ElementTree as ET
from ox.cache import read_url
from ox import find_string, find_re
def get_data(id):
url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
xml = read_url(url)
    tree = ET.ElementTree(ET.fromstring(xml))
request_signature = tree.find('request_signature').text
request_signature_expires = tree.find('request_signature_expires').text
data = {}
video_url = "http://www.vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=" % \
(id, request_signature, request_signature_expires)
data['video_sd'] = video_url + 'sd'
data['video_hd'] = video_url + 'hd'
video = tree.find('video')
for key in ('caption', 'width', 'height', 'duration', 'thumbnail'):
data[key] = video.find(key).text
return data
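if __name__ == '__main__':
    # usage sketch with an arbitrary clip id; video_sd/video_hd are the
    # signed moogaloop play URLs assembled above
    data = get_data('2910893')
    print(data['caption'], data['video_sd'])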

View file

@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
from six.moves import urllib
from ox.utils import json
from ox.cache import read_url
from ox import find_re
def get_id(url):
return url.split("/")[-1]
def get_url(id=None, imdb=None, allmovie=None):
if imdb:
query = '"%s"'% imdb
result = find(query)
if result:
url = result[0][1]
data = get_movie_data(url)
if 'imdb_id' in data:
return url
return ""
if allmovie:
query = '"amg_id = 1:%s"'% allmovie
result = find(query)
if result:
url = result[0][1]
return url
return ''
return "http://en.wikipedia.org/wiki/%s" % id
def get_movie_id(title, director='', year=''):
query = '"%s" film %s %s' % (title, director, year)
result = find(query, 1)
if result:
return result[0][1]
return ''
def get_wiki_data(wikipedia_url):
url = wikipedia_url.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
url = "%s&action=raw" % url
data = read_url(url).decode('utf-8')
return data
def get_movie_data(wikipedia_url):
if not wikipedia_url.startswith('http'):
wikipedia_url = get_url(wikipedia_url)
data = get_wiki_data(wikipedia_url)
filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
filmbox = {}
_box = filmbox_data.strip().split('|')
for row in _box:
d = row.split('=')
if len(d) == 2:
_key = d[0].strip()
if _key:
key = _key
if key[0] == '|':
key = key[1:]
key = key.strip()
value = d[1].strip()
value = value.replace('<!-- see WP:ALT -->', '')
if '<br>' in value:
value = value.split('<br>')
if value:
if key in filmbox:
                    if isinstance(value, list) and isinstance(filmbox[key], str):
filmbox[key] = [filmbox[key]] + value
else:
filmbox[key] += value
if isinstance(filmbox[key], list):
filmbox[key] = [k for k in filmbox[key] if k]
else:
filmbox[key] = value
if not filmbox_data:
return filmbox
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
del filmbox['amg_id']
if 'Allmovie movie' in data:
filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
elif 'Allmovie title' in data:
filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
if 'Official website' in data:
filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
if r:
filmbox['imdb_id'] = r[0]
else:
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
if r:
filmbox['imdb_id'] = r[0]
r = re.compile('{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['archiveorg_id'] = r[0]
r = re.compile('{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['mojo_id'] = r[0].replace('id=', '')
r = re.compile('{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
if 'google video' in data:
filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
if 'DEFAULTSORT' in data:
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
return filmbox
def get_image_url(name):
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
data = read_url(url)
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
if not url:
url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
if url:
url = 'http:' + url
return url
def get_poster_url(wikipedia_url):
if not wikipedia_url.startswith('http'): wikipedia_url = get_url(wikipedia_url)
data = get_movie_data(wikipedia_url)
if 'image' in data:
return get_image_url(data['image'])
return ''
def get_movie_poster(wikipedia_url):
# deprecated, use get_poster_url()
return get_poster_url(wikipedia_url)
def get_allmovie_id(wikipedia_url):
data = get_movie_data(wikipedia_url)
return data.get('amg_id', '')
def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
data = read_url(url)
if not data:
data = read_url(url, timeout=0)
result = json.loads(data.decode('utf-8'))
results = []
if result and 'query' in result:
for r in result['query']['search']:
title = r['title']
url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
results.append((title, url, ''))
return results
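if __name__ == '__main__':
    # usage sketch: find() queries the MediaWiki search API, then the
    # filmbox parser pulls external ids out of the raw wikitext
    for title, url, _ in find('The Matrix 1999 film', 3):
        print(title, url)
        print(get_movie_data(url).get('imdb_id', ''))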

View file

@ -0,0 +1,217 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from urllib.parse import quote, unquote_plus
from urllib.request import build_opener, HTTPCookieProcessor
from http.cookiejar import CookieJar
import re
from xml.dom.minidom import parseString
import json
import feedparser
import ox
from ox.cache import read_url, cache_timeout
def get_id(url):
match = re.compile('v=(.+?)($|&)').findall(url)
if match:
return match[0][0]
def get_url(id):
return 'http://www.youtube.com/watch?v=%s' % id
def video_url(youtubeId, format='mp4', timeout=cache_timeout):
"""
youtubeId - if of video
format - video format, options: webm, 1080p, 720p, mp4, high
"""
fmt = None
if format == '4k':
fmt=38
elif format == '1080p':
fmt=37
elif format == '720p':
fmt=22
elif format == 'mp4':
fmt=18
elif format == 'high':
fmt=35
elif format == 'webm':
streams = videos(youtubeId, 'webm')
return streams[max(streams.keys())]['url']
streams = videos(youtubeId)
if str(fmt) in streams:
return streams[str(fmt)]['url']
def get_video_info(id):
eurl = get_url(id)
data = read_url(eurl)
t = re.compile('\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
if t:
t = t[0]
else:
raise IOError
url = "http://www.youtube.com/get_video_info?&video_id=%s&el=$el&ps=default&eurl=%s&hl=en_US&t=%s" % (id, quote(eurl), quote(t))
data = read_url(url)
info = {}
for part in data.split('&'):
key, value = part.split('=')
info[key] = unquote_plus(value).replace('+', ' ')
return info
def find(query, max_results=10, offset=1, orderBy='relevance'):
query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
data = read_url(url)
fd = feedparser.parse(data)
videos = []
for item in fd.entries:
id = item['id'].split('/')[-1]
title = item['title']
description = item['description']
videos.append((title, id, description))
if len(videos) >= max_results:
return videos
return videos
def info(id, timeout=cache_timeout):
info = {}
if id.startswith('http'):
id = get_id(id)
if not id:
return info
url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
data = read_url(url, timeout=timeout)
xml = parseString(data)
info['id'] = id
info['url'] = get_url(id)
info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
info['description'] = xml.getElementsByTagName('media:description')[0].firstChild.data
info['date'] = xml.getElementsByTagName('published')[0].firstChild.data.split('T')[0]
info['author'] = "http://www.youtube.com/user/%s"%xml.getElementsByTagName('name')[0].firstChild.data
info['categories'] = []
for cat in xml.getElementsByTagName('media:category'):
info['categories'].append(cat.firstChild.data)
k = xml.getElementsByTagName('media:keywords')[0].firstChild
if k:
info['keywords'] = k.data.split(', ')
data = read_url(info['url'], timeout=timeout)
match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
if match:
info['license'] = match[0].strip()
info['license'] = re.sub('<.+?>', '', info['license']).strip()
url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
data = read_url(url, timeout=timeout)
xml = parseString(data)
languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
if languages:
info['subtitles'] = {}
for language in languages:
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
data = read_url(url, timeout=timeout)
xml = parseString(data)
subs = []
for t in xml.getElementsByTagName('text'):
start = float(t.getAttribute('start'))
duration = t.getAttribute('dur')
if not duration:
duration = '2'
end = start + float(duration)
if t.firstChild:
text = t.firstChild.data
subs.append({
'in': start,
'out': end,
'value': ox.decode_html(text),
})
info['subtitles'][language] = subs
return info
def videos(id, format=''):
stream_type = {
'flv': 'video/x-flv',
'webm': 'video/webm',
'mp4': 'video/mp4'
}.get(format)
info = get_video_info(id)
stream_map = info['url_encoded_fmt_stream_map']
streams = {}
for x in stream_map.split(','):
stream = {}
#for s in x.split('\\u0026'):
for s in x.split('&'):
key, value = s.split('=')
value = unquote_plus(value)
stream[key] = value
if 'url' in stream and 'sig' in stream:
stream['url'] = '%s&signature=%s' % (stream['url'], stream['sig'])
if not stream_type or stream['type'].startswith(stream_type):
streams[stream['itag']] = stream
return streams
def playlist(url):
data = read_url(url)
items = []
for i in list(set(re.compile('<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
items.append({
'title': i[1],
'url': 'http://www.youtube.com' + i[0].split('&amp;')[0]
})
return items
def download_webm(id, filename):
stream_type = 'video/webm'
url = "http://www.youtube.com/watch?v=%s" % id
    cj = CookieJar()
    opener = build_opener(HTTPCookieProcessor(cj))
opener.addheaders = [
('User-Agent',
'Mozilla/5.0 (X11; Linux i686; rv:2.0) Gecko/20100101 Firefox/4.0'),
('Accept-Language', 'en-us, en;q=0.50')
]
u = opener.open(url)
    data = u.read().decode('utf-8')
u.close()
match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data)
streams = {}
for x in match[0].split(','):
stream = {}
for s in x.split('\\u0026'):
key, value = s.split('=')
value = unquote_plus(value)
stream[key] = value
if stream['type'].startswith(stream_type):
streams[stream['itag']] = stream
if streams:
s = max(streams.keys())
url = streams[s]['url']
if 'sig' in streams[s]:
            url += '&signature=' + streams[s]['sig']
else:
return None
#download video and save to file.
u = opener.open(url)
    f = open(filename, 'wb')
data = True
while data:
data = u.read(4096)
f.write(data)
f.close()
u.close()
return filename
def get_config(id):
if id.startswith('http'):
url = id
else:
url = get_url(id)
data = read_url(url)
match = re.compile('ytplayer.config = (.*?);<').findall(data)
if match:
        config = json.loads(match[0])
return config
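if __name__ == '__main__':
    # usage sketch with an arbitrary video id; the signed stream URLs from
    # videos()/video_url() expire, and the gdata endpoints used by info()
    # may no longer respond as this module expects
    vid = 'dQw4w9WgXcQ'
    print(info(vid)['title'])
    print(video_url(vid, 'mp4'))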