Switch to python3
This commit is contained in:
parent
531041e89a
commit
9ba4b6a91a
5286 changed files with 677347 additions and 576888 deletions
9
Shared/lib/python3.4/site-packages/ox/web/__init__.py
Normal file
9
Shared/lib/python3.4/site-packages/ox/web/__init__.py
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# encoding: utf-8
|
||||
__version__ = '1.0.0'
|
||||
|
||||
from . import imdb
|
||||
from . import wikipedia
|
||||
from . import google
|
||||
from . import piratecinema
|
||||
from . import oxdb
|
||||
20
Shared/lib/python3.4/site-packages/ox/web/abebooks.py
Normal file
20
Shared/lib/python3.4/site-packages/ox/web/abebooks.py
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
from ox.cache import read_url
|
||||
import re
|
||||
import lxml.html
|
||||
|
||||
def get_data(id):
|
||||
info = {}
|
||||
base = 'http://www.abebooks.com'
|
||||
url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
|
||||
data = read_url(url)
|
||||
urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
|
||||
if urls:
|
||||
details = '%s%s' % (base, urls[0])
|
||||
data = read_url(details)
|
||||
doc = lxml.html.document_fromstring(data)
|
||||
for e in doc.xpath("//*[contains(@id, 'biblio')]"):
|
||||
key = e.attrib['id'].replace('biblio-', '')
|
||||
value = e.text_content()
|
||||
if value and key not in ('bookcondition', 'binding'):
|
||||
info[key] = value
|
||||
return info
|
||||
85
Shared/lib/python3.4/site-packages/ox/web/allmovie.py
Normal file
85
Shared/lib/python3.4/site-packages/ox/web/allmovie.py
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from ox import strip_tags, find_re
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def get_data(id):
|
||||
'''
|
||||
>>> get_data('129689')['cast'][1][1]
|
||||
u'Marianne'
|
||||
>>> get_data('129689')['credits'][0][0]
|
||||
u'Jean-Luc Godard'
|
||||
>>> get_data('129689')['posters'][0]
|
||||
u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
|
||||
>>> get_data('129689')['rating']
|
||||
u'4.5'
|
||||
'''
|
||||
if id.startswith('http'):
|
||||
id = get_id(id)
|
||||
data = {
|
||||
"url": get_url(id)
|
||||
}
|
||||
html = read_url(data["url"], unicode=True)
|
||||
data['aka'] = parse_list(html, 'AKA')
|
||||
data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
|
||||
data['countries'] = parse_list(html, 'countries')
|
||||
data['director'] = parse_entry(html, 'directed by')
|
||||
data['genres'] = parse_list(html, 'genres')
|
||||
data['keywords'] = parse_list(html, 'keywords')
|
||||
data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
|
||||
data['produced'] = parse_list(html, 'produced by')
|
||||
data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
|
||||
data['released'] = parse_entry(html, 'released by')
|
||||
data['releasedate'] = parse_list(html, 'release date')
|
||||
data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
|
||||
data['set'] = parse_entry(html, 'set in')
|
||||
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||
data['themes'] = parse_list(html, 'themes')
|
||||
data['types'] = parse_list(html, 'types')
|
||||
data['year'] = find_re(html, '<span class="year">.*?(\d+)')
|
||||
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
|
||||
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
|
||||
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
|
||||
#data['cast'] = parse_table(html)
|
||||
#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
|
||||
#data['credits'] = parse_table(html)
|
||||
html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
|
||||
data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||
return data
|
||||
|
||||
def get_url(id):
|
||||
return "http://allmovie.com/work/%s" % id
|
||||
|
||||
def parse_entry(html, title):
|
||||
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
|
||||
return strip_tags(html).strip()
|
||||
|
||||
def parse_list(html, title):
|
||||
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
|
||||
r = map(strip_tags, re.compile('<li>(.*?)</li>', re.DOTALL).findall(html))
|
||||
if not r and html:
|
||||
r = [strip_tags(html)]
|
||||
return r
|
||||
|
||||
def parse_table(html):
|
||||
return [
|
||||
[
|
||||
strip_tags(r).strip().replace(' ', '')
|
||||
for r in x.split('<td width="305">-')
|
||||
]
|
||||
for x in find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
|
||||
]
|
||||
|
||||
def parse_text(html, title):
|
||||
return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
|
||||
|
||||
if __name__ == '__main__':
|
||||
print get_data('129689')
|
||||
# print get_data('177524')
|
||||
|
||||
77
Shared/lib/python3.4/site-packages/ox/web/amazon.py
Normal file
77
Shared/lib/python3.4/site-packages/ox/web/amazon.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from six.moves.urllib.parse import quote
|
||||
|
||||
from ox import find_re, strip_tags, decode_html
|
||||
from ox.cache import read_url
|
||||
|
||||
import lxml
|
||||
|
||||
|
||||
def findISBN(title, author):
|
||||
q = '%s %s' % (title, author)
|
||||
url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
|
||||
data = read_url(url, unicode=True)
|
||||
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
|
||||
id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
|
||||
data = get_data(id)
|
||||
if author in data['authors']:
|
||||
return data
|
||||
return {}
|
||||
|
||||
def get_data(id):
|
||||
url = "http://www.amazon.com/title/dp/%s/" % id
|
||||
data = read_url(url, unicode=True)
|
||||
|
||||
|
||||
def find_data(key):
|
||||
return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
|
||||
|
||||
r = {}
|
||||
r['amazon'] = url
|
||||
r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
|
||||
r['authors'] = []
|
||||
doc = lxml.html.document_fromstring(data)
|
||||
for e in doc.xpath("//span[contains(@class, 'author')]"):
|
||||
print e
|
||||
for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
|
||||
if 'Author' in secondary.text:
|
||||
author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
|
||||
if author:
|
||||
r['authors'].append(author[0].text.strip())
|
||||
else:
|
||||
r['authors'].append(e.xpath('.//a')[0].text.strip())
|
||||
break
|
||||
elif 'Translator' in secondary.text:
|
||||
r['translator'] = [e.xpath('.//a')[0].text]
|
||||
break
|
||||
r['publisher'] = find_data('Publisher')
|
||||
r['language'] = find_data('Language')
|
||||
r['isbn-10'] = find_data('ISBN-10')
|
||||
r['isbn-13'] = find_data('ISBN-13').replace('-', '')
|
||||
r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
|
||||
|
||||
r['pages'] = find_data('Paperback')
|
||||
if not r['pages']:
|
||||
r['pages'] = find_data('Hardcover')
|
||||
|
||||
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
||||
|
||||
for e in doc.xpath('//noscript'):
|
||||
for c in e.getchildren():
|
||||
if c.tag == 'div':
|
||||
r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
|
||||
break
|
||||
|
||||
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
|
||||
if r['cover']:
|
||||
r['cover'] = r['cover'][0].split('._BO2')[0]
|
||||
if not r['cover'].endswith('.jpg'):
|
||||
r['cover'] = r['cover'] + '.jpg'
|
||||
if 'no-image-avail-img' in r['cover']:
|
||||
del r['cover']
|
||||
else:
|
||||
del r['cover']
|
||||
return r
|
||||
|
||||
67
Shared/lib/python3.4/site-packages/ox/web/apple.py
Normal file
67
Shared/lib/python3.4/site-packages/ox/web/apple.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
import json
|
||||
import re
|
||||
|
||||
from ox.cache import read_url
|
||||
|
||||
HEADERS = {
|
||||
'User-Agent': 'iTunes/10.4 (Macintosh; Intel Mac OS X 10.7) AppleWebKit/534.48.3',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-us, en;q=0.50',
|
||||
'X-Apple-Store-Front': '143441-1,12',
|
||||
'X-Apple-Tz': '7200',
|
||||
'Accept-Encoding': 'gzip, deflate'
|
||||
}
|
||||
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
|
||||
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
|
||||
|
||||
def get_movie_data(title, director):
|
||||
if isinstance(title, unicode):
|
||||
title = title.encode('utf-8')
|
||||
if isinstance(director, unicode):
|
||||
director = director.encode('utf-8')
|
||||
data = {}
|
||||
# itunes section (preferred source for link)
|
||||
url = 'http://ax.search.itunes.apple.com/WebObjects/MZSearch.woa/wa/advancedSearch'
|
||||
url += '?media=movie&movieTerm=' + title
|
||||
url += '&actorNames=&directorProducerName=' + director
|
||||
url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'
|
||||
HEADERS['Referer'] = url
|
||||
html = read_url(url, headers=HEADERS, unicode=True)
|
||||
regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'
|
||||
regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'
|
||||
results = re.compile(regexp).findall(html)
|
||||
if results:
|
||||
data['link'] = results[0][0]
|
||||
data['poster'] = results[0][1].replace('140x140', '600x600')
|
||||
html = read_url(data['link'], headers=HEADERS, unicode=True)
|
||||
results = re.compile('video-preview-url="(.*?)"').findall(html)
|
||||
if results:
|
||||
data['trailer'] = results[0]
|
||||
# trailers section (preferred source for poster and trailer)
|
||||
host = 'http://trailers.apple.com'
|
||||
url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title
|
||||
js = json.loads(read_url(url, unicode=True)[16:-4])
|
||||
results = js['results']
|
||||
if results:
|
||||
url = host + results[0]['location']
|
||||
if not 'link' in data:
|
||||
data['link'] = url
|
||||
headers = {
|
||||
'User-Agent': USER_AGENT
|
||||
}
|
||||
html = read_url(url, headers=headers, unicode=True)
|
||||
results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
|
||||
if results:
|
||||
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
|
||||
html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
|
||||
results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
|
||||
if results:
|
||||
data['trailer'] = results[-1]
|
||||
return data
|
||||
|
||||
if __name__ == '__main__':
|
||||
print get_movie_data('Alphaville', 'Jean-Luc Godard')
|
||||
print get_movie_data('Sin City', 'Roberto Rodriguez')
|
||||
print get_movie_data('Breathless', 'Jean-Luc Godard')
|
||||
print get_movie_data('Capitalism: A Love Story', 'Michael Moore')
|
||||
print get_movie_data('Film Socialisme', 'Jean-Luc Godard')
|
||||
26
Shared/lib/python3.4/site-packages/ox/web/archive.py
Normal file
26
Shared/lib/python3.4/site-packages/ox/web/archive.py
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from .. import cache
|
||||
from ..utils import json
|
||||
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.archive.org/details/%s" % id
|
||||
|
||||
def get_data(id):
|
||||
data = {}
|
||||
url = get_url(id)
|
||||
details = cache.read_url('%s?output=json' % url)
|
||||
details = json.loads(details)
|
||||
for key in ('title', 'description', 'runtime'):
|
||||
data[key] = details['metadata'][key]
|
||||
if isinstance(data[key], list):
|
||||
data[key] = data[key][0]
|
||||
data['url'] = url
|
||||
data['image'] = 'http://archive.org/download/%s/format=thumbnail' % id
|
||||
data['ogg'] = 'http://archive.org/download/%s/format=Ogg+video' % id
|
||||
data['mp4'] = 'http://archive.org/download/%s/format=512Kb+MPEG4' % id
|
||||
return data
|
||||
|
||||
71
Shared/lib/python3.4/site-packages/ox/web/arsenalberlin.py
Normal file
71
Shared/lib/python3.4/site-packages/ox/web/arsenalberlin.py
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
from ox import find_re, strip_tags
|
||||
from ox.cache import read_url
|
||||
|
||||
def get_data(id, language='en'):
|
||||
if language == 'de':
|
||||
url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d/lang/de_DE' % id
|
||||
else:
|
||||
url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d' % id
|
||||
html = read_url(url, unicode=True)
|
||||
if 'ID does not exist' in html:
|
||||
return None
|
||||
if 'Willkommen in der Datenbank des Arsenal' in html:
|
||||
return None
|
||||
data = {}
|
||||
data[u'id'] = id
|
||||
data[u'url'] = url
|
||||
m = re.compile('<h1>(.*?)</h1>').findall(html)
|
||||
if m:
|
||||
data[u'title'] = m[0]
|
||||
m = re.compile("<b>Director: </b><a href='.*?'>(.*?)</a>").findall(html)
|
||||
if m:
|
||||
data[u'director'] = m[0]
|
||||
|
||||
m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
|
||||
if m:
|
||||
data[u'image'] = m[0]
|
||||
|
||||
units = re.compile("<div class='unit'>(.*?)</div>", re.DOTALL).findall(html)
|
||||
for x in map(re.compile('<b>(.*?)</b>: (.*)', re.DOTALL).findall, units):
|
||||
if x:
|
||||
#data[x[0][0].lower()] = strip_tags(x[0][1])
|
||||
key = x[0][0].lower()
|
||||
data[key] = x[0][1]
|
||||
if key == "forum catalogue pdf":
|
||||
data[key] = find_re(data[key], '"(http:.*?)"')
|
||||
else:
|
||||
data[key] = strip_tags(data[key])
|
||||
if "running time (minutes)" in data:
|
||||
data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60
|
||||
for key in ('year', 'length in metres', 'forum participation year', 'number of reels'):
|
||||
if key in data and data[key].isdigit():
|
||||
data[key] = int(data[key])
|
||||
return data
|
||||
|
||||
def backup(filename):
|
||||
if os.path.exists(filename):
|
||||
with open(filename) as f:
|
||||
data = json.load(f)
|
||||
else:
|
||||
data = {}
|
||||
start = max(map(int, data)) or 1
|
||||
for i in range(start, 11872):
|
||||
info = get_data(i)
|
||||
if info:
|
||||
data[i] = info
|
||||
if len(data) % 10 == 0:
|
||||
print 'save', filename, len(data)
|
||||
with open(filename, 'w') as f:
|
||||
json.dump(data, f)
|
||||
else:
|
||||
print 'ignore', i
|
||||
with open(filename, 'w') as f:
|
||||
json.dump(data, f)
|
||||
return data
|
||||
|
||||
33
Shared/lib/python3.4/site-packages/ox/web/auth.py
Normal file
33
Shared/lib/python3.4/site-packages/ox/web/auth.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2009
|
||||
import os
|
||||
|
||||
from ox.utils import json
|
||||
|
||||
def get(key):
|
||||
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
|
||||
auth = {}
|
||||
if os.path.exists(user_auth):
|
||||
f = open(user_auth, "r")
|
||||
data = f.read()
|
||||
f.close()
|
||||
auth = json.loads(data)
|
||||
if key in auth:
|
||||
return auth[key]
|
||||
print "please add key %s to json file '%s'" % (key, user_auth)
|
||||
raise Exception,"no key %s found" % key
|
||||
|
||||
def update(key, value):
|
||||
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
|
||||
auth = {}
|
||||
if os.path.exists(user_auth):
|
||||
f = open(user_auth, "r")
|
||||
data = f.read()
|
||||
f.close()
|
||||
auth = json.loads(data)
|
||||
auth[key] = value
|
||||
f = open(user_auth, "w")
|
||||
f.write(json.dumps(auth, indent=2))
|
||||
f.close()
|
||||
|
||||
100
Shared/lib/python3.4/site-packages/ox/web/criterion.py
Normal file
100
Shared/lib/python3.4/site-packages/ox/web/criterion.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
import ox.cache
|
||||
from ox.cache import read_url
|
||||
from ox.html import strip_tags
|
||||
from ox.text import find_re
|
||||
|
||||
import imdb
|
||||
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.criterion.com/films/%s" % id
|
||||
|
||||
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
||||
'''
|
||||
>>> get_data('1333').get('imdbId')
|
||||
u'0060304'
|
||||
|
||||
>>> get_data('236')['posters'][0]
|
||||
u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'
|
||||
|
||||
>>> get_data('786')['posters'][0]
|
||||
u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
|
||||
'''
|
||||
data = {
|
||||
"url": get_url(id)
|
||||
}
|
||||
try:
|
||||
html = read_url(data["url"], timeout=timeout, unicode=True)
|
||||
except:
|
||||
html = ox.cache.read_url(data["url"], timeout=timeout)
|
||||
data["number"] = find_re(html, "<li>Spine #(\d+)")
|
||||
|
||||
data["title"] = find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>")
|
||||
data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
|
||||
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
|
||||
results = find_re(html, '<div class="left_column">(.*?)</div>')
|
||||
results = re.compile("<li>(.*?)</li>").findall(results)
|
||||
data["country"] = results[0]
|
||||
data["year"] = results[1]
|
||||
data["synopsis"] = strip_tags(find_re(html, "<div class=\"content_block last\">.*?<p>(.*?)</p>"))
|
||||
|
||||
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
|
||||
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
|
||||
r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
|
||||
if r:
|
||||
result = r[0]
|
||||
result = find_re(result, "<a href=\"(.*?)\"")
|
||||
if not "/boxsets/" in result:
|
||||
data["posters"] = [result]
|
||||
else:
|
||||
html_ = read_url(result, unicode=True)
|
||||
result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
|
||||
result = find_re(result, "src=\"(.*?)\"")
|
||||
if result:
|
||||
data["posters"] = [result.replace("_w100", "")]
|
||||
else:
|
||||
data["posters"] = []
|
||||
data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
|
||||
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
|
||||
if result:
|
||||
data["stills"] = [result]
|
||||
data["trailers"] = []
|
||||
else:
|
||||
data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
|
||||
data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
|
||||
|
||||
if timeout == ox.cache.cache_timeout:
|
||||
timeout = -1
|
||||
if get_imdb:
|
||||
# removed year, as "title (year)" may fail to match
|
||||
data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
|
||||
return data
|
||||
|
||||
def get_ids(page=None):
|
||||
ids = []
|
||||
if page:
|
||||
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
|
||||
html = read_url(url)
|
||||
results = re.compile("films/(\d+)").findall(html)
|
||||
ids += results
|
||||
results = re.compile("boxsets/(.*?)\"").findall(html)
|
||||
for result in results:
|
||||
html = read_url("http://www.criterion.com/boxsets/" + result)
|
||||
results = re.compile("films/(\d+)").findall(html)
|
||||
ids += results
|
||||
return set(ids)
|
||||
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
|
||||
results = re.compile("\&p=(\d+)\&").findall(html)
|
||||
pages = max(map(int, results))
|
||||
for page in range(1, pages):
|
||||
ids += get_ids(page)
|
||||
return sorted(set(ids), key=int)
|
||||
|
||||
if __name__ == '__main__':
|
||||
print get_ids()
|
||||
21
Shared/lib/python3.4/site-packages/ox/web/dailymotion.py
Normal file
21
Shared/lib/python3.4/site-packages/ox/web/dailymotion.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from six.moves.urllib.parse import unquote
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def get_video_url(url):
|
||||
'''
|
||||
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
|
||||
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'
|
||||
|
||||
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
|
||||
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
|
||||
'''
|
||||
data = read_url(url)
|
||||
video = re.compile('''video", "(.*?)"''').findall(data)
|
||||
for v in video:
|
||||
v = unquote(v).split('@@')[0]
|
||||
return v
|
||||
return ''
|
||||
22
Shared/lib/python3.4/site-packages/ox/web/duckduckgo.py
Normal file
22
Shared/lib/python3.4/site-packages/ox/web/duckduckgo.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from six.moves import urllib
|
||||
import ox
|
||||
from ox import strip_tags, decode_html
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def find(query, timeout=ox.cache.cache_timeout):
|
||||
if not isinstance(query, bytes):
|
||||
query = query.encode('utf-8')
|
||||
params = urllib.parse.urlencode({'q': query})
|
||||
url = 'http://duckduckgo.com/html/?' + params
|
||||
data = read_url(url, timeout=timeout).decode('utf-8')
|
||||
results = []
|
||||
regex = '<a .*?class="large" href="(.+?)">(.*?)</a>.*?<div class="snippet">(.*?)</div>'
|
||||
for r in re.compile(regex, re.DOTALL).findall(data):
|
||||
results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
|
||||
return results
|
||||
|
||||
49
Shared/lib/python3.4/site-packages/ox/web/epguides.py
Normal file
49
Shared/lib/python3.4/site-packages/ox/web/epguides.py
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
import time
|
||||
|
||||
from ox import strip_tags, find_re
|
||||
from ox.cache import read_url
|
||||
|
||||
import google
|
||||
|
||||
|
||||
def get_show_url(title):
|
||||
'''
|
||||
Search Epguide Url for Show via Show Title.
|
||||
Use Google to search the url, this is also done on Epguide.
|
||||
'''
|
||||
for (name, url, desc) in google.find('allintitle: site:epguides.com %s' % title, 1):
|
||||
if url.startswith('http://epguides.com'):
|
||||
if re.search(title, name):
|
||||
return url
|
||||
return None
|
||||
|
||||
def get_show_data(url):
|
||||
data = read_url(url, unicode=True)
|
||||
r = {}
|
||||
r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
|
||||
r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
|
||||
r['episodes'] = {}
|
||||
#1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
|
||||
for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
|
||||
air_date = episode[3].strip()
|
||||
#'22 Sep 04' -> 2004-09-22
|
||||
try:
|
||||
air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
|
||||
except:
|
||||
pass
|
||||
s = episode[1].split('-')[0].strip()
|
||||
e = episode[1].split('-')[-1].strip()
|
||||
try:
|
||||
r['episodes']['S%02dE%02d' % (int(s), int(e))] = {
|
||||
'prod code': episode[2],
|
||||
'air date': air_date,
|
||||
'url': episode[4],
|
||||
'title':episode[5],
|
||||
}
|
||||
except:
|
||||
print "oxweb.epguides failed,", url
|
||||
return r
|
||||
|
||||
39
Shared/lib/python3.4/site-packages/ox/web/filmsdivision.py
Normal file
39
Shared/lib/python3.4/site-packages/ox/web/filmsdivision.py
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
import string
|
||||
import subprocess
|
||||
import ox
|
||||
import os
|
||||
|
||||
def get_ids():
|
||||
result = []
|
||||
for i in string.ascii_uppercase:
|
||||
url = "http://www.filmsdivision.org/search.php?title=%s" % i
|
||||
data = ox.cache.read_url(url)
|
||||
links = re.compile('view_video.php\?movId=(.*?)[\'"]', re.DOTALL).findall(data)
|
||||
result += links
|
||||
return list(set(result))
|
||||
|
||||
def get_data(id):
|
||||
result = {}
|
||||
url = "http://www.filmsdivision.org/view_video.php?movId=%s" % id
|
||||
data = ox.cache.read_url(url)
|
||||
result['title'] = re.compile('<td.*?class="vdoheadtxt".*?>(.*?)</td>').findall(data)[0]
|
||||
result['year'] = re.compile('Release: (\d{4})').findall(data)[0]
|
||||
result['duration'] = int(re.compile('Duration: (\d+)mins').findall(data)[0]) * 60
|
||||
result['producer'] = re.compile('Producer: (.*?)\t').findall(data)[0].strip()
|
||||
if 'Director:' in data:
|
||||
result['director'] = re.compile('Director: (.*?)\t').findall(data)[0].strip()
|
||||
else:
|
||||
result['director'] = "Unknown Director"
|
||||
result['url'] = re.compile('value="(.*?.wmv)"').findall(data)[0]
|
||||
return result
|
||||
|
||||
def download_video(url, filename):
|
||||
dirname = os.path.dirname(filename)
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
p = subprocess.Popen(['gst-launch', 'mmssrc', 'location=%s'%url, '!', 'filesink', 'locaiton='%filename])
|
||||
p.wait()
|
||||
return p.returncode == 0
|
||||
74
Shared/lib/python3.4/site-packages/ox/web/flixter.py
Normal file
74
Shared/lib/python3.4/site-packages/ox/web/flixter.py
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
|
||||
import re
|
||||
from lxml.html import document_fromstring
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
from ox.web.imdb import ImdbCombined
|
||||
|
||||
|
||||
def get_data(id, timeout=-1):
|
||||
'''
|
||||
>>> get_data('the-matrix')['poster']
|
||||
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
|
||||
|
||||
>>> get_data('0133093')['poster']
|
||||
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
|
||||
|
||||
>>> get_data('2-or-3-things-i-know-about-her')['poster']
|
||||
'http://content6.flixster.com/movie/10/95/43/10954392_gal.jpg'
|
||||
|
||||
>>> get_data('0078875')['rottentomatoes_id']
|
||||
'http://www.rottentomatoes.com/m/the-tin-drum/'
|
||||
'''
|
||||
if len(id) == 7:
|
||||
try:
|
||||
int(id)
|
||||
id = get_id(imdb=id)
|
||||
except:
|
||||
pass
|
||||
data = {
|
||||
"url": get_url(id),
|
||||
}
|
||||
html = read_url(data['url'], timeout=timeout, unicode=True)
|
||||
doc = document_fromstring(html)
|
||||
|
||||
props = {
|
||||
'og:title': 'title',
|
||||
'og:image': 'poster',
|
||||
'og:url': 'rottentomatoes_id',
|
||||
}
|
||||
for meta in doc.head.findall('meta'):
|
||||
prop = meta.attrib.get('property', None)
|
||||
content = meta.attrib.get('content', '')
|
||||
if prop in props and content:
|
||||
data[props[prop]] = content
|
||||
|
||||
for p in doc.body.find_class('synopsis'):
|
||||
data['synopsis'] = p.text.strip()
|
||||
|
||||
if 'poster' in data and data['poster']:
|
||||
data['poster'] = data['poster'].replace('_pro.jpg', '_gal.jpg')
|
||||
if not 'title' in data:
|
||||
return None
|
||||
return data
|
||||
|
||||
def get_id(url=None, imdb=None):
|
||||
'''
|
||||
>>> get_id(imdb='0133093')
|
||||
u'the-matrix'
|
||||
|
||||
#>>> get_id(imdb='0060304')
|
||||
#u'2-or-3-things-i-know-about-her'
|
||||
'''
|
||||
if imdb:
|
||||
i = ImdbCombined(imdb)
|
||||
title = i['title']
|
||||
return title.replace(' ', '-').lower().replace("'", '')
|
||||
return url.split('/')[-1]
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.flixster.com/movie/%s"%id
|
||||
|
||||
42
Shared/lib/python3.4/site-packages/ox/web/freebase.py
Normal file
42
Shared/lib/python3.4/site-packages/ox/web/freebase.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import json
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re
|
||||
|
||||
class Freebase(dict):
|
||||
def __init__(self, id, timeout=-1):
|
||||
url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id
|
||||
'''
|
||||
"http://graph.freebase.com/imdb.title.tt%s" % id
|
||||
might also be of interest at some point, right now not much info
|
||||
'''
|
||||
data = read_url(url, unicode=True)
|
||||
try:
|
||||
data = json.loads(data)
|
||||
except ValueError:
|
||||
return
|
||||
'''
|
||||
for key in data:
|
||||
self[key] = data[key]
|
||||
'''
|
||||
for key in ('id', 'guid', 'name'):
|
||||
self[key] = data[key]
|
||||
keys = {
|
||||
'wikipedia': '/wikipedia/en',
|
||||
'netflix': '/authority/netflix/movie',
|
||||
'nytimes': '/source/nytimes/movie',
|
||||
'metacritic': '/source/metacritic/movie',
|
||||
}
|
||||
for key in keys:
|
||||
links = filter(lambda x: x['namespace'] == keys[key],data['ids'])
|
||||
if links:
|
||||
self[key] = links[0]['uri']
|
||||
|
||||
if 'nytimes' in self:
|
||||
self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
|
||||
self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')
|
||||
|
||||
|
||||
|
||||
44
Shared/lib/python3.4/site-packages/ox/web/google.py
Normal file
44
Shared/lib/python3.4/site-packages/ox/web/google.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from six.moves import urllib
|
||||
|
||||
import ox
|
||||
from ox import strip_tags, decode_html
|
||||
|
||||
DEFAULT_MAX_RESULTS = 10
|
||||
DEFAULT_TIMEOUT = 24*60*60
|
||||
|
||||
def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
|
||||
return ox.cache.read_url(url, data, headers, timeout, unicode=True)
|
||||
|
||||
def quote_plus(s):
|
||||
if not isinstance(s, bytes):
|
||||
s = s.encode('utf-8')
|
||||
return urllib.parse.quote_plus(s)
|
||||
|
||||
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
|
||||
"""
|
||||
Return max_results tuples with title, url, description
|
||||
|
||||
>>> find("The Matrix site:imdb.com", 1)[0][0]
|
||||
u'The Matrix (1999) - IMDb'
|
||||
|
||||
>>> find("The Matrix site:imdb.com", 1)[0][1]
|
||||
u'http://www.imdb.com/title/tt0133093/'
|
||||
"""
|
||||
results = []
|
||||
offset = 0
|
||||
while len(results) < max_results:
|
||||
url = 'http://google.com/search?q=%s' % quote_plus(query)
|
||||
if offset:
|
||||
url += '&start=%d' % offset
|
||||
data = read_url(url, timeout=timeout)
|
||||
data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
|
||||
for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
|
||||
results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
|
||||
if len(results) >= max_results:
|
||||
break
|
||||
offset += 10
|
||||
return results
|
||||
|
||||
821
Shared/lib/python3.4/site-packages/ox/web/imdb.py
Normal file
821
Shared/lib/python3.4/site-packages/ox/web/imdb.py
Normal file
|
|
@ -0,0 +1,821 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
import time
|
||||
import unicodedata
|
||||
|
||||
from six.moves import urllib
|
||||
from six import string_types
|
||||
|
||||
|
||||
from .. import find_re, strip_tags, decode_html
|
||||
from .. import cache
|
||||
|
||||
|
||||
from . siteparser import SiteParser
|
||||
from . import duckduckgo
|
||||
from ..utils import datetime
|
||||
from ..geo import normalize_country_name
|
||||
|
||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||
headers = headers.copy()
|
||||
return cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.imdb.com/title/tt%s/" % id
|
||||
|
||||
class Imdb(SiteParser):
|
||||
'''
|
||||
>>> Imdb('0068646')['title']
|
||||
u'The Godfather'
|
||||
|
||||
>>> Imdb('0133093')['title']
|
||||
u'The Matrix'
|
||||
'''
|
||||
regex = {
|
||||
'alternativeTitles': {
|
||||
'page': 'releaseinfo',
|
||||
're': [
|
||||
'name="akas".*?<table.*?>(.*?)</table>',
|
||||
"td>(.*?)</td>.*?<td>(.*?)</td>"
|
||||
],
|
||||
'type': 'list'
|
||||
|
||||
},
|
||||
'aspectratio': {
|
||||
'page': 'combined',
|
||||
're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
|
||||
'type': 'float',
|
||||
},
|
||||
'budget': {
|
||||
'page': 'business',
|
||||
're': [
|
||||
'<h5>Budget</h5>\s*?\$(.*?)<br',
|
||||
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'cast': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
|
||||
lambda ll: [strip_tags(l) for l in ll]
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'cinematographer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Cinematography by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'connections': {
|
||||
'page': 'trivia?tab=mc',
|
||||
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
|
||||
'type': 'list'
|
||||
},
|
||||
'country': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<div class="info"><h5>Country:</h5>.*?<div class="info">',
|
||||
#'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
|
||||
'<a.*?>(.*?)</a>',
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'creator': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Creator.?:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'director': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('<b>Series Crew</b>')[0],
|
||||
'Directed by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'_director': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'editor': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Film Editing by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'composer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Original Music by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'episodeTitle': {
|
||||
'page': 'combined',
|
||||
're': '<div id="tn15title">.*?<em>(.*?)</em>',
|
||||
'type': 'string'
|
||||
},
|
||||
'filmingLocations': {
|
||||
'page': 'locations',
|
||||
're': [
|
||||
'<a href="/search/title\?locations=.*?".*?>(.*?)</a>',
|
||||
lambda data: data.strip(),
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'genre': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Genre:</h5>(.*?)<hr',
|
||||
'<a href="/Sections/Genres/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'gross': {
|
||||
'page': 'business',
|
||||
're': [
|
||||
'<h5>Gross</h5>\s*?\$(.*?)<br',
|
||||
lambda data: find_re(data.replace(',', ''), '\d+')
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'keyword': {
|
||||
'page': 'keywords',
|
||||
're': '<a href="/keyword/.*?>(.*?)</a>',
|
||||
'type': 'list'
|
||||
},
|
||||
'language': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<div class="info"><h5>Language:</h5>.*?<div class="info">',
|
||||
#'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
|
||||
'<a.*?>(.*?)</a>',
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'summary': {
|
||||
'page': 'plotsummary',
|
||||
're': '<p class="plotSummary">(.*?)<\/p>',
|
||||
'type': 'string'
|
||||
},
|
||||
'posterId': {
|
||||
'page': 'combined',
|
||||
're': '/primary-photo/media/rm(.*?)/tt',
|
||||
'type': 'string'
|
||||
},
|
||||
'posterIds': {
|
||||
'page': 'posters',
|
||||
're': '/unknown-thumbnail/media/rm(.*?)/tt',
|
||||
'type': 'list'
|
||||
},
|
||||
'producer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Produced by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'productionCompany': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'Production Companies</b><ul>(.*?)</ul>',
|
||||
'<a href="/company/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'rating': {
|
||||
'page': 'combined',
|
||||
're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
|
||||
'type': 'float'
|
||||
},
|
||||
'releasedate': {
|
||||
'page': 'releaseinfo',
|
||||
're': [
|
||||
'<td class="release_date">(.*?)</td>',
|
||||
strip_tags,
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'reviews': {
|
||||
'page': 'externalreviews',
|
||||
're': [
|
||||
'<ol>(.*?)</ol>',
|
||||
'<li><a href="(http.*?)".*?>(.*?)</a></li>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'runtime': {
|
||||
'page': 'combined',
|
||||
're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
|
||||
'type': 'string'
|
||||
},
|
||||
'color': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Color:</h5><div class="info-content">(.*?)</div>',
|
||||
'<a.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'sound': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
|
||||
'<a.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'season': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'\(Season (\d+), Episode \d+\)',
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'episode': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'\(Season \d+, Episode (\d+)\)',
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'series': {
|
||||
'page': 'combined',
|
||||
're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
|
||||
'type': 'string'
|
||||
},
|
||||
'isSeries': {
|
||||
'page': 'combined',
|
||||
're': '<span class="tv-extra">(TV series|TV mini-series) ',
|
||||
'type': 'string'
|
||||
},
|
||||
'title': {
|
||||
'page': 'combined',
|
||||
're': '<h1>(.*?) <span>',
|
||||
'type': 'string'
|
||||
},
|
||||
'trivia': {
|
||||
'page': 'trivia',
|
||||
're': [
|
||||
'<div class="sodatext">(.*?)<(br|/div)',
|
||||
lambda data: data[0]
|
||||
],
|
||||
'type': 'list',
|
||||
},
|
||||
'votes': {
|
||||
'page': 'combined',
|
||||
're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
|
||||
'type': 'string'
|
||||
},
|
||||
'writer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Writing credits</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'year': {
|
||||
'page': 'combined',
|
||||
're': '="og:title" content="[^"]*?\((\d{4}).*?"',
|
||||
'type': 'int'
|
||||
}
|
||||
}
|
||||
|
||||
def read_url(self, url, timeout):
|
||||
if not url in self._cache:
|
||||
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
|
||||
return self._cache[url]
|
||||
|
||||
def __init__(self, id, timeout=-1):
|
||||
#use akas.imdb.com to always get original title:
|
||||
#http://www.imdb.com/help/show_leaf?titlelanguagedisplay
|
||||
self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
|
||||
super(Imdb, self).__init__(timeout)
|
||||
|
||||
url = self.baseUrl + 'combined'
|
||||
page = self.read_url(url, timeout=-1)
|
||||
if '<title>IMDb: Page not found</title>' in page \
|
||||
or 'The requested URL was not found on our server.' in page:
|
||||
return
|
||||
if "<p>We're sorry, something went wrong.</p>" in page:
|
||||
time.sleep(1)
|
||||
super(Imdb, self).__init__(0)
|
||||
|
||||
if 'alternativeTitles' in self:
|
||||
if len(self['alternativeTitles']) == 2 and \
|
||||
isinstance(self['alternativeTitles'][0], string_types):
|
||||
self['alternativeTitles'] = [self['alternativeTitles']]
|
||||
|
||||
#normalize country names
|
||||
if 'country' in self:
|
||||
self['country'] = [normalize_country_name(c) or c for c in self['country']]
|
||||
|
||||
if 'sound' in self:
|
||||
self['sound'] = list(set(self['sound']))
|
||||
|
||||
types = {}
|
||||
stop_words = [
|
||||
'alternative spelling',
|
||||
'alternative title',
|
||||
'alternative transliteration',
|
||||
'closing credits title',
|
||||
'complete title',
|
||||
'IMAX version',
|
||||
'informal short title',
|
||||
'International (Spanish title)',
|
||||
'Japan (imdb display title)',
|
||||
'longer version',
|
||||
'new title',
|
||||
'original subtitled version',
|
||||
'pre-release title',
|
||||
'promotional abbreviation',
|
||||
'recut version',
|
||||
'reissue title',
|
||||
'restored version',
|
||||
'script title',
|
||||
'short title',
|
||||
'(subtitle)',
|
||||
'TV title',
|
||||
'working title',
|
||||
'World-wide (Spanish title)',
|
||||
]
|
||||
#ignore english japanese titles
|
||||
#for movies that are not only from japan
|
||||
if ['Japan'] != self.get('country', []):
|
||||
stop_words += [
|
||||
'Japan (English title)'
|
||||
]
|
||||
for t in self.get('alternativeTitles', []):
|
||||
for type in t[0].split('/'):
|
||||
type = type.strip()
|
||||
stop_word = False
|
||||
for key in stop_words:
|
||||
if key in type:
|
||||
stop_word = True
|
||||
break
|
||||
if not stop_word:
|
||||
if not type in types:
|
||||
types[type] = []
|
||||
types[type].append(t[1])
|
||||
titles = {}
|
||||
for type in types:
|
||||
for title in types[type]:
|
||||
if not title in titles:
|
||||
titles[title] = []
|
||||
titles[title].append(type)
|
||||
def select_title(type):
|
||||
title = types[type][0]
|
||||
count = 0
|
||||
if len(types[type]) > 1:
|
||||
for t in types[type]:
|
||||
if len(titles[t]) > count:
|
||||
count = len(titles[t])
|
||||
title = t
|
||||
return title
|
||||
|
||||
#FIXME: does work in python2.6, possible to import from __future__?
|
||||
#types = {type: select_title(type) for type in types}
|
||||
_types = {}
|
||||
for type in types:
|
||||
_types[type] = select_title(type)
|
||||
types = _types
|
||||
|
||||
regexps = [
|
||||
"^.+ \(imdb display title\) \(English title\)$",
|
||||
"^USA \(imdb display title\)$",
|
||||
"^International \(English title\)$",
|
||||
"^International \(English title\)$",
|
||||
"^UK \(imdb display title\)$",
|
||||
"^International \(.+\) \(English title\)$",
|
||||
"^World-wide \(English title\)$",
|
||||
]
|
||||
if 'Hong Kong' in self.get('country', []):
|
||||
regexps += [
|
||||
"Hong Kong \(English title\)"
|
||||
]
|
||||
english_countries = (
|
||||
'USA', 'UK', 'United States', 'United Kingdom',
|
||||
'Australia', 'New Zealand'
|
||||
)
|
||||
if not filter(lambda c: c in english_countries, self.get('country', [])):
|
||||
regexps += [
|
||||
"^[^(]+ \(English title\)$",
|
||||
"^.+ \(.+\) \(English title\)$",
|
||||
"^USA$",
|
||||
"^UK$",
|
||||
"^USA \(.+\)$",
|
||||
"^UK \(.+\)$",
|
||||
"^Australia \(.+\)$",
|
||||
"World-wide \(English title\)",
|
||||
"\(literal English title\)",
|
||||
"^International \(.+ title\)$",
|
||||
"^International \(.+\) \(.+ title\)$",
|
||||
]
|
||||
for regexp in regexps:
|
||||
for type in types:
|
||||
if re.compile(regexp).findall(type):
|
||||
#print types[type], type
|
||||
self['internationalTitle'] = types[type]
|
||||
break
|
||||
if 'internationalTitle' in self:
|
||||
break
|
||||
|
||||
def cleanup_title(title):
|
||||
if title.startswith('"') and title.endswith('"'):
|
||||
title = title[1:-1]
|
||||
if title.startswith("'") and title.endswith("'"):
|
||||
title = title[1:-1]
|
||||
title = re.sub('\(\#[.\d]+\)', '', title)
|
||||
return title.strip()
|
||||
|
||||
for t in ('title', 'internationalTitle'):
|
||||
if t in self:
|
||||
self[t] = cleanup_title(self[t])
|
||||
|
||||
if 'internationalTitle' in self and \
|
||||
self.get('title', '').lower() == self['internationalTitle'].lower():
|
||||
del self['internationalTitle']
|
||||
|
||||
if 'alternativeTitles' in self:
|
||||
alt = {}
|
||||
for t in self['alternativeTitles']:
|
||||
title = cleanup_title(t[1])
|
||||
if title not in (self.get('title'), self.get('internationalTitle')):
|
||||
if title not in alt:
|
||||
alt[title] = []
|
||||
for c in t[0].split('/'):
|
||||
if not '(working title)' in c:
|
||||
c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
|
||||
if c:
|
||||
alt[title].append(c)
|
||||
self['alternativeTitles'] = []
|
||||
for t in sorted(alt, key=lambda a: sorted(alt[a])):
|
||||
if alt[t]:
|
||||
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
|
||||
self['alternativeTitles'].append((t, countries))
|
||||
if not self['alternativeTitles']:
|
||||
del self['alternativeTitles']
|
||||
|
||||
if 'internationalTitle' in self:
|
||||
self['originalTitle'] = self['title']
|
||||
self['title'] = self.pop('internationalTitle')
|
||||
|
||||
if 'runtime' in self and self['runtime']:
|
||||
if 'min' in self['runtime']: base=60
|
||||
else: base=1
|
||||
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
|
||||
if 'runtime' in self and not self['runtime']:
|
||||
del self['runtime']
|
||||
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
|
||||
|
||||
if 'cast' in self:
|
||||
if isinstance(self['cast'][0], string_types):
|
||||
self['cast'] = [self['cast']]
|
||||
self['actor'] = [c[0] for c in self['cast']]
|
||||
def cleanup_character(c):
|
||||
c = c.replace('(uncredited)', '').strip()
|
||||
return c
|
||||
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
|
||||
for x in self['cast']]
|
||||
|
||||
if 'connections' in self:
|
||||
cc={}
|
||||
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
|
||||
self['connections'] = [self['connections']]
|
||||
for rel, data, _ in self['connections']:
|
||||
if isinstance(rel, bytes):
|
||||
rel = rel.decode('utf-8')
|
||||
#cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
|
||||
def get_conn(c):
|
||||
r = {
|
||||
'id': c[0],
|
||||
'title': cleanup_title(c[1]),
|
||||
}
|
||||
description = c[2].split('<br />')
|
||||
if len(description) == 2 and description[-1].strip() != '-':
|
||||
r['description'] = description[-1].strip()
|
||||
return r
|
||||
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
|
||||
|
||||
|
||||
self['connections'] = cc
|
||||
|
||||
for key in ('country', 'genre'):
|
||||
if key in self:
|
||||
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
|
||||
#0092999
|
||||
if '_director' in self:
|
||||
if 'series' in self or 'isSeries' in self:
|
||||
self['creator'] = self.pop('_director')
|
||||
else:
|
||||
del self['_director']
|
||||
if 'isSeries' in self:
|
||||
del self['isSeries']
|
||||
self['isSeries'] = True
|
||||
if 'episodeTitle' in self:
|
||||
self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle'])
|
||||
|
||||
if 'series' in self:
|
||||
series = Imdb(self['series'], timeout=timeout)
|
||||
self['seriesTitle'] = series['title']
|
||||
if 'episodeTitle' in self:
|
||||
self['seriesTitle'] = series['title']
|
||||
if 'season' in self and 'episode' in self:
|
||||
self['title'] = "%s (S%02dE%02d) %s" % (
|
||||
self['seriesTitle'], self['season'], self['episode'], self['episodeTitle'])
|
||||
else:
|
||||
self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle'])
|
||||
self['season'] = 1
|
||||
self['title'] = self['title'].strip()
|
||||
if 'director' in self:
|
||||
self['episodeDirector'] = self['director']
|
||||
|
||||
if not 'creator' in series and 'director' in series:
|
||||
series['creator'] = series['director']
|
||||
if len(series['creator']) > 10:
|
||||
series['creator'] = series['director'][:1]
|
||||
|
||||
for key in ['creator', 'country']:
|
||||
if key in series:
|
||||
self[key] = series[key]
|
||||
|
||||
if 'year' in series:
|
||||
self['seriesYear'] = series['year']
|
||||
if not 'year' in self:
|
||||
self['year'] = series['year']
|
||||
|
||||
if 'year' in self:
|
||||
self['episodeYear'] = self['year']
|
||||
if 'creator' in self:
|
||||
self['seriesDirector'] = self['creator']
|
||||
if 'originalTitle' in self:
|
||||
del self['originalTitle']
|
||||
else:
|
||||
for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'):
|
||||
if key in self:
|
||||
del self[key]
|
||||
if 'creator' in self:
|
||||
if 'director' in self:
|
||||
self['episodeDirector'] = self['director']
|
||||
self['director'] = self['creator']
|
||||
|
||||
#make lists unique but keep order
|
||||
for key in ('director', 'language'):
|
||||
if key in self:
|
||||
self[key] = [x for i,x in enumerate(self[key])
|
||||
if x not in self[key][i+1:]]
|
||||
|
||||
for key in ('actor', 'writer', 'producer', 'editor', 'composer'):
|
||||
if key in self:
|
||||
if isinstance(self[key][0], list):
|
||||
self[key] = [i[0] for i in self[key] if i]
|
||||
self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
|
||||
|
||||
|
||||
if 'budget' in self and 'gross' in self:
|
||||
self['profit'] = self['gross'] - self['budget']
|
||||
|
||||
if 'releasedate' in self:
|
||||
def parse_date(d):
|
||||
try:
|
||||
d = datetime.strptime(d, '%d %B %Y')
|
||||
except:
|
||||
try:
|
||||
d = datetime.strptime(d, '%B %Y')
|
||||
except:
|
||||
return 'x'
|
||||
return '%d-%02d-%02d' % (d.year, d.month, d.day)
|
||||
self['releasedate'] = min([
|
||||
parse_date(d) for d in self['releasedate']
|
||||
])
|
||||
if self['releasedate'] == 'x':
|
||||
del self['releasedate']
|
||||
if 'summary' in self:
|
||||
if isinstance(self['summary'], list):
|
||||
self['summary'] = self['summary'][0]
|
||||
self['summary'] = self['summary'].split('</p')[0].strip()
|
||||
|
||||
class ImdbCombined(Imdb):
|
||||
def __init__(self, id, timeout=-1):
|
||||
_regex = {}
|
||||
for key in self.regex:
|
||||
if self.regex[key]['page'] in ('combined', 'releaseinfo'):
|
||||
_regex[key] = self.regex[key]
|
||||
self.regex = _regex
|
||||
super(ImdbCombined, self).__init__(id, timeout)
|
||||
|
||||
def get_movie_by_title(title, timeout=-1):
|
||||
'''
|
||||
This only works for exact title matches from the data dump
|
||||
Usually in the format
|
||||
Title (Year)
|
||||
"Series Title" (Year) {(#Season.Episode)}
|
||||
"Series Title" (Year) {Episode Title (#Season.Episode)}
|
||||
|
||||
If there is more than one film with that title for the year
|
||||
Title (Year/I)
|
||||
|
||||
>>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')
|
||||
u'1602860'
|
||||
|
||||
>>> get_movie_by_title(u'The Matrix (1999)')
|
||||
u'0133093'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt (1951)')
|
||||
u'0043748'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt (1897/I)')
|
||||
u'0214882'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt')
|
||||
None
|
||||
|
||||
>>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
|
||||
u'0866567'
|
||||
'''
|
||||
params = {'s':'tt','q': title}
|
||||
if not isinstance(title, bytes):
|
||||
try:
|
||||
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
|
||||
except:
|
||||
params['q'] = params['q'].encode('utf-8')
|
||||
params = urllib.urlencode(params)
|
||||
url = "http://akas.imdb.com/find?" + params
|
||||
data = read_url(url, timeout=timeout, unicode=True)
|
||||
#if search results in redirect, get id of current page
|
||||
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
|
||||
results = re.compile(r).findall(data)
|
||||
if results:
|
||||
return results[0]
|
||||
return None
|
||||
|
||||
def get_movie_id(title, director='', year='', timeout=-1):
|
||||
'''
|
||||
>>> get_movie_id('The Matrix')
|
||||
u'0133093'
|
||||
|
||||
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
|
||||
u'0060304'
|
||||
|
||||
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
|
||||
u'0060304'
|
||||
|
||||
>>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
|
||||
u'0179214'
|
||||
|
||||
>>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
|
||||
u'0179214'
|
||||
'''
|
||||
imdbId = {
|
||||
(u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
|
||||
(u'Wings', u'Larisa Shepitko'): '0061196',
|
||||
(u'The Ascent', u'Larisa Shepitko'): '0075404',
|
||||
(u'Fanny and Alexander', u'Ingmar Bergman'): '0083922',
|
||||
(u'Torment', u'Alf Sj\xf6berg'): '0036914',
|
||||
(u'Crisis', u'Ingmar Bergman'): '0038675',
|
||||
(u'To Joy', u'Ingmar Bergman'): '0043048',
|
||||
(u'Humain, trop humain', u'Louis Malle'): '0071635',
|
||||
(u'Place de la R\xe9publique', u'Louis Malle'): '0071999',
|
||||
(u'God\u2019s Country', u'Louis Malle'): '0091125',
|
||||
(u'Flunky, Work Hard', u'Mikio Naruse'): '0022036',
|
||||
(u'The Courtesans of Bombay', u'Richard Robbins') : '0163591',
|
||||
(u'Je tu il elle', u'Chantal Akerman') : '0071690',
|
||||
(u'Hotel Monterey', u'Chantal Akerman') : '0068725',
|
||||
(u'No Blood Relation', u'Mikio Naruse') : '023261',
|
||||
(u'Apart from You', u'Mikio Naruse') : '0024214',
|
||||
(u'Every-Night Dreams', u'Mikio Naruse') : '0024793',
|
||||
(u'Street Without End', u'Mikio Naruse') : '0025338',
|
||||
(u'Sisters of the Gion', u'Kenji Mizoguchi') : '0027672',
|
||||
(u'Osaka Elegy', u'Kenji Mizoguchi') : '0028021',
|
||||
(u'Blaise Pascal', u'Roberto Rossellini') : '0066839',
|
||||
(u'Japanese Girls at the Harbor', u'Hiroshi Shimizu') : '0160535',
|
||||
(u'The Private Life of Don Juan', u'Alexander Korda') : '0025681',
|
||||
(u'Last Holiday', u'Henry Cass') : '0042665',
|
||||
(u'A Colt Is My Passport', u'Takashi Nomura') : '0330536',
|
||||
(u'Androcles and the Lion', u'Chester Erskine') : '0044355',
|
||||
(u'Major Barbara', u'Gabriel Pascal') : '0033868',
|
||||
(u'Come On Children', u'Allan King') : '0269104',
|
||||
|
||||
(u'Jimi Plays Monterey & Shake! Otis at Monterey', u'D. A. Pennebaker and Chris Hegedus') : '',
|
||||
(u'Martha Graham: Dance on Film', u'Nathan Kroll') : '',
|
||||
(u'Carmen', u'Carlos Saura'): '0085297',
|
||||
(u'The Story of a Cheat', u'Sacha Guitry'): '0028201',
|
||||
(u'Weekend', 'Andrew Haigh'): '1714210',
|
||||
}.get((title, director), None)
|
||||
if imdbId:
|
||||
return imdbId
|
||||
    params = {'s': 'tt', 'q': title}
    if director:
        params['q'] = u'"%s" %s' % (title, director)
    if year:
        params['q'] = u'"%s (%s)" %s' % (title, year, director)
    google_query = "site:imdb.com %s" % params['q']
    if not isinstance(params['q'], bytes):
        try:
            params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
        except:
            params['q'] = params['q'].encode('utf-8')
    params = urllib.urlencode(params)
    url = "http://akas.imdb.com/find?" + params
    #print url

    data = read_url(url, timeout=timeout, unicode=True)
    #if search results in redirect, get id of current page
    r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
    results = re.compile(r).findall(data)
    if results:
        return results[0]
    #otherwise get first result
    r = '<td valign="top">.*?<a href="/title/tt(\d{7})/"'
    results = re.compile(r).findall(data)
    if results:
        return results[0]

    #print (title, director), ": '',"
    #print google_query
    #results = google.find(google_query, timeout=timeout)
    results = duckduckgo.find(google_query, timeout=timeout)
    if results:
        for r in results[:2]:
            imdbId = find_re(r[1], 'title/tt(\d{7})')
            if imdbId:
                return imdbId
    #or nothing
    return ''

def get_movie_poster(imdbId):
    '''
    >>> get_movie_poster('0133093')
    'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'

    >>> get_movie_poster('0994352')
    'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
    '''
    info = ImdbCombined(imdbId)
    if 'posterId' in info:
        url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
        data = read_url(url).decode('utf-8', 'ignore')
        poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
        return poster
    elif 'series' in info:
        return get_movie_poster(info['series'])
    return ''

def get_episodes(imdbId, season=None):
    episodes = {}
    url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
    if season:
        url += '?season=%d' % season
        data = cache.read_url(url)
        for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
            episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
    else:
        data = cache.read_url(url)
        match = re.compile('<strong>Season (\d+)</strong>').findall(data)
        if match:
            for season in range(1, int(match[0]) + 1):
                episodes.update(get_episodes(imdbId, season))
    return episodes

def max_votes():
    url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
    data = cache.read_url(url)
    votes = max([int(v.replace(',', ''))
                 for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
    return votes

def guess(title, director='', timeout=-1):
    return get_movie_id(title, director, timeout=timeout)

if __name__ == "__main__":
    import json
    print(json.dumps(Imdb('0306414'), indent=2))
    #print json.dumps(Imdb('0133093'), indent=2)
300
Shared/lib/python3.4/site-packages/ox/web/impawards.py
Normal file
@ -0,0 +1,300 @@
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# encoding: utf-8
|
||||
import re
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox.html import strip_tags
|
||||
from ox.text import find_re
|
||||
|
||||
|
||||
def get_data(id):
|
||||
'''
|
||||
>>> get_data('1991/silence_of_the_lambs')['imdbId']
|
||||
u'0102926'
|
||||
|
||||
>>> get_data('1991/silence_of_the_lambs')['posters'][0]
|
||||
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
|
||||
|
||||
>>> get_data('1991/silence_of_the_lambs')['url']
|
||||
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
|
||||
'''
|
||||
data = {
|
||||
'url': get_url(id)
|
||||
}
|
||||
html = read_url(data['url'], unicode=True)
|
||||
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
|
||||
if not data['imdbId']:
|
||||
data['imdbId'] = _id_map.get(id, '')
|
||||
data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
|
||||
data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
|
||||
data['posters'] = []
|
||||
poster = find_re(html, '<img src="(posters.*?)"')
|
||||
if poster:
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
|
||||
data['posters'].append(poster)
|
||||
results = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
result = result.replace('_xlg.html', '.html')
|
||||
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
|
||||
html = read_url(url, unicode=True)
|
||||
result = find_re(html, '<a href = (\w*?_xlg.html)')
|
||||
if result:
|
||||
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
|
||||
html = read_url(url, unicode=True)
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
|
||||
else:
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
|
||||
data['posters'].append(poster)
|
||||
|
||||
return data
|
||||
|
||||
def get_id(url):
|
||||
split = url.split('/')
|
||||
year = split[3]
|
||||
split = split[4][:-5].split('_')
|
||||
if split[-1] == 'xlg':
|
||||
split.pop()
|
||||
if find_re(split[-1], 'ver\d+$'):
|
||||
split.pop()
|
||||
id = '%s/%s' % (year, '_'.join(split))
|
||||
return id
|
||||
|
||||
def get_ids(page=None):
|
||||
ids = []
|
||||
if page:
|
||||
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
|
||||
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
url = 'http://impawards.com/%s' % result
|
||||
ids.append(get_id(url))
|
||||
return set(ids)
|
||||
#get all
|
||||
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
|
||||
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
|
||||
for page in range(pages, 0, -1):
|
||||
for id in get_ids(page):
|
||||
if not id in ids:
|
||||
ids.append(id)
|
||||
return ids
|
||||
|
||||
def get_url(id):
|
||||
url = u"http://www.impawards.com/%s.html" % id
|
||||
html = read_url(url, unicode=True)
|
||||
if find_re(html, "No Movie Posters on This Page"):
|
||||
url = u"http://www.impawards.com/%s_ver1.html" % id
|
||||
return url
|
||||
|
||||
_id_map = {
|
||||
'1933/forty_second_street': '0024034',
|
||||
'1933/tarzan_the_fearless': '0024645',
|
||||
'1935/informer': '0026529',
|
||||
'1935/thirty_nine_steps': '0026529',
|
||||
'1935/top_hat': '0027125',
|
||||
'1938/charlie_chaplin_cavalcade': '0284687',
|
||||
'1943/falcon_and_the_co-eds': '035855',
|
||||
'1969/angel_angel_down_we_go': '0065602',
|
||||
'1970/crimson_altar': '0062833',
|
||||
'1975/man_who_would_be_king_ver1': '0073341',
|
||||
'1975/picnic_at_hanging_rock_ver1': '0073540',
|
||||
'1979/electric_horseman_ver1': '0079100',
|
||||
'1980/caligula_ver1': '0080491',
|
||||
'1980/hollywood_knights_ver1': '0080881',
|
||||
'1981/history_of_the_world_part_i': '0082517',
|
||||
'1981/sea_wolves': '0081470',
|
||||
'1983/krull_ver1': '0085811',
|
||||
'1985/warriors_of_the_wind': '0087544',
|
||||
'1989/friday_the_thirteenth_part_viii_ver1': '0097388',
|
||||
'1989/high_hopes': '0095302',
|
||||
'1989/millenium': '0097883',
|
||||
'1989/story_of_women': '0096336',
|
||||
'1990/edward_scissorhands_ver1': '0099487',
|
||||
'1991/freddys_dead_ver1': '0101917',
|
||||
'1993/robocop_three_ver1': '0107978',
|
||||
'1993/waynes_world_two_ver1': '0108525',
|
||||
'1994/above_the_rim_ver1': '0109035',
|
||||
'1994/helas_pour_moi': '0107175',
|
||||
'1994/house_of_the_spirits_ver1': '0107151',
|
||||
'1994/i_dont_want_to_talk_about_it': '0106678',
|
||||
'1994/in_custody': '0107199',
|
||||
'1994/ladybird_ladybird': '0110296',
|
||||
'1994/leon_the_pig_farmer': '0104710',
|
||||
'1994/love_after_love': '0103710',
|
||||
'1994/l_six_two_seven': '0104658',
|
||||
'1994/martin_lawrence_you_so_crazy_ver1': '0111804',
|
||||
'1994/savage_nights': '0105032',
|
||||
'1994/sex_drugs_and_democracy': '0111135',
|
||||
'1995/bye_bye_love': '0112606',
|
||||
'1995/cold_comfort_farm': '0112701',
|
||||
'1995/gumby_the_movie': '0113234',
|
||||
'1995/les_miserables': '0113828',
|
||||
'1995/mystery_of_rampo': '0110943',
|
||||
'1995/pharaohs_army': '0114122',
|
||||
'1995/pure_formality': '0110917',
|
||||
'1995/quick_and_the_dead_ver1': '0114214',
|
||||
'1995/reflections_in_the_dark': '0110956',
|
||||
'1995/safe_ver1': '0114323',
|
||||
'1995/search_and_destroy': '0114371',
|
||||
'1995/secret_of_roan_inish_ver1': '0111112',
|
||||
'1995/underneath': '0114788',
|
||||
'1996/ghost_in_the_shell': '0113568',
|
||||
'1996/hate': '0113247',
|
||||
'1996/horseman_on_the_roof': '0113362',
|
||||
'1996/kids_in_the_hall_brain_candy': '0116768',
|
||||
'1996/maybe_maybe_not': '0109255',
|
||||
'1996/prisoner_of_the_mountains': '0116754',
|
||||
'1997/fifth_element_ver1': '0119116',
|
||||
'1997/fools_rush_in_ver1': '0119141',
|
||||
'1997/gi_jane_ver1': '0119173',
|
||||
'1997/happy_together_ver1': '0118845',
|
||||
'1997/lilies': '0116882',
|
||||
'1997/mouth_to_mouth': '0112546',
|
||||
'1997/mr_nice_guy': '0117786',
|
||||
'1997/nenette_and_boni': '0117221',
|
||||
'1997/paperback_romance': '0110405',
|
||||
'1997/second_jungle_book': '0120087',
|
||||
'1997/single_girl': '0113057',
|
||||
'1997/super_speedway': '0120245',
|
||||
'1997/temptress_moon': '0116295',
|
||||
'1998/alarmist': '0119534',
|
||||
'1998/barneys_great_adventure_the_movie': '0120598',
|
||||
'1998/bulworth_ver1': '0118798',
|
||||
'1998/celebration': '0154420',
|
||||
'1998/east_palace_west_palace': '0119007',
|
||||
'1998/hurricane_streets': '0119338',
|
||||
'1998/i_married_a_strange_person': '0119346',
|
||||
'1998/inheritors': '0141824',
|
||||
'1998/killing_time': '0140312',
|
||||
'1998/live_flesh': '0118819',
|
||||
'1998/music_from_another_room': '0119734',
|
||||
'1998/post_coitum_ver1': '0119923',
|
||||
'1998/steam_the_turkish_bath': '0119248',
|
||||
'1998/velocity_of_gary': '0120878',
|
||||
'1999/after_life': '0165078',
|
||||
'1999/emperor_and_the_assassin': '0162866',
|
||||
'1999/fantasia_two_thousand': '0120910',
|
||||
'1999/get_bruce': '0184510',
|
||||
'1999/god_said_ha': '0119207',
|
||||
'1999/jawbreaker': '0155776',
|
||||
'1999/jeanne_and_the_perfect_guy': '0123923',
|
||||
'1999/king_and_i': '0160429',
|
||||
'1999/lovers_of_the_arctic_circle': '0133363',
|
||||
'1999/plunkett_and_macleane': '0134033',
|
||||
'1999/pokemon_the_first_movie': '0190641',
|
||||
'1999/school_of_flesh': '0157208',
|
||||
'1999/splendor': '0127296',
|
||||
'1999/stranger_in_the_kingdom': '0126680',
|
||||
'1999/train_of_life': '0170705',
|
||||
'1999/twice_upon_a_yesterday': '0138590',
|
||||
'1999/whiteboys': '0178988',
|
||||
'1999/wildfire': '0194544',
|
||||
'1999/windhorse': '0169388',
|
||||
'2000/claim': '0218378',
|
||||
'2000/color_of_paradise': '0191043',
|
||||
'2000/criminal_lovers': '0205735',
|
||||
'2000/everlasting_piece': '0218182',
|
||||
'2000/girl_on_the_bridge_ver1': '0144201',
|
||||
'2000/godzilla_two_thousand': '0188640',
|
||||
'2000/goya_in_bordeaux': '0210717',
|
||||
'2000/mad_about_mambo': '0156757',
|
||||
'2000/picking_up_the_pieces': '0192455',
|
||||
'2000/pokemon_the_movie_2000': '0257001',
|
||||
'2000/seven_days_to_live': '0221928',
|
||||
'2000/south_of_heaven_west_of_hell': '0179473',
|
||||
'2000/suzhou_river': '0234837',
|
||||
'2000/time_for_drunken_horses': '0259072',
|
||||
'2000/venus_beauty_institute': '0174330',
|
||||
'2001/circle': '0368646',
|
||||
'2001/devils_backbone': '0256009',
|
||||
'2001/kill_me_later': '0243595',
|
||||
'2001/king_is_dancing': '0244173',
|
||||
'2001/learning_curve': '0219126',
|
||||
'2001/marco_polo__return_to_xanadu_ver1': '0296074',
|
||||
'2001/me_you_them': '0244504',
|
||||
'2001/our_lady_of_the_assassins': '0250809',
|
||||
'2001/pinero': '0261066',
|
||||
'2001/pokemon_three_the_movie_ver1': '0266860',
|
||||
'2001/scratch': '0143861',
|
||||
'2001/vampire_hunter_d_bloodlust_ver1': '0216651',
|
||||
'2002/el_bosque_animado': '0310790',
|
||||
'2002/fifty_first_state': '0227984',
|
||||
'2002/les_destinees': '0216689',
|
||||
'2002/sons_room': '0208990',
|
||||
'2003/open_hearts': '0315543',
|
||||
'2003/tulse_luper_suitcases': '0307596',
|
||||
'2003/valentin': '0296915',
|
||||
'2004/if_only_ver1': '0332136',
|
||||
'2004/wondrous_oblivion': '0334725',
|
||||
'2005/wu_ji': '0417976',
|
||||
'2006/golden_door': '0465188',
|
||||
'2006/kin': '1091189',
|
||||
'2007/revenge_of_the_nerds': '0088000',
|
||||
'2008/bad_batch': '1605644',
|
||||
'2008/mercedes': '1368083',
|
||||
'2008/spirit': '0831887',
|
||||
'2009/dead_air': '0993841',
|
||||
'2009/edge_of_love': '0819714',
|
||||
'2009/fuel': '1072437',
|
||||
'2009/one_good_man': '1239357',
|
||||
'2009/st_trinians': '1210106',
|
||||
'2009/surveillance': '0409345',
|
||||
'2009/taken': '0936501',
|
||||
'2009/vaml': '1610453',
|
||||
'2010/adopting_haiti': '1764164',
|
||||
'2010/afterlife': '0838247',
|
||||
'2010/agora': '1186830',
|
||||
'2010/athlete': '1356996',
|
||||
'2010/beneath_the_blue': '1222698',
|
||||
'2010/bitch_slap': '1212974',
|
||||
'2010/black_waters_of_echos_pond': '0960066',
|
||||
'2010/case_thirty_nine': '0795351',
|
||||
'2010/finite_and_infinite_games': '1772268',
|
||||
'2010/hole': '1085779',
|
||||
'2010/jolene': '0867334',
|
||||
'2010/lake_mungo': '0816556',
|
||||
'2010/last_day_of_summer': '1242544',
|
||||
'2010/leaves_of_grass': '1151359',
|
||||
'2010/life_of_lemon': '1466057',
|
||||
'2010/man_in_the_maze': '1721692',
|
||||
'2010/mr_immortality_the_life_and_times_of_twista': '1711017',
|
||||
'2010/paper_man': '0437405',
|
||||
'2010/perfect_game': '0473102',
|
||||
'2010/red_baron': '0365675',
|
||||
'2010/satin': '0433397',
|
||||
'2010/shutter_island': '1130884',
|
||||
'2010/strange_powers': '1534075',
|
||||
'2010/suicidegirls_must_die': '1584733',
|
||||
'2010/veronika_decides_to_die': '1068678',
|
||||
'2010/witchblade': '0494292',
|
||||
'2010/youth_in_revolt': '0403702',
|
||||
'2011/beastly': '1152398',
|
||||
'2011/burning_palms': '1283887',
|
||||
'2011/cabin_in_the_woods': '1259521',
|
||||
'2011/conan': '0816462',
|
||||
'2011/courageous': '1630036',
|
||||
'2011/cruces_divided_two': '1698645',
|
||||
'2011/green_with_envy': '1204342',
|
||||
'2011/happythankyoumoreplease': '1481572',
|
||||
'2011/homework': '1645080',
|
||||
'2011/i_got_next': '1915570',
|
||||
'2011/lebanon_pa': '1290082',
|
||||
'2011/money_pet': '1965198',
|
||||
'2011/my_suicide': '0492896',
|
||||
'2011/priest': '0822847',
|
||||
'2011/prowl': '1559033',
|
||||
'2011/red_sonja': '0800175',
|
||||
'2011/season_of_the_witch': '0479997',
|
||||
'2011/stay_cool': '1235807',
|
||||
'2011/sympathy_for_delicious': '1270277',
|
||||
'2011/trust': '1529572',
|
||||
'2011/undefeated': '1961604',
|
||||
'2011/vanishing_on_seventh_street': '1452628',
|
||||
'2011/where_is_robert_fisher': '2042712',
|
||||
'2011/yellowbrickroad': '1398428',
|
||||
'2012/haywire': '1506999',
|
||||
'2012/last_call_at_the_oasis': '2043900',
|
||||
}
|
||||
|
||||
if __name__ == '__main__':
    ids = get_ids()
    print(sorted(ids), len(ids))
|
||||
187
Shared/lib/python3.4/site-packages/ox/web/itunes.py
Normal file
@ -0,0 +1,187 @@
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# encoding: utf-8
|
||||
import re
|
||||
import urllib.parse
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox.html import decode_html, strip_tags
|
||||
from ox.text import find_re
|
||||
from ox.text import find_string
|
||||
|
||||
|
||||
# to sniff itunes traffic, use something like
|
||||
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
|
||||
|
||||
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=music&songTerm=&genreIndex=1&flavor=0&mediaType=2&composerTerm=&allArtistNames=Arcadia&ringtone=0&searchButton=submit
|
||||
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=movie&movieTerm=The%20Matrix&descriptionTerm=&ratingIndex=1&mediaType=3&directorProducerName=Andy%20Wachowski&flavor=0&releaseYearTerm=1999&closedCaption=0&actorTerm=&searchButton=submit
|
||||
|
||||
ITUNES_HEADERS = {
|
||||
'X-Apple-Tz': '0',
|
||||
'X-Apple-Storefront': '143441-1',
|
||||
'User-Agent': 'iTunes/7.6.2 (Macintosh; U; Intel Mac OS X 10.5.2)',
|
||||
'Accept-Language': 'en-us, en;q=0.50',
|
||||
'Accept-Encoding': 'gzip',
|
||||
'Connection': 'close',
|
||||
}
|
||||
|
||||
def compose_url(request, parameters):
|
||||
if request == 'advancedSearch':
|
||||
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
|
||||
if parameters['media'] == 'music':
|
||||
url += urllib.parse.urlencode({
|
||||
'albumTerm': parameters['title'],
|
||||
'allArtistNames': parameters['artist'],
|
||||
'composerTerm': '',
|
||||
'flavor': 0,
|
||||
'genreIndex': 1,
|
||||
'media': 'music',
|
||||
'mediaType': 2,
|
||||
'ringtone': 0,
|
||||
'searchButton': 'submit',
|
||||
'songTerm': ''
|
||||
})
|
||||
elif parameters['media'] == 'movie':
|
||||
url += urllib.parse.urlencode({
|
||||
'actorTerm': '',
|
||||
'closedCaption': 0,
|
||||
'descriptionTerm': '',
|
||||
'directorProducerName': parameters['director'],
|
||||
'flavor': 0,
|
||||
'media': 'movie',
|
||||
'mediaType': 3,
|
||||
'movieTerm': parameters['title'],
|
||||
'ratingIndex': 1,
|
||||
'releaseYearTerm': '',
|
||||
'searchButton': 'submit'
|
||||
})
|
||||
elif request == 'viewAlbum':
|
||||
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
|
||||
elif request == 'viewMovie':
|
||||
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
|
||||
return url
|
||||
|
||||
def parse_xml_dict(xml):
|
||||
values = {}
|
||||
strings = xml.split('<key>')
|
||||
for string in strings:
|
||||
if string.find('</key>') != -1:
|
||||
key = find_re(string, '(.*?)</key>')
|
||||
type = find_re(string, '</key><(.*?)>')
|
||||
if type == 'true/':
|
||||
value = True
|
||||
else:
|
||||
value = find_re(string, '<%s>(.*?)</%s>' % (type, type))
|
||||
if type == 'integer':
|
||||
value = int(value)
|
||||
elif type == 'string':
|
||||
value = decode_html(value)
|
||||
values[key] = value
|
||||
return values
|
||||
|
||||
def parse_cast(xml, title):
|
||||
list = []
|
||||
try:
|
||||
strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
|
||||
strings.pop()
|
||||
for string in strings:
|
||||
list.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
return list
|
||||
except:
|
||||
return list
|
||||
|
||||
def parse_movies(xml, title):
|
||||
list = []
|
||||
try:
|
||||
strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
|
||||
strings.pop()
|
||||
for string in strings:
|
||||
list.append({
|
||||
'id': find_re(string, 'viewMovie\?id=(.*?)&'),
|
||||
'title': find_re(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
|
||||
})
|
||||
return list
|
||||
except:
|
||||
return list
|
||||
|
||||
class ItunesAlbum:
|
||||
def __init__(self, id = '', title = '', artist = ''):
|
||||
self.id = id
|
||||
self.title = title
|
||||
self.artist = artist
|
||||
if not id:
|
||||
self.id = self.get_id()
|
||||
|
||||
def get_id(self):
|
||||
url = compose_url('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
|
||||
xml = read_url(url, headers = ITUNES_HEADERS)
|
||||
id = find_re(xml, 'viewAlbum\?id=(.*?)&')
|
||||
return id
|
||||
|
||||
def get_data(self):
|
||||
data = {'id': self.id}
|
||||
url = compose_url('viewAlbum', {'id': self.id})
|
||||
xml = read_url(url, None, ITUNES_HEADERS)
|
||||
data['albumName'] = find_re(xml, '<B>(.*?)</B>')
|
||||
data['artistName'] = find_re(xml, '<b>(.*?)</b>')
|
||||
data['coverUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
|
||||
data['genre'] = find_re(xml, 'Genre:(.*?)<')
|
||||
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
|
||||
data['review'] = strip_tags(find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
data['tracks'] = []
|
||||
strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
|
||||
for string in strings:
|
||||
data['tracks'].append(parse_xml_dict(string))
|
||||
data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
|
||||
return data
|
||||
|
||||
class ItunesMovie:
|
||||
def __init__(self, id = '', title = '', director = ''):
|
||||
self.id = id
|
||||
self.title = title
|
||||
self.director = director
|
||||
if not id:
|
||||
self.id = self.get_id()
|
||||
|
||||
def get_id(self):
|
||||
url = compose_url('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
|
||||
xml = read_url(url, headers = ITUNES_HEADERS)
|
||||
id = find_re(xml, 'viewMovie\?id=(.*?)&')
|
||||
return id
|
||||
|
||||
def get_data(self):
|
||||
data = {'id': self.id}
|
||||
url = compose_url('viewMovie', {'id': self.id})
|
||||
xml = read_url(url, None, ITUNES_HEADERS)
|
||||
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
|
||||
f.write(xml)
|
||||
f.close()
|
||||
data['actors'] = parse_cast(xml, 'actors')
|
||||
string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
|
||||
data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
|
||||
data['directors'] = parse_cast(xml, 'directors')
|
||||
data['format'] = find_re(xml, 'Format:(.*?)<')
|
||||
data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
|
||||
data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
|
||||
data['producers'] = parse_cast(xml, 'producers')
|
||||
data['rated'] = find_re(xml, 'Rated(.*?)<')
|
||||
data['relatedMovies'] = parse_movies(xml, 'related movies')
|
||||
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
|
||||
data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
|
||||
data['screenwriters'] = parse_cast(xml, 'screenwriters')
|
||||
data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
|
||||
data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
|
||||
return data
|
||||
|
||||
if __name__ == '__main__':
    from ox.utils import json
    data = ItunesAlbum(title='So Red the Rose', artist='Arcadia').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))
    data = ItunesMovie(title='The Matrix', director='Wachowski').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))
    for v in data['relatedMovies']:
        data = ItunesMovie(id=v['id']).get_data()
        print(json.dumps(data, sort_keys=True, indent=4))
    data = ItunesMovie(id='272960052').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))
|
||||
|
||||
42
Shared/lib/python3.4/site-packages/ox/web/lookupbyisbn.py
Normal file
@ -0,0 +1,42 @@
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
import re
|
||||
|
||||
base = 'http://www.lookupbyisbn.com'
|
||||
|
||||
def get_data(isbn):
|
||||
r = {}
|
||||
url = '%s/Search/Book/%s/1' % (base, isbn)
|
||||
|
||||
data = read_url(url).decode('utf-8')
|
||||
m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
|
||||
if m:
|
||||
ids = m[0].split('/')
|
||||
r['isbn'] = ids[-2]
|
||||
r['asin'] = ids[-3]
|
||||
url = '%s%s' % (base, m[0])
|
||||
data = read_url(url).decode('utf-8')
|
||||
r["title"] = find_re(data, "<h2>(.*?)</h2>")
|
||||
keys = {
|
||||
'author': 'Author(s)',
|
||||
'publisher': 'Publisher',
|
||||
'date': 'Publication date',
|
||||
'edition': 'Edition',
|
||||
'binding': 'Binding',
|
||||
'volume': 'Volume(s)',
|
||||
'pages': 'Pages',
|
||||
}
|
||||
for key in keys:
|
||||
r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>'% re.escape(keys[key]))
|
||||
if r[key] == '--':
|
||||
r[key] = ''
|
||||
if key == 'pages' and r[key]:
|
||||
r[key] = int(r[key])
|
||||
desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
|
||||
desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
|
||||
r['description'] = strip_tags(desc).strip()
|
||||
if r['description'] == u'Description of this item is not available at this time.':
|
||||
r['description'] = ''
|
||||
r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
|
||||
return r
|
||||
|
||||
21
Shared/lib/python3.4/site-packages/ox/web/lyricsfly.py
Normal file
@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox.cache import read_url
from ox.html import decode_html
from ox.text import find_re


def get_lyrics(title, artist):
    html = read_url('http://lyricsfly.com/api/')
    key = find_re(html, '<font color=green><b>(.*?)</b></font>')
    url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
    xml = read_url(url)
    lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
    lyrics = lyrics.replace('\n', '').replace('\r', '')
    lyrics = lyrics.replace('[br]', '\n').strip()
    lyrics = lyrics.replace('\n\n\n', '\n\n')
    lyrics = decode_html(lyrics.replace('&amp;', '&'))
    return lyrics

if __name__ == '__main__':
    print(get_lyrics('Election Day', 'Arcadia'))
|
||||
63
Shared/lib/python3.4/site-packages/ox/web/metacritic.py
Normal file
@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from urllib.parse import quote
|
||||
from lxml.html import document_fromstring
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
|
||||
def get_url(id=None, imdb=None):
|
||||
if imdb:
|
||||
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
|
||||
data = read_url(url)
|
||||
metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
|
||||
return metacritic_url or None
|
||||
return 'http://www.metacritic.com/movie/%s' % id
|
||||
|
||||
def get_id(url):
|
||||
return url.split('/')[-1]
|
||||
|
||||
def get_show_url(title):
|
||||
title = quote(title)
|
||||
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
|
||||
data = read_url(url)
|
||||
return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
|
||||
|
||||
def get_data(url):
|
||||
data = read_url(url, unicode=True)
|
||||
doc = document_fromstring(data)
|
||||
score = list(filter(lambda s: s.attrib.get('property') == 'v:average',
                    doc.xpath('//span[@class="score_value"]')))
|
||||
if score:
|
||||
score = int(score[0].text)
|
||||
else:
|
||||
score = -1
|
||||
authors = [a.text
|
||||
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')]
|
||||
sources = [d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')]
|
||||
reviews = [d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')]
|
||||
scores = [int(d.text.strip())
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')]
|
||||
urls = [a.attrib['href']
|
||||
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')]
|
||||
|
||||
metacritics = []
|
||||
for i in range(len(authors)):
|
||||
metacritics.append({
|
||||
'critic': authors[i],
|
||||
'url': urls[i],
|
||||
'source': sources[i],
|
||||
'quote': strip_tags(reviews[i]).strip(),
|
||||
'score': scores[i],
|
||||
})
|
||||
|
||||
return {
|
||||
'critics': metacritics,
|
||||
'id': get_id(url),
|
||||
'score': score,
|
||||
'url': url,
|
||||
}
|
||||
|
||||
121
Shared/lib/python3.4/site-packages/ox/web/mininova.py
Normal file
@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from datetime import datetime
|
||||
import re
|
||||
import socket
|
||||
from urllib.parse import quote
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, int_value, normalize_newlines
|
||||
from ox.normalize import normalize_imdbid
|
||||
import ox
|
||||
|
||||
from .torrent import Torrent
|
||||
|
||||
|
||||
def _parse_results_page(data, max_results=10):
|
||||
results=[]
|
||||
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
|
||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||
torrentDate = row[0]
|
||||
torrentExtra = row[1]
|
||||
torrentId = row[2]
|
||||
torrentTitle = decode_html(row[3]).strip()
|
||||
torrentLink = "http://www.mininova.org/tor/" + torrentId
|
||||
privateTracker = 'priv.gif' in torrentExtra
|
||||
if not privateTracker:
|
||||
results.append((torrentTitle, torrentLink, ''))
|
||||
return results
|
||||
|
||||
def find_movie(query=None, imdb=None, max_results=10):
|
||||
'''search for torrents on mininova
|
||||
'''
|
||||
if imdb:
|
||||
url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
|
||||
else:
|
||||
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
|
||||
data = read_url(url, unicode=True)
|
||||
return _parse_results_page(data, max_results)
|
||||
|
||||
def get_id(mininovaId):
|
||||
mininovaId = str(mininovaId)
|
||||
d = find_re(mininovaId, "/(\d+)")
|
||||
if d:
|
||||
return d
|
||||
mininovaId = mininovaId.split('/')
|
||||
if len(mininovaId) == 1:
|
||||
return mininovaId[0]
|
||||
else:
|
||||
return mininovaId[-1]
|
||||
|
||||
def exists(mininovaId):
|
||||
mininovaId = get_id(mininovaId)
|
||||
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
|
||||
if not data or 'Torrent not found...' in data:
|
||||
return False
|
||||
if 'tracker</a> of this torrent requires registration.' in data:
|
||||
return False
|
||||
return True
|
||||
|
||||
def get_data(mininovaId):
|
||||
_key_map = {
|
||||
'by': u'uploader',
|
||||
}
|
||||
mininovaId = get_id(mininovaId)
|
||||
torrent = dict()
|
||||
torrent[u'id'] = mininovaId
|
||||
torrent[u'domain'] = 'mininova.org'
|
||||
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
|
||||
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
|
||||
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
|
||||
|
||||
data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
|
||||
if '<h1>Torrent not found...</h1>' in data:
|
||||
return None
|
||||
|
||||
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
|
||||
key = d[0].lower().strip()
|
||||
key = _key_map.get(key, key)
|
||||
value = decode_html(strip_tags(d[1].strip()))
|
||||
torrent[key] = value
|
||||
|
||||
torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
|
||||
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
|
||||
torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
|
||||
if torrent['description']:
|
||||
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
|
||||
t = read_url(torrent[u'torrent_link'])
|
||||
torrent[u'torrent_info'] = get_torrent_info(t)
|
||||
return torrent
|
||||
|
||||
class Mininova(Torrent):
|
||||
'''
|
||||
>>> Mininova('123')
|
||||
{}
|
||||
>>> Mininova('1072195')['infohash']
|
||||
'72dfa59d2338e4a48c78cec9de25964cddb64104'
|
||||
'''
|
||||
def __init__(self, mininovaId):
|
||||
self.data = get_data(mininovaId)
|
||||
if not self.data:
|
||||
return
|
||||
Torrent.__init__(self)
|
||||
ratio = self.data['share ratio'].split(',')
|
||||
self['seeder'] = -1
|
||||
self['leecher'] = -1
|
||||
if len(ratio) == 2:
|
||||
val = int_value(ratio[0].replace(',','').strip())
|
||||
if val:
|
||||
self['seeder'] = int(val)
|
||||
val = int_value(ratio[1].replace(',','').strip())
|
||||
if val:
|
||||
self['leecher'] = int(val)
|
||||
val = int_value(self.data['downloads'].replace(',','').strip())
|
||||
if val:
|
||||
self['downloaded'] = int(val)
|
||||
else:
|
||||
self['downloaded'] = -1
|
||||
published = self.data['added on']
|
||||
published = published.split(' +')[0]
|
||||
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
|
||||
|
||||
44
Shared/lib/python3.4/site-packages/ox/web/movieposterdb.py
Normal file
@ -0,0 +1,44 @@
# -*- coding: UTF-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
|
||||
import re
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re
|
||||
|
||||
def get_data(id):
|
||||
'''
|
||||
>>> get_data('0060304')['posters'][0]
|
||||
u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
|
||||
>>> get_data('0123456')['posters']
|
||||
[]
|
||||
'''
|
||||
data = {
|
||||
"url": get_url(id)
|
||||
}
|
||||
data["posters"] = get_posters(data["url"])
|
||||
return data
|
||||
|
||||
def get_id(url):
|
||||
return url.split("/")[-2]
|
||||
|
||||
def get_posters(url, group=True, timeout=-1):
|
||||
posters = []
|
||||
html = read_url(url, timeout=timeout, unicode=True)
|
||||
if url in html:
|
||||
if group:
|
||||
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
posters += get_posters(result, False)
|
||||
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
html = read_url(result, timeout=timeout, unicode=True)
|
||||
posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
|
||||
return posters
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.movieposterdb.com/movie/%s/" % id
|
||||
|
||||
if __name__ == '__main__':
    print(get_data('0060304'))
    print(get_data('0133093'))
|
||||
41
Shared/lib/python3.4/site-packages/ox/web/opensubtitles.py
Normal file
@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
import feedparser
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
from ox.iso import langCode2To3, langTo3Code
|
||||
|
||||
def find_subtitles(imdb, parts = 1, language = "eng"):
|
||||
if len(language) == 2:
|
||||
language = langCode2To3(language)
|
||||
elif len(language) != 3:
|
||||
language = langTo3Code(language)
|
||||
url = "http://www.opensubtitles.org/en/search/"
|
||||
if language:
|
||||
url += "sublanguageid-%s/" % language
|
||||
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
|
||||
data = read_url(url)
|
||||
if "title>opensubtitles.com - search results</title" in data:
|
||||
fd = feedparser.parse(data)
|
||||
opensubtitleId = None
|
||||
if fd.entries:
|
||||
link = fd.entries[0]['links'][0]['href']
|
||||
opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
|
||||
if opensubtitleId:
|
||||
opensubtitleId = opensubtitleId[0]
|
||||
else:
|
||||
opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
|
||||
return opensubtitleId
|
||||
|
||||
def download_subtitle(opensubtitle_id):
|
||||
srts = {}
|
||||
data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
|
||||
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
|
||||
for f in re.compile(reg_exp, re.DOTALL).findall(data):
|
||||
name = strip_tags(f[1]).split('\n')[0]
|
||||
url = "http://www.opensubtitles.com%s" % f[0]
|
||||
srts[name] = read_url(url, unicode=True)
|
||||
return srts
|
||||
|
||||
10
Shared/lib/python3.4/site-packages/ox/web/oxdb.py
Normal file
@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import ox.cache


def get_poster_url(id):
    url = "http://0xdb.org/%s/poster.0xdb.jpg" % id
    if ox.cache.exists(url):
        return url
    return ''
|
||||
|
||||
19
Shared/lib/python3.4/site-packages/ox/web/piratecinema.py
Normal file
@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function

import re
from ox.net import read_url


def get_poster_url(id):
    url = 'http://piratecinema.org/posters/'
    html = read_url(url, unicode=True)
    results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
    for result in results:
        if result[1] == id:
            return url + result[0]
    return ''

if __name__ == '__main__':
    print(get_poster_url('0749451'))
|
||||
|
||||
54
Shared/lib/python3.4/site-packages/ox/web/rottentomatoes.py
Normal file
@ -0,0 +1,54 @@
# -*- coding: UTF-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
|
||||
|
||||
def get_url(id=None, imdb=None):
|
||||
#this would also wor but does not cache:
|
||||
'''
|
||||
from urllib2 import urlopen
|
||||
u = urlopen(url)
|
||||
return u.url
|
||||
'''
|
||||
if imdb:
|
||||
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
|
||||
data = read_url(url)
|
||||
if "movie_title" in data:
|
||||
movies = re.compile('(/m/.*?/)').findall(data)
|
||||
if movies:
|
||||
return "http://www.rottentomatoes.com" + movies[0]
|
||||
return None
|
||||
|
||||
def get_og(data, key):
|
||||
return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
|
||||
|
||||
def get_data(url):
|
||||
data = read_url(url)
|
||||
r = {}
|
||||
r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
|
||||
if '(' in r['title']:
|
||||
r['year'] = find_re(r['title'], '\((\d*?)\)')
|
||||
r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
|
||||
r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
|
||||
r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')
|
||||
if not r['summary']:
|
||||
r['summary'] = get_og(data, 'description')
|
||||
|
||||
meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
|
||||
meter = list(filter(lambda m: m[1].isdigit(), meter))
|
||||
if meter:
|
||||
r['tomatometer'] = meter[0][1]
|
||||
r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
|
||||
r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
|
||||
r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
|
||||
poster = get_og(data, 'image')
|
||||
if poster and not 'poster_default.gif' in poster:
|
||||
r['posters'] = [poster]
|
||||
for key in list(r.keys()):
|
||||
if not r[key]:
|
||||
del r[key]
|
||||
return r
|
||||
|
||||
76
Shared/lib/python3.4/site-packages/ox/web/siteparser.py
Normal file
@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from six import string_types
|
||||
|
||||
from ..cache import read_url
|
||||
from .. import decode_html
|
||||
from ..utils import datetime
|
||||
|
||||
|
||||
def cleanup(key, data, data_type):
|
||||
if data:
|
||||
if isinstance(data[0], string_types):
|
||||
#FIXME: some types need strip_tags
|
||||
#data = [strip_tags(decode_html(p)).strip() for p in data]
|
||||
data = [decode_html(p).strip() for p in data]
|
||||
elif isinstance(data[0], list) or isinstance(data[0], tuple):
|
||||
data = [cleanup(key, p, data_type) for p in data]
|
||||
while len(data) == 1 and not isinstance(data, string_types):
|
||||
data = data[0]
|
||||
if data_type == 'list' and isinstance(data, string_types):
|
||||
data = [data, ]
|
||||
elif data_type != 'list':
|
||||
data = ''
|
||||
return data
|
||||
|
||||
class SiteParser(dict):
|
||||
baseUrl = ''
|
||||
regex = {}
|
||||
|
||||
def get_url(self, page):
|
||||
return "%s%s" % (self.baseUrl, page)
|
||||
|
||||
def read_url(self, url, timeout):
|
||||
if not url in self._cache:
|
||||
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
|
||||
return self._cache[url]
|
||||
|
||||
def __init__(self, timeout=-1):
|
||||
self._cache = {}
|
||||
for key in self.regex:
|
||||
url = self.get_url(self.regex[key]['page'])
|
||||
data = self.read_url(url, timeout)
|
||||
if isinstance(self.regex[key]['re'], string_types):
|
||||
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
|
||||
data = cleanup(key, data, self.regex[key]['type'])
|
||||
elif callable(self.regex[key]['re']):
|
||||
data = self.regex[key]['re'](data)
|
||||
else:
|
||||
for r in self.regex[key]['re']:
|
||||
if callable(r):
|
||||
f = r
|
||||
else:
|
||||
f = re.compile(r, re.DOTALL).findall
|
||||
if isinstance(data, string_types):
|
||||
data = f(data)
|
||||
else:
|
||||
data = [f(d) for d in data]
|
||||
data = cleanup(key, data, self.regex[key]['type'])
|
||||
def apply_f(f, data):
|
||||
if data and isinstance(data[0], list):
|
||||
data = [f(d) for d in data]
|
||||
else:
|
||||
data = f(data)
|
||||
return data
|
||||
if self.regex[key]['type'] == 'float' and data:
|
||||
data = apply_f(float, data)
|
||||
elif self.regex[key]['type'] == 'int' and data:
|
||||
data = apply_f(int, data)
|
||||
elif self.regex[key]['type'] == 'date':
|
||||
parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
|
||||
data = apply_f(parse_date, data)
|
||||
if data:
|
||||
self[key] = data
|
||||
|
||||
287
Shared/lib/python3.4/site-packages/ox/web/spiegel.py
Normal file
@ -0,0 +1,287 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from datetime import datetime
|
||||
import re
|
||||
import time
|
||||
|
||||
import ox.cache
|
||||
from ox.html import decode_html, strip_tags
|
||||
import ox.net
|
||||
|
||||
|
||||
def get_news(year, month, day):
|
||||
sections = [
|
||||
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
|
||||
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
|
||||
]
|
||||
dt = datetime(year, month, day)
|
||||
day = int(dt.strftime('%j'))
|
||||
date = dt.strftime('%d.%m.%Y')
|
||||
news = []
|
||||
for section in sections:
|
||||
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
|
||||
if date == time.strftime('%d.%m.%Y', time.localtime()):
|
||||
html = ox.net.read_url(url)
|
||||
else:
|
||||
html = ox.cache.read_url(url)
|
||||
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
|
||||
dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
|
||||
try:
|
||||
description = format_string(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
|
||||
except:
|
||||
description = ''
|
||||
try:
|
||||
imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
|
||||
except:
|
||||
imageUrl = ''
|
||||
try:
|
||||
title = format_string(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
|
||||
except:
|
||||
title = ''
|
||||
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
|
||||
new = {}
|
||||
if len(dateString) == 10:
|
||||
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
|
||||
else:
|
||||
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
|
||||
# fix decode_html
|
||||
# new['description'] = format_string(decode_html(description))
|
||||
new['description'] = format_string(description)
|
||||
new['imageUrl'] = imageUrl
|
||||
new['section'] = format_section(section)
|
||||
new['title'] = format_string(title)
|
||||
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(format_string(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
|
||||
if new['title1'][-1:] == ':':
|
||||
new['title1'] = new['title1'][0:-1]
|
||||
new['title2'] = new['title'][len(new['title1']) + 2:]
|
||||
new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
|
||||
if new['url'][:1] == '/':
|
||||
new['url'] = 'http://www.spiegel.de' + new['url']
|
||||
news.append(new)
|
||||
# print '%s, %s' % (new['section'], dateString)
|
||||
'''
|
||||
elif dateString[:10] == date and not description:
|
||||
print dateString + ' - no description'
|
||||
elif dateString[:10] == date and not imageUrl:
|
||||
print dateString + ' - no image'
|
||||
'''
|
||||
return news
|
||||
|
||||
def split_title(title):
|
||||
title1 = re.compile('(.*?): ').findall(title)[0]
|
||||
title2 = re.compile(': (.*?)$').findall(title)[0]
|
||||
return [title1, title2]
|
||||
|
||||
def format_string(string):
|
||||
string = string.replace('<span class="spOptiBreak"> </span>', '')
|
||||
string = string.replace('\n', ' ').replace(' ', ' ').strip()
|
||||
string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
|
||||
return string
|
||||
|
||||
def format_section(string):
|
||||
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
|
||||
|
||||
def format_subsection(string):
|
||||
# SPIEGEL, SPIEGEL special
|
||||
subsection = {
|
||||
'abi': 'Abi - und dann?',
|
||||
'formel1': 'Formel 1',
|
||||
'jobundberuf': 'Job & Beruf',
|
||||
'leben': 'Leben U21',
|
||||
'mensch': 'Mensch & Technik',
|
||||
'sonst': '',
|
||||
'staedte': u'St\xc3dte',
|
||||
'ussports': 'US-Sports',
|
||||
'wunderbar': 'wunderBAR'
|
||||
}
|
||||
if string in subsection:
|
||||
return subsection[string].replace(u'\xc3', 'ae')
|
||||
return string[:1].upper() + string[1:]
|
||||
|
||||
def get_issue(year, week):
|
||||
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
|
||||
if not ox.net.exists(coverUrl):
|
||||
return None
|
||||
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
|
||||
contents = []
|
||||
data = ox.cache.read_url(url)
|
||||
items = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
|
||||
for item in items:
|
||||
item = item[1]
|
||||
page = int(re.compile('&SE=(.*?)"').findall(item)[0])
|
||||
title = strip_tags(item).strip()
|
||||
contents.append({'title': title, 'page': page})
|
||||
pageUrl = {}
|
||||
pages = page + 2
|
||||
for page in range(1, pages + 10):
|
||||
url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
|
||||
if ox.cache.exists(url):
|
||||
pageUrl[page] = url
|
||||
else:
|
||||
pageUrl[page] = ''
|
||||
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
|
||||
|
||||
|
||||
def archive_issues():
|
||||
'''
|
||||
this is just an example of an archiving application
|
||||
'''
|
||||
p = {}
|
||||
import os
|
||||
from ox.utils import json
|
||||
import time
|
||||
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
|
||||
localtime = time.localtime()
|
||||
year = int(time.strftime('%Y', localtime))
|
||||
week = int(time.strftime('%W', localtime))
|
||||
for y in range(year, 1993, -1):
|
||||
if y == year:
|
||||
wMax = week + 1
|
||||
else:
|
||||
wMax = 53
|
||||
for w in range(wMax, 0, -1):
|
||||
print('get_issue(%d, %d)' % (y, w))
|
||||
issue = get_issue(y, w)
|
||||
if issue:
|
||||
dirname = '%s/%d/%02d' % (archivePath, y, w)
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
|
||||
if not os.path.exists(filename):
|
||||
data = json.dumps(issue, ensure_ascii = False)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
|
||||
if not os.path.exists(filename):
|
||||
data = []
|
||||
for item in issue['contents']:
|
||||
data.append('%3d %s' % (item['page'], item['title']))
|
||||
data = '\n'.join(data)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
|
||||
if not os.path.exists(filename):
|
||||
data = ox.cache.read_url(issue['coverUrl'])
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
for page in issue['pageUrl']:
|
||||
url = issue['pageUrl'][page]
|
||||
if url:
|
||||
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
|
||||
if not os.path.exists(filename):
|
||||
data = ox.cache.read_url(url)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
if not p:
|
||||
p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
|
||||
else:
|
||||
p['num'] += 1
|
||||
p['sum'] += issue['pages']
|
||||
if issue['pages'] < p['min']:
|
||||
p['min'] = issue['pages']
|
||||
if issue['pages'] > p['max']:
|
||||
p['max'] = issue['pages']
|
||||
print(p['min'], p['sum'] / p['num'], p['max'])
|
||||
|
||||
|
||||
def archive_news():
|
||||
'''
|
||||
this is just an example of an archiving application
|
||||
'''
|
||||
import os
|
||||
from ox.utils import json
|
||||
import time
|
||||
|
||||
count = {}
|
||||
colon = []
|
||||
|
||||
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
|
||||
days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
|
||||
localtime = time.localtime()
|
||||
year = int(time.strftime('%Y', localtime))
|
||||
month = int(time.strftime('%m', localtime))
|
||||
day = int(time.strftime('%d', localtime)) - 1
|
||||
for y in range(year, 1999, -1):
|
||||
if y == year:
|
||||
mMax = month
|
||||
else:
|
||||
mMax = 12
|
||||
for m in range(mMax, 0, -1):
|
||||
if y == year and m == month:
|
||||
dMax = day
|
||||
elif m == 2 and y % 4 == 0 and y % 400 != 0:
|
||||
dMax = days[m] + 1
|
||||
else:
|
||||
dMax = days[m]
|
||||
for d in range(dMax, 0, -1):
|
||||
print('get_news(%d, %d, %d)' % (y, m, d))
news = get_news(y, m, d)
|
||||
for new in news:
|
||||
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
if new['url'][-5:] == '.html':
|
||||
filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
|
||||
else:
|
||||
filename = dirname + '/' + new['url'] + '.json'
|
||||
if not os.path.exists(filename) or True:
|
||||
data = json.dumps(new, ensure_ascii = False)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
filename = filename[:-5] + '.txt'
|
||||
if not os.path.exists(filename) or True:
|
||||
data = split_title(new['title'])
|
||||
data.append(new['description'])
|
||||
data = '\n'.join(data)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
|
||||
if not os.path.exists(filename):
|
||||
data = ox.cache.read_url(new['imageUrl'])
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
|
||||
strings = new['url'].split('/')
|
||||
string = strings[3]
|
||||
if len(strings) == 6:
|
||||
string += '/' + strings[4]
|
||||
if string not in count:
|
||||
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
|
||||
else:
|
||||
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
|
||||
strings = split_title(new['title'])
|
||||
if strings[0] != new['title1'] or strings[1] != new['title2']:
|
||||
colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
|
||||
for key in sorted(count):
|
||||
print('%6d %-24s %s' % (count[key]['count'], key, count[key]['string']))
|
||||
for value in colon:
|
||||
print(value)
|
||||
|
||||
if __name__ == '__main__':
|
||||
# spiegel = Spiegel(2008, 8)
|
||||
# print spiegel.getContents()
|
||||
# news = News(2001, 9, 10)
|
||||
# output(news.getNews())
|
||||
'''
|
||||
x = []
|
||||
for d in range(10, 30):
|
||||
print '2/%d' % d
|
||||
news = getNews(2008, 2, d)
|
||||
for new in news:
|
||||
strings = new['url'].split('/')
|
||||
string = format_section(strings[3])
|
||||
if len(strings) == 6:
|
||||
string += '/' + format_subsection(strings[4])
|
||||
if not string in x:
|
||||
x.append(string)
|
||||
print x
|
||||
'''
|
||||
# archive_issues()
|
||||
archive_news()
|
||||
117
Shared/lib/python3.4/site-packages/ox/web/thepiratebay.py
Normal file
@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from datetime import datetime
|
||||
import re
|
||||
import socket
|
||||
from urllib.parse import quote, urlencode
|
||||
from urllib.error import URLError
|
||||
|
||||
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normalize_newlines
|
||||
from ox.normalize import normalize_imdbid
|
||||
import ox
|
||||
|
||||
from .torrent import Torrent
|
||||
|
||||
cache_timeout = 24*60*60 # cache search only for 24 hours
|
||||
|
||||
season_episode = re.compile("S..E..", re.IGNORECASE)
|
||||
|
||||
|
||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||
headers = headers.copy()
|
||||
headers['Cookie'] = 'language=en_EN'
|
||||
return cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||
|
||||
def find_movies(query=None, imdb=None, max_results=10):
|
||||
if imdb:
|
||||
query = "tt" + normalize_imdbid(imdb)
|
||||
results = []
|
||||
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
|
||||
page_count = 1
|
||||
while next and page_count < 4:
|
||||
page_count += 1
|
||||
url = next[0]
|
||||
if not url.startswith('http'):
|
||||
if not url.startswith('/'):
|
||||
url = "/" + url
|
||||
url = "http://thepiratebay.org" + url
|
||||
data = read_url(url, timeout=cache_timeout, unicode=True)
|
||||
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
|
||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||
torrentType = row[0]
|
||||
torrentLink = "http://thepiratebay.org" + row[1]
|
||||
torrentTitle = decode_html(row[2])
|
||||
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
|
||||
if torrentType in ['201']:
|
||||
results.append((torrentTitle, torrentLink, ''))
|
||||
if len(results) >= max_results:
|
||||
return results
|
||||
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
|
||||
return results
|
||||
|
||||
def get_id(piratebayId):
|
||||
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
|
||||
piratebayId = piratebayId.split('org/')[1]
|
||||
d = find_re(piratebayId, "tor/(\d+)")
|
||||
if d:
|
||||
piratebayId = d
|
||||
d = find_re(piratebayId, "torrent/(\d+)")
|
||||
if d:
|
||||
piratebayId = d
|
||||
return piratebayId
|
||||
|
||||
def exists(piratebayId):
|
||||
piratebayId = get_id(piratebayId)
|
||||
return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
|
||||
|
||||
def get_data(piratebayId):
|
||||
_key_map = {
|
||||
'spoken language(s)': u'language',
|
||||
'texted language(s)': u'subtitle language',
|
||||
'by': u'uploader',
|
||||
'leechers': 'leecher',
|
||||
'seeders': 'seeder',
|
||||
}
|
||||
piratebayId = get_id(piratebayId)
|
||||
torrent = dict()
|
||||
torrent[u'id'] = piratebayId
|
||||
torrent[u'domain'] = 'thepiratebay.org'
|
||||
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
|
||||
|
||||
data = read_url(torrent['comment_link'], unicode=True)
|
||||
torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
|
||||
if not torrent[u'title']:
|
||||
return None
|
||||
torrent[u'title'] = decode_html(torrent[u'title']).strip()
|
||||
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
|
||||
title = quote(torrent['title'].encode('utf-8'))
|
||||
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
|
||||
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
|
||||
key = d[0].lower().strip()
|
||||
key = _key_map.get(key, key)
|
||||
value = decode_html(strip_tags(d[1].strip()))
|
||||
torrent[key] = value
|
||||
torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
|
||||
if torrent[u'description']:
|
||||
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
|
||||
t = read_url(torrent[u'torrent_link'])
|
||||
torrent[u'torrent_info'] = get_torrent_info(t)
|
||||
return torrent
|
||||
|
||||
class Thepiratebay(Torrent):
|
||||
'''
|
||||
>>> Thepiratebay('123')
|
||||
{}
|
||||
|
||||
>>> Thepiratebay('3951349')['infohash']
|
||||
'4e84415d36ed7b54066160c05a0b0f061898d12b'
|
||||
'''
|
||||
def __init__(self, piratebayId):
|
||||
self.data = get_data(piratebayId)
|
||||
if not self.data:
|
||||
return
|
||||
Torrent.__init__(self)
|
||||
published = self.data['uploaded']
|
||||
published = published.replace(' GMT', '').split(' +')[0]
|
||||
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
|
||||
|
||||
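A minimal usage sketch for the module above (assuming it is importable as ox.web.thepiratebay; the results depend on the live site still matching the regexps, so treat all values as illustrative):

from ox.web import thepiratebay

# search by IMDb id; find_movies returns (title, link, '') tuples
for title, link, _ in thepiratebay.find_movies(imdb='0133093', max_results=5):
    print(title, link)

# wrap a single torrent page; the id is the one used in the doctest above
t = thepiratebay.Thepiratebay('3951349')
print(t.get('infohash'), t.get('published'))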
37
Shared/lib/python3.4/site-packages/ox/web/torrent.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from ox import int_value
|
||||
|
||||
|
||||
class Torrent(dict):
|
||||
'''
|
||||
>>> Torrent()
|
||||
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
|
||||
'''
|
||||
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
|
||||
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
|
||||
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
|
||||
_dict_keys = ('torrent_info', )
|
||||
_list_keys = ()
|
||||
data = {'torrent_info': {}}
|
||||
|
||||
def __init__(self):
|
||||
for key in self._string_keys:
|
||||
self[key] = self.data.get(key, u'')
|
||||
for key in self._dict_keys:
|
||||
self[key] = self.data.get(key, {})
|
||||
for key in self._list_keys:
|
||||
self[key] = self.data.get(key, [])
|
||||
for key in self._int_keys:
|
||||
value = self.data.get(key, -1)
|
||||
if not isinstance(value, int):
|
||||
value = int(int_value(value))
|
||||
self[key] = value
|
||||
self['infohash'] = self.data['torrent_info'].get('hash', '')
|
||||
self['size'] = self.data['torrent_info'].get('size', -1)
|
||||
self['announce'] = self.data['torrent_info'].get('announce', '')
|
||||
if 'files' in self.data['torrent_info']:
|
||||
self['files'] = len(self.data['torrent_info']['files'])
|
||||
else:
|
||||
self['files'] = 1
|
||||
|
||||
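A sketch of how a site-specific wrapper is expected to use this base class, mirroring the Thepiratebay pattern above (the data values here are made up for illustration):

from ox.web.torrent import Torrent

class ExampleTorrent(Torrent):
    def __init__(self):
        # a real subclass would scrape these values from a torrent page
        self.data = {
            'title': u'Example release',
            'seeder': '42',
            'torrent_info': {'hash': 'abc123', 'size': 734003200, 'announce': 'udp://tracker.example/announce'},
        }
        Torrent.__init__(self)

t = ExampleTorrent()
print(t['title'], t['seeder'], t['infohash'], t['files'])   # Example release 42 abc123 1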
32
Shared/lib/python3.4/site-packages/ox/web/tv.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
import time
|
||||
|
||||
from ox import strip_tags, find_re
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def get_episode_data(url):
|
||||
'''
|
||||
parses information on tv.com episode pages
|
||||
returns dict with title, show, description, score
|
||||
example:
|
||||
get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
|
||||
'''
|
||||
data = read_url(url, unicode=True)
|
||||
r = {}
|
||||
r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
|
||||
r['show'] = find_re(data, '<h1>(.*?)</h1>')
|
||||
r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
|
||||
#episode score
|
||||
r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
|
||||
|
||||
match = re.compile('Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?)  ').findall(data)
|
||||
if match:
|
||||
r['season'] = int(match[0][1])
|
||||
r['episode'] = int(match[0][0])
|
||||
#'Wednesday September 29, 2004' -> 2004-09-29
|
||||
r['air date'] = time.strftime('%Y-%m-%d', time.strptime(match[0][2], '%A %B %d, %Y'))
|
||||
return r
|
||||
|
||||
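A usage sketch for get_episode_data, reusing the URL from its docstring (tv.com has been redesigned since this scraper was written, so the regexps may no longer match):

from ox.web import tv

episode = tv.get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
print(episode.get('show'), episode.get('title'))
print(episode.get('season'), episode.get('episode'), episode.get('air date'))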
35
Shared/lib/python3.4/site-packages/ox/web/twitter.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from datetime import datetime
|
||||
from urllib.parse import quote
|
||||
|
||||
import lxml.html
|
||||
import ox
|
||||
from ox.cache import read_url
|
||||
|
||||
def find(query=None, user=None, timeout=60):
|
||||
if user:
|
||||
url = 'https://twitter.com/' + quote(user)
|
||||
else:
|
||||
url = 'https://twitter.com/search/' + quote(query)
|
||||
data = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
|
||||
doc = lxml.html.document_fromstring(data)
|
||||
tweets = []
|
||||
for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
|
||||
t = lxml.html.tostring(e)
|
||||
text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
|
||||
html = lxml.html.tostring(text, encoding='unicode').strip()
|
||||
text = ox.decode_html(ox.strip_tags(html)).strip()
|
||||
user = re.compile('data-name="(.*?)"').findall(t)[0]
|
||||
user = ox.decode_html(ox.strip_tags(user)).strip()
|
||||
tweets.append({
|
||||
'id': re.compile('data-tweet-id="(\d+)"').findall(t)[0],
|
||||
'user-id': re.compile('data-user-id="(\d+)"').findall(t)[0],
|
||||
'name': re.compile('data-screen-name="(.*?)"').findall(t)[0],
|
||||
'time': datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])),
|
||||
'user': user,
|
||||
'text': text,
|
||||
'html': html,
|
||||
})
|
||||
return tweets
|
||||
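A usage sketch for the scraper above (it parses twitter.com HTML directly, so it breaks whenever the markup changes; the query and user values are arbitrary examples):

from ox.web import twitter

for tweet in twitter.find(query='ubuweb'):
    print(tweet['time'], tweet['name'], tweet['text'])

for tweet in twitter.find(user='ubuweb'):
    print(tweet['id'], tweet['user'], tweet['text'])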
99
Shared/lib/python3.4/site-packages/ox/web/ubu.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from ox import find_re, strip_tags, decode_html
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def get_id(url):
|
||||
return url.replace('http://www.ubu.com/', '').split('.html')[0]
|
||||
|
||||
def get_url(id):
|
||||
return 'http://www.ubu.com/%s.html' % id
|
||||
|
||||
def get_data(url):
|
||||
if not url.startswith('http:'):
|
||||
url = get_url(url)
|
||||
data = read_url(url, unicode=True)
|
||||
m = {
|
||||
'id': get_id(url),
|
||||
'url': url,
|
||||
'type': re.compile('ubu.com/(.*?)/').findall(url)[0]
|
||||
}
|
||||
for videourl, title in re.compile('<a href="(http://ubumexico.centro.org.mx/.*?)">(.*?)</a>').findall(data):
|
||||
if videourl.endswith('.srt'):
|
||||
m['srt'] = videourl
|
||||
elif not 'video' in m:
|
||||
m['video'] = videourl
|
||||
m['video'] = m['video'].replace('/video/ ', '/video/').replace(' ', '%20')
|
||||
if m['video'] == 'http://ubumexico.centro.org.mx/video/':
|
||||
del m['video']
|
||||
m['title'] = strip_tags(decode_html(title)).strip()
|
||||
if not 'url' in m:
|
||||
print(url, 'missing')
|
||||
if 'title' in m:
|
||||
m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])
|
||||
|
||||
match = re.compile("flashvars','file=(.*?.flv)'").findall(data)
|
||||
if match:
|
||||
m['flv'] = match[0]
|
||||
m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20')
|
||||
|
||||
y = re.compile('\((\d{4})\)').findall(data)
|
||||
if y:
|
||||
m['year'] = int(y[0])
|
||||
d = re.compile('Director: (.+)').findall(data)
|
||||
if d:
|
||||
m['director'] = strip_tags(decode_html(d[0])).strip()
|
||||
|
||||
a = re.compile('<a href="(.*?)">Back to (.*?)</a>', re.DOTALL).findall(data)
|
||||
if a:
|
||||
m['artist'] = strip_tags(decode_html(a[0][1])).strip()
|
||||
else:
|
||||
a = re.compile('<a href="(.*?)">(.*?) in UbuWeb Film').findall(data)
|
||||
if a:
|
||||
m['artist'] = strip_tags(decode_html(a[0][1])).strip()
|
||||
else:
|
||||
a = re.compile('<b>(.*?)\(b\..*?\d{4}\)').findall(data)
|
||||
if a:
|
||||
m['artist'] = strip_tags(decode_html(a[0])).strip()
|
||||
elif m['id'] == 'film/lawder_color':
|
||||
m['artist'] = 'Standish Lawder'
|
||||
if 'artist' in m:
|
||||
m['artist'] = m['artist'].replace('in UbuWeb Film', '')
|
||||
m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()
|
||||
if m['id'] == 'film/coulibeuf':
|
||||
m['title'] = 'Balkan Baroque'
|
||||
m['year'] = 1999
|
||||
return m
|
||||
|
||||
def get_films():
|
||||
ids = get_ids()
|
||||
films = []
|
||||
for id in ids:
|
||||
info = get_data(id)
|
||||
if info['type'] == 'film' and ('flv' in info or 'video' in info):
|
||||
films.append(info)
|
||||
return films
|
||||
|
||||
def get_ids():
|
||||
data = read_url('http://www.ubu.com/film/')
|
||||
ids = []
|
||||
author_urls = []
|
||||
for url, author in re.compile('<a href="(\./.*?)">(.*?)</a>').findall(data):
|
||||
url = 'http://www.ubu.com/film' + url[1:]
|
||||
data = read_url(url)
|
||||
author_urls.append(url)
|
||||
for u, title in re.compile('<a href="(.*?)">(.*?)</a>').findall(data):
|
||||
if not u.startswith('http'):
|
||||
if u == '../../sound/burroughs.html':
|
||||
u = 'http://www.ubu.com/sound/burroughs.html'
|
||||
elif u.startswith('../'):
|
||||
u = 'http://www.ubu.com/' + u[3:]
|
||||
else:
|
||||
u = 'http://www.ubu.com/film/' + u
|
||||
if u not in author_urls and u.endswith('.html'):
|
||||
ids.append(u)
|
||||
ids = [get_id(url) for url in list(set(ids))]
|
||||
return ids
|
||||
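A usage sketch for the UbuWeb scraper above (get_ids() crawls every author page on ubu.com/film, so it is slow and network-bound; the id below is one handled explicitly in get_data):

from ox.web import ubu

item = ubu.get_data('film/coulibeuf')
print(item.get('title'), item.get('year'), item.get('artist'))

# sample a few ids instead of crawling everything
films = [ubu.get_data(i) for i in ubu.get_ids()[:3]]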
27
Shared/lib/python3.4/site-packages/ox/web/vimeo.py
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from io import BytesIO
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_string, find_re
|
||||
|
||||
|
||||
def get_data(id):
|
||||
url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
|
||||
xml = read_url(url)
|
||||
tree = ET.parse(BytesIO(xml))
|
||||
request_signature = tree.find('request_signature').text
|
||||
request_signature_expires = tree.find('request_signature_expires').text
|
||||
|
||||
data = {}
|
||||
video_url = "http://www.vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=" % \
|
||||
(id, request_signature, request_signature_expires)
|
||||
data['video_sd'] = video_url + 'sd'
|
||||
data['video_hd'] = video_url + 'hd'
|
||||
video = tree.find('video')
|
||||
for key in ('caption', 'width', 'height', 'duration', 'thumbnail'):
|
||||
data[key] = video.find(key).text
|
||||
return data
|
||||
|
||||
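A usage sketch for get_data above (the moogaloop endpoint is a legacy Vimeo interface and may no longer respond; the clip id is an arbitrary example):

from ox.web import vimeo

clip = vimeo.get_data('123456')
print(clip['caption'], clip['duration'], clip['width'], clip['height'])
print(clip['video_sd'])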
156
Shared/lib/python3.4/site-packages/ox/web/wikipedia.py
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
|
||||
from six.moves import urllib
|
||||
|
||||
from ox.utils import json
|
||||
from ox.cache import read_url
|
||||
from ox import find_re
|
||||
|
||||
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def get_url(id=None, imdb=None, allmovie=None):
|
||||
if imdb:
|
||||
query = '"%s"'% imdb
|
||||
result = find(query)
|
||||
if result:
|
||||
url = result[0][1]
|
||||
data = get_movie_data(url)
|
||||
if 'imdb_id' in data:
|
||||
return url
|
||||
return ""
|
||||
if allmovie:
|
||||
query = '"amg_id = 1:%s"'% allmovie
|
||||
result = find(query)
|
||||
if result:
|
||||
url = result[0][1]
|
||||
return url
|
||||
return ''
|
||||
return "http://en.wikipedia.org/wiki/%s" % id
|
||||
|
||||
def get_movie_id(title, director='', year=''):
|
||||
query = '"%s" film %s %s' % (title, director, year)
|
||||
result = find(query, 1)
|
||||
if result:
|
||||
return result[0][1]
|
||||
return ''
|
||||
|
||||
def get_wiki_data(wikipedia_url):
|
||||
url = wikipedia_url.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
|
||||
url = "%s&action=raw" % url
|
||||
data = read_url(url).decode('utf-8')
|
||||
return data
|
||||
|
||||
def get_movie_data(wikipedia_url):
|
||||
if not wikipedia_url.startswith('http'):
|
||||
wikipedia_url = get_url(wikipedia_url)
|
||||
data = get_wiki_data(wikipedia_url)
|
||||
filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
|
||||
filmbox = {}
|
||||
_box = filmbox_data.strip().split('|')
|
||||
for row in _box:
|
||||
d = row.split('=')
|
||||
if len(d) == 2:
|
||||
_key = d[0].strip()
|
||||
if _key:
|
||||
key = _key
|
||||
if key[0] == '|':
|
||||
key = key[1:]
|
||||
key = key.strip()
|
||||
value = d[1].strip()
|
||||
value = value.replace('<!-- see WP:ALT -->', '')
|
||||
if '<br>' in value:
|
||||
value = value.split('<br>')
|
||||
if value:
|
||||
if key in filmbox:
|
||||
if isinstance(value, list) and isinstance(filmbox[key], str):
|
||||
filmbox[key] = [filmbox[key]] + value
|
||||
else:
|
||||
filmbox[key] += value
|
||||
if isinstance(filmbox[key], list):
|
||||
filmbox[key] = [k for k in filmbox[key] if k]
|
||||
else:
|
||||
filmbox[key] = value
|
||||
if not filmbox_data:
|
||||
return filmbox
|
||||
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
|
||||
del filmbox['amg_id']
|
||||
if 'Allmovie movie' in data:
|
||||
filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
|
||||
elif 'Allmovie title' in data:
|
||||
filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
|
||||
|
||||
if 'Official website' in data:
|
||||
filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
|
||||
|
||||
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
|
||||
if r:
|
||||
filmbox['imdb_id'] = r[0]
|
||||
else:
|
||||
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
|
||||
if r:
|
||||
filmbox['imdb_id'] = r[0]
|
||||
|
||||
r = re.compile('{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
|
||||
if r:
|
||||
filmbox['archiveorg_id'] = r[0]
|
||||
|
||||
r = re.compile('{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
|
||||
if r:
|
||||
filmbox['mojo_id'] = r[0].replace('id=', '')
|
||||
|
||||
r = re.compile('{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
|
||||
if r:
|
||||
filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
|
||||
if 'google video' in data:
|
||||
filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
|
||||
if 'DEFAULTSORT' in data:
|
||||
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
|
||||
return filmbox
|
||||
|
||||
def get_image_url(name):
|
||||
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
|
||||
data = read_url(url)
|
||||
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
|
||||
if not url:
|
||||
url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
|
||||
if url:
|
||||
url = 'http:' + url
|
||||
return url
|
||||
|
||||
def get_poster_url(wikipedia_url):
|
||||
if not wikipedia_url.startswith('http'): wikipedia_url = get_url(wikipedia_url)
|
||||
data = get_movie_data(wikipedia_url)
|
||||
if 'image' in data:
|
||||
return get_image_url(data['image'])
|
||||
return ''
|
||||
|
||||
def get_movie_poster(wikipedia_url):
|
||||
# deprecated, use get_poster_url()
|
||||
return get_poster_url(wikipedia_url)
|
||||
|
||||
def get_allmovie_id(wikipedia_url):
|
||||
data = get_movie_data(wikipedia_url)
|
||||
return data.get('amg_id', '')
|
||||
|
||||
def find(query, max_results=10):
|
||||
query = {'action': 'query', 'list':'search', 'format': 'json',
|
||||
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
|
||||
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
|
||||
data = read_url(url)
|
||||
if not data:
|
||||
data = read_url(url, timeout=0)
|
||||
result = json.loads(data.decode('utf-8'))
|
||||
results = []
|
||||
if result and 'query' in result:
|
||||
for r in result['query']['search']:
|
||||
title = r['title']
|
||||
url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
|
||||
results.append((title, url, ''))
|
||||
return results
|
||||
|
||||
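A usage sketch for the functions above: find() goes through the MediaWiki search API, get_movie_data() parses the film infobox of an article (the titles used here are arbitrary examples, and infobox keys vary by article):

from ox.web import wikipedia

for title, url, _ in wikipedia.find('Alphaville 1965 film', max_results=3):
    print(title, url)

data = wikipedia.get_movie_data('http://en.wikipedia.org/wiki/Alphaville_(film)')
print(data.get('imdb_id'), data.get('director'), data.get('title_sort'))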
217
Shared/lib/python3.4/site-packages/ox/web/youtube.py
Normal file
|
|
@ -0,0 +1,217 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from urllib.parse import quote, unquote_plus
|
||||
import urllib.request as urllib2
|
||||
import http.cookiejar as cookielib
|
||||
import re
|
||||
from xml.dom.minidom import parseString
|
||||
import json
|
||||
|
||||
import feedparser
|
||||
import ox
|
||||
from ox.cache import read_url, cache_timeout
|
||||
|
||||
|
||||
def get_id(url):
|
||||
match = re.compile('v=(.+?)($|&)').findall(url)
|
||||
if match:
|
||||
return match[0][0]
|
||||
|
||||
def get_url(id):
|
||||
return 'http://www.youtube.com/watch?v=%s' % id
|
||||
|
||||
def video_url(youtubeId, format='mp4', timeout=cache_timeout):
|
||||
"""
|
||||
youtubeId - id of the video
|
||||
format - video format, options: webm, 1080p, 720p, mp4, high
|
||||
"""
|
||||
fmt = None
|
||||
if format == '4k':
|
||||
fmt=38
|
||||
elif format == '1080p':
|
||||
fmt=37
|
||||
elif format == '720p':
|
||||
fmt=22
|
||||
elif format == 'mp4':
|
||||
fmt=18
|
||||
elif format == 'high':
|
||||
fmt=35
|
||||
elif format == 'webm':
|
||||
streams = videos(youtubeId, 'webm')
|
||||
return streams[max(streams.keys())]['url']
|
||||
|
||||
streams = videos(youtubeId)
|
||||
if str(fmt) in streams:
|
||||
return streams[str(fmt)]['url']
|
||||
|
||||
def get_video_info(id):
|
||||
eurl = get_url(id)
|
||||
data = read_url(eurl)
|
||||
t = re.compile('\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
|
||||
if t:
|
||||
t = t[0]
|
||||
else:
|
||||
raise IOError
|
||||
url = "http://www.youtube.com/get_video_info?&video_id=%s&el=$el&ps=default&eurl=%s&hl=en_US&t=%s" % (id, quote(eurl), quote(t))
|
||||
data = read_url(url)
|
||||
info = {}
|
||||
for part in data.split('&'):
|
||||
key, value = part.split('=')
|
||||
info[key] = unquote_plus(value).replace('+', ' ')
|
||||
return info
|
||||
|
||||
def find(query, max_results=10, offset=1, orderBy='relevance'):
|
||||
query = quote(query)
|
||||
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
|
||||
data = read_url(url)
|
||||
fd = feedparser.parse(data)
|
||||
videos = []
|
||||
for item in fd.entries:
|
||||
id = item['id'].split('/')[-1]
|
||||
title = item['title']
|
||||
description = item['description']
|
||||
videos.append((title, id, description))
|
||||
if len(videos) >= max_results:
|
||||
return videos
|
||||
return videos
|
||||
|
||||
def info(id, timeout=cache_timeout):
|
||||
info = {}
|
||||
if id.startswith('http'):
|
||||
id = get_id(id)
|
||||
if not id:
|
||||
return info
|
||||
url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
|
||||
data = read_url(url, timeout=timeout)
|
||||
xml = parseString(data)
|
||||
info['id'] = id
|
||||
info['url'] = get_url(id)
|
||||
info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
|
||||
info['description'] = xml.getElementsByTagName('media:description')[0].firstChild.data
|
||||
info['date'] = xml.getElementsByTagName('published')[0].firstChild.data.split('T')[0]
|
||||
info['author'] = "http://www.youtube.com/user/%s"%xml.getElementsByTagName('name')[0].firstChild.data
|
||||
|
||||
info['categories'] = []
|
||||
for cat in xml.getElementsByTagName('media:category'):
|
||||
info['categories'].append(cat.firstChild.data)
|
||||
|
||||
k = xml.getElementsByTagName('media:keywords')[0].firstChild
|
||||
if k:
|
||||
info['keywords'] = k.data.split(', ')
|
||||
data = read_url(info['url'], timeout=timeout)
|
||||
match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
|
||||
if match:
|
||||
info['license'] = match[0].strip()
|
||||
info['license'] = re.sub('<.+?>', '', info['license']).strip()
|
||||
|
||||
url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
|
||||
data = read_url(url, timeout=timeout)
|
||||
xml = parseString(data)
|
||||
languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
|
||||
if languages:
|
||||
info['subtitles'] = {}
|
||||
for language in languages:
|
||||
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
|
||||
data = read_url(url, timeout=timeout)
|
||||
xml = parseString(data)
|
||||
subs = []
|
||||
for t in xml.getElementsByTagName('text'):
|
||||
start = float(t.getAttribute('start'))
|
||||
duration = t.getAttribute('dur')
|
||||
if not duration:
|
||||
duration = '2'
|
||||
end = start + float(duration)
|
||||
if t.firstChild:
|
||||
text = t.firstChild.data
|
||||
subs.append({
|
||||
'in': start,
|
||||
'out': end,
|
||||
'value': ox.decode_html(text),
|
||||
})
|
||||
info['subtitles'][language] = subs
|
||||
return info
|
||||
|
||||
def videos(id, format=''):
|
||||
stream_type = {
|
||||
'flv': 'video/x-flv',
|
||||
'webm': 'video/webm',
|
||||
'mp4': 'video/mp4'
|
||||
}.get(format)
|
||||
info = get_video_info(id)
|
||||
stream_map = info['url_encoded_fmt_stream_map']
|
||||
streams = {}
|
||||
for x in stream_map.split(','):
|
||||
stream = {}
|
||||
#for s in x.split('\\u0026'):
|
||||
for s in x.split('&'):
|
||||
key, value = s.split('=')
|
||||
value = unquote_plus(value)
|
||||
stream[key] = value
|
||||
if 'url' in stream and 'sig' in stream:
|
||||
stream['url'] = '%s&signature=%s' % (stream['url'], stream['sig'])
|
||||
if not stream_type or stream['type'].startswith(stream_type):
|
||||
streams[stream['itag']] = stream
|
||||
return streams
|
||||
|
||||
def playlist(url):
|
||||
data = read_url(url)
|
||||
items = []
|
||||
for i in list(set(re.compile('<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
|
||||
items.append({
|
||||
'title': i[1],
|
||||
'url': 'http://www.youtube.com' + i[0].split('&')[0]
|
||||
})
|
||||
return items
|
||||
|
||||
def download_webm(id, filename):
|
||||
stream_type = 'video/webm'
|
||||
url = "http://www.youtube.com/watch?v=%s" % id
|
||||
cj = cookielib.CookieJar()
|
||||
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
|
||||
opener.addheaders = [
|
||||
('User-Agent',
|
||||
'Mozilla/5.0 (X11; Linux i686; rv:2.0) Gecko/20100101 Firefox/4.0'),
|
||||
('Accept-Language', 'en-us, en;q=0.50')
|
||||
]
|
||||
u = opener.open(url)
|
||||
data = u.read().decode('utf-8', 'replace')
|
||||
u.close()
|
||||
match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data)
|
||||
streams = {}
|
||||
for x in match[0].split(','):
|
||||
stream = {}
|
||||
for s in x.split('\\u0026'):
|
||||
key, value = s.split('=')
|
||||
value = unquote_plus(value)
|
||||
stream[key] = value
|
||||
if stream['type'].startswith(stream_type):
|
||||
streams[stream['itag']] = stream
|
||||
if streams:
|
||||
s = max(streams.keys())
|
||||
url = streams[s]['url']
|
||||
if 'sig' in streams[s]:
|
||||
url += '&signature=' + streams[s]['sig']
|
||||
else:
|
||||
return None
|
||||
|
||||
#download video and save to file.
|
||||
u = opener.open(url)
|
||||
f = open(filename, 'wb')
|
||||
data = True
|
||||
while data:
|
||||
data = u.read(4096)
|
||||
f.write(data)
|
||||
f.close()
|
||||
u.close()
|
||||
return filename
|
||||
|
||||
def get_config(id):
|
||||
if id.startswith('http'):
|
||||
url = id
|
||||
else:
|
||||
url = get_url(id)
|
||||
data = read_url(url)
|
||||
match = re.compile('ytplayer.config = (.*?);<').findall(data)
|
||||
if match:
|
||||
config = json.loads(match[0])
|
||||
return config
|
||||
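A usage sketch for the module above (the gdata feeds and get_video_info endpoints it relies on have since been retired by YouTube, so this only documents the intended call pattern; the video id is an arbitrary example):

from ox.web import youtube

vid = youtube.get_id('http://www.youtube.com/watch?v=9bZkp7q19f0')
meta = youtube.info(vid)
print(meta.get('title'), meta.get('date'), meta.get('author'))

url = youtube.video_url(vid, format='720p')
print(url)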