run update
This commit is contained in:
parent 11af4540c5
commit 6806bebb7c
607 changed files with 52543 additions and 31832 deletions
@@ -2,6 +2,7 @@ from __future__ import print_function
import json
import re

from six import text_type
from ox.cache import read_url

HEADERS = {
@@ -16,9 +17,9 @@ USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'

def get_movie_data(title, director):
if isinstance(title, unicode):
if isinstance(title, text_type):
title = title.encode('utf-8')
if isinstance(director, unicode):
if isinstance(director, text_type):
director = director.encode('utf-8')
data = {}
# itunes section (preferred source for link)
@@ -45,7 +46,7 @@ def get_movie_data(title, director):
results = js['results']
if results:
url = host + results[0]['location']
if not 'link' in data:
if 'link' not in data:
data['link'] = url
headers = {
'User-Agent': USER_AGENT
@@ -17,7 +17,7 @@ def get(key):
if key in auth:
return auth[key]
print("please add key %s to json file '%s'" % (key, user_auth))
raise Exception,"no key %s found" % key
raise Exception("no key %s found" % key)

def update(key, value):
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
@@ -31,4 +31,3 @@ def update(key, value):
f = open(user_auth, "w")
f.write(json.dumps(auth, indent=2))
f.close()
@@ -8,13 +8,13 @@ from ox.cache import read_url
|
|||
from ox.html import strip_tags, decode_html
|
||||
from ox.text import find_re
|
||||
|
||||
import imdb
|
||||
from . import imdb
|
||||
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.criterion.com/films/%s" % id
|
||||
return "https://www.criterion.com/films/%s" % id
|
||||
|
||||
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
||||
'''
|
||||
|
|
@@ -28,23 +28,34 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
|||
u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
|
||||
'''
|
||||
data = {
|
||||
"id": id,
|
||||
"url": get_url(id)
|
||||
}
|
||||
try:
|
||||
html = read_url(data["url"], timeout=timeout, unicode=True)
|
||||
except:
|
||||
html = ox.cache.read_url(data["url"], timeout=timeout)
|
||||
data["number"] = find_re(html, "<li>Spine #(\d+)")
|
||||
html = read_url(data["url"], timeout=timeout).decode('utf-8', 'ignore')
|
||||
|
||||
data["title"] = decode_html(find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>"))
|
||||
data["number"] = find_re(html, "<b>Spine #(\d+)")
|
||||
|
||||
data["title"] = decode_html(find_re(html, "<h1 class=\"header__primarytitle\".*?>(.*?)</h1>"))
|
||||
data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip()
|
||||
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
|
||||
results = find_re(html, '<div class="left_column">(.*?)</div>')
|
||||
results = re.compile("<li>(.*?)</li>").findall(results)
|
||||
data["country"] = results[0]
|
||||
data["year"] = results[1]
|
||||
results = find_re(html, '<ul class="film-meta-list">(.*?)</ul>')
|
||||
info = re.compile('<li itemprop="(.*?)".*?>(.*?)</li>', re.DOTALL).findall(results)
|
||||
info = {k: strip_tags(v).strip() for k, v in info}
|
||||
if 'director' in info:
|
||||
data['director'] = info['director']
|
||||
if 'countryOfOrigin' in info:
|
||||
data['country'] = [c.strip() for c in decode_html(info['countryOfOrigin']).split(', ')]
|
||||
if 'inLanguage' in info:
|
||||
data['language'] = [l.strip() for l in decode_html(info['inLanguage']).split(', ')]
|
||||
for v in re.compile('<li>(.*?)</li>', re.DOTALL).findall(results):
|
||||
if 'datePublished' in v:
|
||||
data['year'] = strip_tags(v).strip()
|
||||
elif 'duration' in v:
|
||||
data['duration'] = strip_tags(v).strip()
|
||||
data["synopsis"] = decode_html(strip_tags(find_re(html,
|
||||
"<div class=\"content_block last\">.*?<p>(.*?)</p>")))
|
||||
"<div class=\"product-summary\".*?>.*?<p>(.*?)</p>")))
|
||||
|
||||
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
|
||||
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
|
||||
|
|
@@ -56,47 +67,46 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
|||
data["posters"] = [result]
|
||||
else:
|
||||
html_ = read_url(result, unicode=True)
|
||||
result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
|
||||
result = find_re(html_, '//www.criterion.com/films/%s.*?">(.*?)</a>' % id)
|
||||
result = find_re(result, "src=\"(.*?)\"")
|
||||
if result:
|
||||
data["posters"] = [result.replace("_w100", "")]
|
||||
else:
|
||||
data["posters"] = []
|
||||
data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
|
||||
data['posters'] = [p for p in data['posters'] if p]
|
||||
|
||||
posters = find_re(html, '<div class="product-box-art".*?>(.*?)</div>')
|
||||
for poster in re.compile('<img src="(.*?)"').findall(posters):
|
||||
data['posters'].append(poster)
|
||||
|
||||
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
|
||||
if result:
|
||||
data["stills"] = [result]
|
||||
data["trailers"] = []
|
||||
else:
|
||||
data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
|
||||
data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
|
||||
data["stills"] = list(filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")]))
|
||||
data["trailers"] = list(filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")]))
|
||||
|
||||
if timeout == ox.cache.cache_timeout:
|
||||
timeout = -1
|
||||
if get_imdb:
|
||||
if get_imdb and 'title' in data and 'director' in data:
|
||||
# removed year, as "title (year)" may fail to match
|
||||
data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
|
||||
return data
|
||||
|
||||
def get_ids(page=None):
|
||||
ids = []
|
||||
if page:
|
||||
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
|
||||
html = read_url(url)
|
||||
results = re.compile("films/(\d+)").findall(html)
|
||||
html = read_url("https://www.criterion.com/shop/browse/list?sort=spine_number", unicode=True)
|
||||
results = re.compile("films/(\d+)-").findall(html)
|
||||
ids += results
|
||||
results = re.compile("boxsets/(.*?)\"").findall(html)
|
||||
for result in results:
|
||||
html = read_url("https://www.criterion.com/boxsets/" + result, unicode=True)
|
||||
results = re.compile("films/(\d+)-").findall(html)
|
||||
ids += results
|
||||
results = re.compile("boxsets/(.*?)\"").findall(html)
|
||||
for result in results:
|
||||
html = read_url("http://www.criterion.com/boxsets/" + result)
|
||||
results = re.compile("films/(\d+)").findall(html)
|
||||
ids += results
|
||||
return set(ids)
|
||||
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
|
||||
results = re.compile("\&p=(\d+)\&").findall(html)
|
||||
pages = max(map(int, results))
|
||||
for page in range(1, pages):
|
||||
ids += get_ids(page)
|
||||
return sorted(set(ids), key=int)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(get_ids())
@@ -1,21 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves.urllib.parse import unquote
from ox.cache import read_url


def get_video_url(url):
'''
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'

>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
'''
data = read_url(url)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
v = unquote(v).split('@@')[0]
return v
return ''
@@ -6,17 +6,25 @@ from six.moves import urllib
import ox
from ox import strip_tags, decode_html
from ox.cache import read_url
import lxml.html


def find(query, timeout=ox.cache.cache_timeout):
"""
Returns tuples with title, url, description
"""
if not isinstance(query, bytes):
query = query.encode('utf-8')
params = urllib.parse.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params
data = read_url(url, timeout=timeout).decode('utf-8')
doc = lxml.html.document_fromstring(data)
results = []
regex = '<a .*?class="large" href="(.+?)">(.*?)</a>.*?<div class="snippet">(.*?)</div>'
for r in re.compile(regex, re.DOTALL).findall(data):
results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
for e in doc.xpath("//a[contains(@class, 'result__a')]"):
url = e.attrib['href']
if 'uddg=' in url:
url = urllib.parse.unquote(url.split('&uddg=')[-1])
title = e.text_content()
description = ''
results.append((title, url, description))
return results
@@ -7,7 +7,7 @@ import time
from ox import strip_tags, find_re
from ox.cache import read_url

import google
from . import google


def get_show_url(title):
@@ -21,11 +21,11 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
"""
Return max_results tuples with title, url, description

>>> find("The Matrix site:imdb.com", 1)[0][0]
u'The Matrix (1999) - IMDb'
>>> str(find("The Matrix site:imdb.com", 1)[0][0])
'The Matrix (1999) - IMDb'

>>> find("The Matrix site:imdb.com", 1)[0][1]
u'http://www.imdb.com/title/tt0133093/'
>>> str(find("The Matrix site:imdb.com", 1)[0][1])
'http://www.imdb.com/title/tt0133093/'
"""
results = []
offset = 0
@@ -7,7 +7,7 @@ import time
|
|||
import unicodedata
|
||||
|
||||
from six.moves.urllib.parse import urlencode
|
||||
from six import string_types
|
||||
from six import text_type, string_types
|
||||
|
||||
from .. import find_re, strip_tags, decode_html
|
||||
from .. import cache
|
||||
|
|
@@ -18,22 +18,95 @@ from . import duckduckgo
|
|||
from ..utils import datetime
|
||||
from ..geo import normalize_country_name
|
||||
|
||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||
|
||||
def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||
headers = headers.copy()
|
||||
# https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
|
||||
headers['X-Forwarded-For'] = '72.21.206.80'
|
||||
return url, data, headers, timeout, unicode
|
||||
|
||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||
url, data, headers, timeout, unicode = prepare_url(url, data, headers, timeout, valid, unicode)
|
||||
return cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||
|
||||
def delete_url(url, data=None, headers=cache.DEFAULT_HEADERS):
|
||||
url, data, headers, timeout, unicode = prepare_url(url, data, headers)
|
||||
cache.store.delete(url, data, headers)
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.imdb.com/title/tt%s/" % id
|
||||
|
||||
|
||||
def reference_section(id):
|
||||
return {
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
}
|
||||
|
||||
|
||||
def zebra_list(label, more=None):
|
||||
conditions = {
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'_label">' + label + '</td>.*?<ul(.*?)</ul>',
|
||||
'<li.*?>(.*?)</li>'
|
||||
],
|
||||
'type': 'list',
|
||||
}
|
||||
if more:
|
||||
conditions['re'] += more
|
||||
return conditions
|
||||
|
||||
def zebra_table(label, more=None, type='string'):
|
||||
conditions = {
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'_label">' + label + '</td>.*?<td>(.*?)</td>',
|
||||
],
|
||||
'type': type,
|
||||
}
|
||||
if more:
|
||||
conditions['re'] += more
|
||||
return conditions
|
||||
|
||||
def parse_aspectratio(value):
|
||||
r = value
|
||||
if ':' in value:
|
||||
r = value.split(':')
|
||||
n = r[0]
|
||||
d = r[1].strip().split(' ')[0]
|
||||
try:
|
||||
if float(d):
|
||||
value = str(float(n) / float(d))
|
||||
else:
|
||||
value = str(float(n))
|
||||
except:
|
||||
print('failed to parse aspect: %s' % value)
|
||||
else:
|
||||
value = '.'.join(value.strip().split('.')[:2])
|
||||
return value
|
||||
|
||||
'''
|
||||
'posterIds': {
|
||||
'page': 'posters',
|
||||
're': '/unknown-thumbnail/media/rm(.*?)/tt',
|
||||
'type': 'list'
|
||||
},
|
||||
'''
|
||||
|
||||
class Imdb(SiteParser):
|
||||
'''
|
||||
>>> Imdb('0068646')['title']
|
||||
u'The Godfather'
|
||||
>>> Imdb('0068646')['title'] == text_type(u'The Godfather')
|
||||
True
|
||||
|
||||
>>> Imdb('0133093')['title']
|
||||
u'The Matrix'
|
||||
>>> Imdb('0133093')['title'] == text_type(u'The Matrix')
|
||||
True
|
||||
'''
|
||||
regex = {
|
||||
regex = {
|
||||
'alternativeTitles': {
|
||||
'page': 'releaseinfo',
|
||||
're': [
|
||||
|
|
@@ -41,98 +114,49 @@ class Imdb(SiteParser):
|
|||
"td>(.*?)</td>.*?<td>(.*?)</td>"
|
||||
],
|
||||
'type': 'list'
|
||||
|
||||
},
|
||||
'aspectratio': {
|
||||
'page': 'combined',
|
||||
're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
|
||||
parse_aspectratio,
|
||||
],
|
||||
'type': 'float',
|
||||
},
|
||||
'budget': {
|
||||
'page': 'business',
|
||||
're': [
|
||||
'<h5>Budget</h5>\s*?\$(.*?)<br',
|
||||
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'budget': zebra_table('Budget', more=[
|
||||
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
||||
], type='int'),
|
||||
'cast': {
|
||||
'page': 'combined',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
|
||||
lambda ll: [strip_tags(l) for l in ll]
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'cinematographer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Cinematography by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
' <table class="cast_list">(.*?)</table>',
|
||||
'<td.*?itemprop="actor".*?>.*?>(.*?)</a>.*?<td class="character">(.*?)</td>',
|
||||
lambda ll: [strip_tags(l) for l in ll] if isinstance(ll, list) else strip_tags(ll)
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'cinematographer': reference_section('cinematographers'),
|
||||
'connections': {
|
||||
'page': 'movieconnections',
|
||||
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
|
||||
'type': 'list'
|
||||
},
|
||||
'country': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<div class="info"><h5>Country:</h5>.*?<div class="info">',
|
||||
#'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
|
||||
'<a.*?>(.*?)</a>',
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
|
||||
'creator': {
|
||||
'page': 'combined',
|
||||
'page': '',
|
||||
're': [
|
||||
'<h5>Creator.?:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'director': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('<b>Series Crew</b>')[0],
|
||||
'Directed by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'_director': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'editor': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Film Editing by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'composer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Original Music by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
'<div class="credit_summary_item">.*?<h4.*?>Creator.?:</h4>(.*?)</div>',
|
||||
'<a href="/name/.*?>(.*?)</a>',
|
||||
lambda ll: strip_tags(ll)
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'director': reference_section('directors'),
|
||||
'editor': reference_section('editors'),
|
||||
'composer': reference_section('composers'),
|
||||
'episodeTitle': {
|
||||
'page': 'combined',
|
||||
're': '<div id="tn15title">.*?<em>(.*?)</em>',
|
||||
'page': 'reference',
|
||||
're': '<h3 itemprop="name">(.*?)<',
|
||||
'type': 'string'
|
||||
},
|
||||
'filmingLocations': {
|
||||
|
|
@@ -143,71 +167,44 @@ class Imdb(SiteParser):
|
|||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'genre': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Genre:</h5>(.*?)<hr',
|
||||
'<a href="/Sections/Genres/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'gross': {
|
||||
'page': 'business',
|
||||
're': [
|
||||
'<h5>Gross</h5>\s*?\$(.*?)<br',
|
||||
lambda data: find_re(data.replace(',', ''), '\d+')
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
|
||||
'gross': zebra_table('Cumulative Worldwide Gross', more=[
|
||||
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
||||
], type='int'),
|
||||
'keyword': {
|
||||
'page': 'keywords',
|
||||
're': '<a href="/keyword/.*?>(.*?)</a>',
|
||||
'type': 'list'
|
||||
},
|
||||
'language': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<div class="info"><h5>Language:</h5>.*?<div class="info">',
|
||||
#'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
|
||||
'<a.*?>(.*?)</a>',
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'summary': {
|
||||
'page': 'plotsummary',
|
||||
're': '<p class="plotSummary">(.*?)<\/p>',
|
||||
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
|
||||
'originalTitle': {
|
||||
'page': 'releaseinfo',
|
||||
're': '<td>\(original title\)</td>\s*<td>(.*?)</td>',
|
||||
'type': 'string'
|
||||
},
|
||||
'summary': zebra_table('Plot Summary', more=[
|
||||
'<p>(.*?)<em'
|
||||
]),
|
||||
'posterId': {
|
||||
'page': 'combined',
|
||||
're': '/primary-photo/media/rm(.*?)/tt',
|
||||
'page': 'reference',
|
||||
're': '<img.*?class="titlereference-primary-image".*?src="(.*?)".*?>',
|
||||
'type': 'string'
|
||||
},
|
||||
'posterIds': {
|
||||
'page': 'posters',
|
||||
're': '/unknown-thumbnail/media/rm(.*?)/tt',
|
||||
'type': 'list'
|
||||
},
|
||||
'producer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Produced by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'producer': reference_section('producers'),
|
||||
'productionCompany': {
|
||||
'page': 'combined',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'Production Companies</b><ul>(.*?)</ul>',
|
||||
'Production Companies.*?<ul(.*?)</ul>',
|
||||
'<a href="/company/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'rating': {
|
||||
'page': 'combined',
|
||||
're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'<div class="ipl-rating-star ">(.*?)</div>',
|
||||
'ipl-rating-star__rating">([\d,.]+?)</span>',
|
||||
],
|
||||
'type': 'float'
|
||||
},
|
||||
'releasedate': {
|
||||
|
|
@@ -218,64 +215,55 @@ class Imdb(SiteParser):
|
|||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'reviews': {
|
||||
'page': 'externalreviews',
|
||||
're': [
|
||||
'<ol>(.*?)</ol>',
|
||||
'<li><a href="(http.*?)".*?>(.*?)</a></li>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'runtime': {
|
||||
'page': 'combined',
|
||||
're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
|
||||
'type': 'string'
|
||||
},
|
||||
'color': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Color:</h5><div class="info-content">(.*?)</div>',
|
||||
'<a.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'sound': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
|
||||
'<a.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
#FIXME using some /offsite/ redirect now
|
||||
#'reviews': {
|
||||
# 'page': 'externalreviews',
|
||||
# 're': [
|
||||
# '<ul class="simpleList">(.*?)</ul>',
|
||||
# '<li>.*?<a href="(http.*?)".*?>(.*?)</a>.*?</li>'
|
||||
# ],
|
||||
# 'type': 'list'
|
||||
#},
|
||||
'runtime': zebra_list('Runtime'),
|
||||
'color': zebra_list('Color', more=[
|
||||
'<a.*?>([^(<]+)',
|
||||
lambda r: r[0] if isinstance(r, list) else r,
|
||||
strip_tags
|
||||
]),
|
||||
'sound': zebra_list('Sound Mix', more=[
|
||||
'<a.*?>([^(<]+)',
|
||||
lambda r: r[0] if isinstance(r, list) else r,
|
||||
strip_tags
|
||||
]),
|
||||
'season': {
|
||||
'page': 'combined',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'\(Season (\d+), Episode \d+\)',
|
||||
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
||||
'Season (\d+)',
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'episode': {
|
||||
'page': 'combined',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'\(Season \d+, Episode (\d+)\)',
|
||||
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
||||
'Episode (\d+)',
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'series': {
|
||||
'page': 'combined',
|
||||
're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
|
||||
'page': 'reference',
|
||||
're': '<h4 itemprop="name">.*?<a href="/title/tt(\d{7})',
|
||||
'type': 'string'
|
||||
},
|
||||
'isSeries': {
|
||||
'page': 'combined',
|
||||
're': '<span class="tv-extra">(TV series|TV mini-series) ',
|
||||
'page': 'reference',
|
||||
're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
|
||||
'type': 'string'
|
||||
},
|
||||
'title': {
|
||||
'page': 'combined',
|
||||
're': '<h1>(.*?) <span>',
|
||||
'page': 'releaseinfo',
|
||||
're': 'h3 itemprop="name">.*?>(.*?)</a>',
|
||||
'type': 'string'
|
||||
},
|
||||
'trivia': {
|
||||
|
|
@@ -287,38 +275,45 @@ class Imdb(SiteParser):
|
|||
'type': 'list',
|
||||
},
|
||||
'votes': {
|
||||
'page': 'combined',
|
||||
're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'class="ipl-rating-star__total-votes">\((.*?)\)',
|
||||
lambda r: r.replace(',', '')
|
||||
],
|
||||
'type': 'string'
|
||||
},
|
||||
'writer': {
|
||||
'page': 'combined',
|
||||
'writer': reference_section('writers'),
|
||||
'year': {
|
||||
'page': 'reference',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Writing credits</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
'<span class="titlereference-title-year">(.*?)</span>',
|
||||
'<a.*?>(\d+)',
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'credits': {
|
||||
'page': 'fullcredits',
|
||||
're': [
|
||||
lambda data: data.split('<h4'),
|
||||
'>(.*?)</h4>.*?(<table.*?</table>)',
|
||||
lambda data: [d for d in data if d]
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'year': {
|
||||
'page': 'combined',
|
||||
're': '="og:title" content="[^"]*?\((\d{4}).*?"',
|
||||
'type': 'int'
|
||||
}
|
||||
}
|
||||
|
||||
def read_url(self, url, timeout):
|
||||
if not url in self._cache:
|
||||
if url not in self._cache:
|
||||
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
|
||||
return self._cache[url]
|
||||
|
||||
def __init__(self, id, timeout=-1):
|
||||
#use akas.imdb.com to always get original title:
|
||||
#http://www.imdb.com/help/show_leaf?titlelanguagedisplay
|
||||
self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
|
||||
# use akas.imdb.com to always get original title:
|
||||
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay
|
||||
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
|
||||
super(Imdb, self).__init__(timeout)
|
||||
|
||||
url = self.baseUrl + 'combined'
|
||||
|
||||
url = self.baseUrl + 'reference'
|
||||
page = self.read_url(url, timeout=-1)
|
||||
if '<title>IMDb: Page not found</title>' in page \
|
||||
or 'The requested URL was not found on our server.' in page:
|
||||
|
|
@@ -332,119 +327,15 @@ class Imdb(SiteParser):
|
|||
isinstance(self['alternativeTitles'][0], string_types):
|
||||
self['alternativeTitles'] = [self['alternativeTitles']]
|
||||
|
||||
for key in ('country', 'genre', 'language', 'sound', 'color'):
|
||||
if key in self:
|
||||
self[key] = [x[0] if len(x) == 1 and isinstance(x, list) else x for x in self[key]]
|
||||
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
|
||||
|
||||
#normalize country names
|
||||
if 'country' in self:
|
||||
self['country'] = [normalize_country_name(c) or c for c in self['country']]
|
||||
|
||||
if 'sound' in self:
|
||||
self['sound'] = list(set(self['sound']))
|
||||
|
||||
types = {}
|
||||
stop_words = [
|
||||
'alternative spelling',
|
||||
'alternative title',
|
||||
'alternative transliteration',
|
||||
'closing credits title',
|
||||
'complete title',
|
||||
'IMAX version',
|
||||
'informal short title',
|
||||
'International (Spanish title)',
|
||||
'Japan (imdb display title)',
|
||||
'longer version',
|
||||
'new title',
|
||||
'original subtitled version',
|
||||
'pre-release title',
|
||||
'promotional abbreviation',
|
||||
'recut version',
|
||||
'reissue title',
|
||||
'restored version',
|
||||
'script title',
|
||||
'short title',
|
||||
'(subtitle)',
|
||||
'TV title',
|
||||
'working title',
|
||||
'World-wide (Spanish title)',
|
||||
]
|
||||
#ignore english japanese titles
|
||||
#for movies that are not only from japan
|
||||
if ['Japan'] != self.get('country', []):
|
||||
stop_words += [
|
||||
'Japan (English title)'
|
||||
]
|
||||
for t in self.get('alternativeTitles', []):
|
||||
for type in t[0].split('/'):
|
||||
type = type.strip()
|
||||
stop_word = False
|
||||
for key in stop_words:
|
||||
if key in type:
|
||||
stop_word = True
|
||||
break
|
||||
if not stop_word:
|
||||
if not type in types:
|
||||
types[type] = []
|
||||
types[type].append(t[1])
|
||||
titles = {}
|
||||
for type in types:
|
||||
for title in types[type]:
|
||||
if not title in titles:
|
||||
titles[title] = []
|
||||
titles[title].append(type)
|
||||
def select_title(type):
|
||||
title = types[type][0]
|
||||
count = 0
|
||||
if len(types[type]) > 1:
|
||||
for t in types[type]:
|
||||
if len(titles[t]) > count:
|
||||
count = len(titles[t])
|
||||
title = t
|
||||
return title
|
||||
|
||||
#FIXME: does work in python2.6, possible to import from __future__?
|
||||
#types = {type: select_title(type) for type in types}
|
||||
_types = {}
|
||||
for type in types:
|
||||
_types[type] = select_title(type)
|
||||
types = _types
|
||||
|
||||
regexps = [
|
||||
"^.+ \(imdb display title\) \(English title\)$",
|
||||
"^USA \(imdb display title\)$",
|
||||
"^International \(English title\)$",
|
||||
"^International \(English title\)$",
|
||||
"^UK \(imdb display title\)$",
|
||||
"^International \(.+\) \(English title\)$",
|
||||
"^World-wide \(English title\)$",
|
||||
]
|
||||
if 'Hong Kong' in self.get('country', []):
|
||||
regexps += [
|
||||
"Hong Kong \(English title\)"
|
||||
]
|
||||
english_countries = (
|
||||
'USA', 'UK', 'United States', 'United Kingdom',
|
||||
'Australia', 'New Zealand'
|
||||
)
|
||||
if not filter(lambda c: c in english_countries, self.get('country', [])):
|
||||
regexps += [
|
||||
"^[^(]+ \(English title\)$",
|
||||
"^.+ \(.+\) \(English title\)$",
|
||||
"^USA$",
|
||||
"^UK$",
|
||||
"^USA \(.+\)$",
|
||||
"^UK \(.+\)$",
|
||||
"^Australia \(.+\)$",
|
||||
"World-wide \(English title\)",
|
||||
"\(literal English title\)",
|
||||
"^International \(.+ title\)$",
|
||||
"^International \(.+\) \(.+ title\)$",
|
||||
]
|
||||
for regexp in regexps:
|
||||
for type in types:
|
||||
if re.compile(regexp).findall(type):
|
||||
#print types[type], type
|
||||
self['internationalTitle'] = types[type]
|
||||
break
|
||||
if 'internationalTitle' in self:
|
||||
break
|
||||
|
||||
def cleanup_title(title):
|
||||
if title.startswith('"') and title.endswith('"'):
|
||||
|
|
@@ -454,44 +345,43 @@ class Imdb(SiteParser):
|
|||
title = re.sub('\(\#[.\d]+\)', '', title)
|
||||
return title.strip()
|
||||
|
||||
for t in ('title', 'internationalTitle'):
|
||||
for t in ('title', 'originalTitle'):
|
||||
if t in self:
|
||||
self[t] = cleanup_title(self[t])
|
||||
|
||||
if 'internationalTitle' in self and \
|
||||
self.get('title', '').lower() == self['internationalTitle'].lower():
|
||||
del self['internationalTitle']
|
||||
|
||||
if 'alternativeTitles' in self:
|
||||
alt = {}
|
||||
for t in self['alternativeTitles']:
|
||||
title = cleanup_title(t[1])
|
||||
if title not in (self.get('title'), self.get('internationalTitle')):
|
||||
if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
|
||||
if title not in alt:
|
||||
alt[title] = []
|
||||
for c in t[0].split('/'):
|
||||
if not '(working title)' in c:
|
||||
c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
|
||||
if c:
|
||||
alt[title].append(c)
|
||||
for cleanup in ('International', '(working title)', 'World-wide'):
|
||||
c = c.replace(cleanup, '')
|
||||
c = c.split('(')[0].strip()
|
||||
if c:
|
||||
alt[title].append(c)
|
||||
self['alternativeTitles'] = []
|
||||
for t in sorted(alt, key=lambda a: sorted(alt[a])):
|
||||
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
|
||||
countries = sorted(set([normalize_country_name(c) or c for c in alt[t]]))
|
||||
self['alternativeTitles'].append((t, countries))
|
||||
if not self['alternativeTitles']:
|
||||
del self['alternativeTitles']
|
||||
|
||||
if 'internationalTitle' in self:
|
||||
self['originalTitle'] = self['title']
|
||||
self['title'] = self.pop('internationalTitle')
|
||||
|
||||
if 'runtime' in self and self['runtime']:
|
||||
if 'min' in self['runtime']: base=60
|
||||
else: base=1
|
||||
if isinstance(self['runtime'], list):
|
||||
self['runtime'] = self['runtime'][0]
|
||||
if 'min' in self['runtime']:
|
||||
base = 60
|
||||
else:
|
||||
base = 1
|
||||
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
|
||||
if 'runtime' in self and not self['runtime']:
|
||||
del self['runtime']
|
||||
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
|
||||
|
||||
if 'sound' in self:
|
||||
self['sound'] = list(sorted(set(self['sound'])))
|
||||
|
||||
if 'cast' in self:
|
||||
if isinstance(self['cast'][0], string_types):
|
||||
|
|
@@ -499,6 +389,7 @@ class Imdb(SiteParser):
|
|||
self['actor'] = [c[0] for c in self['cast']]
|
||||
def cleanup_character(c):
|
||||
c = c.replace('(uncredited)', '').strip()
|
||||
c = re.sub('\s+', ' ', c)
|
||||
return c
|
||||
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
|
||||
for x in self['cast']]
|
||||
|
|
@@ -522,18 +413,8 @@ class Imdb(SiteParser):
|
|||
return r
|
||||
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
|
||||
|
||||
|
||||
self['connections'] = cc
|
||||
|
||||
for key in ('country', 'genre'):
|
||||
if key in self:
|
||||
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
|
||||
#0092999
|
||||
if '_director' in self:
|
||||
if 'series' in self or 'isSeries' in self:
|
||||
self['creator'] = self.pop('_director')
|
||||
else:
|
||||
del self['_director']
|
||||
if 'isSeries' in self:
|
||||
del self['isSeries']
|
||||
self['isSeries'] = True
|
||||
|
|
@@ -555,7 +436,7 @@ class Imdb(SiteParser):
|
|||
if 'director' in self:
|
||||
self['episodeDirector'] = self['director']
|
||||
|
||||
if not 'creator' in series and 'director' in series:
|
||||
if 'creator' not in series and 'director' in series:
|
||||
series['creator'] = series['director']
|
||||
if len(series['creator']) > 10:
|
||||
series['creator'] = series['director'][:1]
|
||||
|
|
@@ -566,7 +447,7 @@ class Imdb(SiteParser):
|
|||
|
||||
if 'year' in series:
|
||||
self['seriesYear'] = series['year']
|
||||
if not 'year' in self:
|
||||
if 'year' not in self:
|
||||
self['year'] = series['year']
|
||||
|
||||
if 'year' in self:
|
||||
|
|
@@ -620,11 +501,48 @@ class Imdb(SiteParser):
|
|||
self['summary'] = self['summary'][0]
|
||||
self['summary'] = self['summary'].split('</p')[0].strip()
|
||||
|
||||
if 'credits' in self:
|
||||
credits = [
|
||||
[
|
||||
strip_tags(d[0].replace(' by', '')).strip(),
|
||||
[
|
||||
[
|
||||
strip_tags(x[0]).strip(),
|
||||
[t.strip().split(' (')[0].strip() for t in x[2].split(' / ')]
|
||||
]
|
||||
for x in
|
||||
re.compile('<td class="name">(.*?)</td>.*?<td>(.*?)</td>.*?<td class="credit">(.*?)</td>', re.DOTALL).findall(d[1])
|
||||
]
|
||||
] for d in self['credits'] if d
|
||||
]
|
||||
credits = [c for c in credits if c[1]]
|
||||
|
||||
self['credits'] = []
|
||||
self['lyricist'] = []
|
||||
self['singer'] = []
|
||||
for department, crew in credits:
|
||||
department = department.replace('(in alphabetical order)', '').strip()
|
||||
for c in crew:
|
||||
name = c[0]
|
||||
roles = c[1]
|
||||
self['credits'].append({
|
||||
'name': name,
|
||||
'roles': roles,
|
||||
'deparment': department
|
||||
})
|
||||
if department == 'Music Department':
|
||||
if 'lyricist' in roles:
|
||||
self['lyricist'].append(name)
|
||||
if 'playback singer' in roles:
|
||||
self['singer'].append(name)
|
||||
if not self['credits']:
|
||||
del self['credits']
|
||||
|
||||
class ImdbCombined(Imdb):
|
||||
def __init__(self, id, timeout=-1):
|
||||
_regex = {}
|
||||
for key in self.regex:
|
||||
if self.regex[key]['page'] in ('combined', 'releaseinfo'):
|
||||
if self.regex[key]['page'] in ('releaseinfo', 'reference'):
|
||||
_regex[key] = self.regex[key]
|
||||
self.regex = _regex
|
||||
super(ImdbCombined, self).__init__(id, timeout)
|
||||
|
|
@@ -640,25 +558,25 @@ def get_movie_by_title(title, timeout=-1):
|
|||
If there is more than one film with that title for the year
|
||||
Title (Year/I)
|
||||
|
||||
>>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')
|
||||
u'1602860'
|
||||
>>> str(get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}'))
|
||||
'1602860'
|
||||
|
||||
>>> get_movie_by_title(u'The Matrix (1999)')
|
||||
u'0133093'
|
||||
>>> str(get_movie_by_title(u'The Matrix (1999)'))
|
||||
'0133093'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt (1951)')
|
||||
u'0043748'
|
||||
>>> str(get_movie_by_title(u'Little Egypt (1951)'))
|
||||
'0043748'
|
||||
|
||||
>>> str(get_movie_by_title(u'Little Egypt (1897/I)'))
|
||||
'0214882'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt (1897/I)')
|
||||
u'0214882'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt')
|
||||
None
|
||||
|
||||
>>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
|
||||
u'0866567'
|
||||
>>> str(get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}'))
|
||||
'0866567'
|
||||
'''
|
||||
params = {'s':'tt','q': title}
|
||||
params = {'s': 'tt', 'q': title}
|
||||
if not isinstance(title, bytes):
|
||||
try:
|
||||
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
|
||||
|
|
@@ -676,20 +594,21 @@ def get_movie_by_title(title, timeout=-1):
|
|||
|
||||
def get_movie_id(title, director='', year='', timeout=-1):
|
||||
'''
|
||||
>>> get_movie_id('The Matrix')
|
||||
u'0133093'
|
||||
>>> str(get_movie_id('The Matrix'))
|
||||
'0133093'
|
||||
|
||||
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
|
||||
u'0060304'
|
||||
>>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard'))
|
||||
'0060304'
|
||||
|
||||
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
|
||||
u'0060304'
|
||||
>>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967'))
|
||||
'0060304'
|
||||
|
||||
>>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
|
||||
u'0179214'
|
||||
>>> str(get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", u'Jean-Luc Godard'))
|
||||
'0179214'
|
||||
|
||||
>>> str(get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", u'Jean-Luc Godard'))
|
||||
'0179214'
|
||||
|
||||
>>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
|
||||
u'0179214'
|
||||
'''
|
||||
imdbId = {
|
||||
(u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
|
||||
|
|
@@ -729,7 +648,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
|
|||
}.get((title, director), None)
|
||||
if imdbId:
|
||||
return imdbId
|
||||
params = {'s':'tt','q': title}
|
||||
params = {'s': 'tt', 'q': title}
|
||||
if director:
|
||||
params['q'] = u'"%s" %s' % (title, director)
|
||||
if year:
|
||||
|
|
@@ -756,8 +675,8 @@ def get_movie_id(title, director='', year='', timeout=-1):
|
|||
if results:
|
||||
return results[0]
|
||||
|
||||
#print (title, director), ": '',"
|
||||
#print google_query
|
||||
#print((title, director), ": '',")
|
||||
#print(google_query)
|
||||
#results = google.find(google_query, timeout=timeout)
|
||||
results = duckduckgo.find(google_query, timeout=timeout)
|
||||
if results:
|
||||
|
|
@@ -772,15 +691,12 @@ def get_movie_poster(imdbId):
|
|||
'''
|
||||
>>> get_movie_poster('0133093')
|
||||
'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
|
||||
|
||||
>>> get_movie_poster('0994352')
|
||||
'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
|
||||
'''
|
||||
info = ImdbCombined(imdbId)
|
||||
if 'posterId' in info:
|
||||
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
|
||||
data = read_url(url).decode('utf-8', 'ignore')
|
||||
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
|
||||
poster = info['posterId']
|
||||
if '@._V' in poster:
|
||||
poster = poster.split('@._V')[0] + '@.jpg'
|
||||
return poster
|
||||
elif 'series' in info:
|
||||
return get_movie_poster(info['series'])
|
||||
|
|
@@ -793,7 +709,7 @@ def get_episodes(imdbId, season=None):
|
|||
url += '?season=%d' % season
|
||||
data = cache.read_url(url)
|
||||
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
|
||||
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
|
||||
episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
|
||||
else:
|
||||
data = cache.read_url(url)
|
||||
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
|
||||
|
|
@@ -804,9 +720,11 @@ def get_episodes(imdbId, season=None):
|
|||
|
||||
def max_votes():
|
||||
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
|
||||
data = cache.read_url(url)
|
||||
votes = max([int(v.replace(',', ''))
|
||||
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
|
||||
data = cache.read_url(url).decode('utf-8', 'ignore')
|
||||
votes = max([
|
||||
int(v.replace(',', ''))
|
||||
for v in re.compile('<span name="nv" data-value="(\d+)"').findall(data)
|
||||
])
|
||||
return votes
|
||||
|
||||
def guess(title, director='', timeout=-1):
@@ -3,26 +3,34 @@
|
|||
from __future__ import print_function
|
||||
import re
|
||||
|
||||
from ox.cache import read_url
|
||||
import ox.cache
|
||||
from ox.html import strip_tags
|
||||
from ox.text import find_re
|
||||
|
||||
|
||||
def read_url(url, timeout=ox.cache.cache_timeout):
|
||||
data = ox.cache.read_url(url, timeout=timeout)
|
||||
try:
|
||||
data = data.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
data = data.decode('latin-1')
|
||||
return data
|
||||
|
||||
def get_data(id):
|
||||
'''
|
||||
>>> get_data('1991/silence_of_the_lambs')['imdbId']
|
||||
u'0102926'
|
||||
>>> str(get_data('1991/silence_of_the_lambs')['imdbId'])
|
||||
'0102926'
|
||||
|
||||
>>> get_data('1991/silence_of_the_lambs')['posters'][0]
|
||||
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
|
||||
>>> str(get_data('1991/silence_of_the_lambs')['posters'][0])
|
||||
'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
|
||||
|
||||
>>> get_data('1991/silence_of_the_lambs')['url']
|
||||
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
|
||||
>>> str(get_data('1991/silence_of_the_lambs')['url'])
|
||||
'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
|
||||
'''
|
||||
data = {
|
||||
'url': get_url(id)
|
||||
}
|
||||
html = read_url(data['url'], unicode=True)
|
||||
html = read_url(data['url'])
|
||||
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
|
||||
if not data['imdbId']:
|
||||
data['imdbId'] = _id_map.get(id, '')
|
||||
|
|
@@ -37,16 +45,15 @@ def get_data(id):
|
|||
for result in results:
|
||||
result = result.replace('_xlg.html', '.html')
|
||||
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
|
||||
html = read_url(url, unicode=True)
|
||||
html = read_url(url)
|
||||
result = find_re(html, '<a href = (\w*?_xlg.html)')
|
||||
if result:
|
||||
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
|
||||
html = read_url(url, unicode=True)
|
||||
html = read_url(url)
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
|
||||
else:
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
|
||||
data['posters'].append(poster)
|
||||
|
||||
return data
|
||||
|
||||
def get_id(url):
|
||||
|
|
@@ -60,27 +67,29 @@ def get_id(url):
|
|||
id = '%s/%s' % (year, '_'.join(split))
|
||||
return id
|
||||
|
||||
|
||||
def get_ids(page=None):
|
||||
ids = []
|
||||
if page:
|
||||
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
|
||||
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1)
|
||||
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
url = 'http://impawards.com/%s' % result
|
||||
ids.append(get_id(url))
|
||||
return set(ids)
|
||||
#get all
|
||||
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
|
||||
# get all
|
||||
html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60)
|
||||
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
|
||||
for page in range(pages, 0, -1):
|
||||
for id in get_ids(page):
|
||||
if not id in ids:
|
||||
if id not in ids:
|
||||
ids.append(id)
|
||||
return ids
|
||||
|
||||
|
||||
def get_url(id):
|
||||
url = u"http://www.impawards.com/%s.html" % id
|
||||
html = read_url(url, unicode=True)
|
||||
html = read_url(url)
|
||||
if find_re(html, "No Movie Posters on This Page"):
|
||||
url = u"http://www.impawards.com/%s_ver1.html" % id
|
||||
return url
@@ -28,22 +28,32 @@ def get_show_url(title):
|
|||
def get_data(url):
|
||||
data = read_url(url, unicode=True)
|
||||
doc = document_fromstring(data)
|
||||
score = filter(lambda s: s.attrib.get('property') == 'v:average',
|
||||
doc.xpath('//span[@class="score_value"]'))
|
||||
score = [s for s in doc.xpath('//span[@class="score_value"]')
|
||||
if s.attrib.get('property') == 'v:average']
|
||||
if score:
|
||||
score = int(score[0].text)
|
||||
else:
|
||||
score = -1
|
||||
authors = [a.text
|
||||
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')]
|
||||
sources = [d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')]
|
||||
reviews = [d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')]
|
||||
scores = [int(d.text.strip())
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')]
|
||||
urls = [a.attrib['href']
|
||||
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')]
|
||||
authors = [
|
||||
a.text
|
||||
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')
|
||||
]
|
||||
sources = [
|
||||
d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')
|
||||
]
|
||||
reviews = [
|
||||
d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')
|
||||
]
|
||||
scores = [
|
||||
int(d.text.strip())
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')
|
||||
]
|
||||
urls = [
|
||||
a.attrib['href']
|
||||
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')
|
||||
]
|
||||
|
||||
metacritics = []
|
||||
for i in range(len(authors)):
|
||||
|
|
@@ -54,7 +64,7 @@ def get_data(url):
|
|||
'quote': strip_tags(reviews[i]).strip(),
|
||||
'score': scores[i],
|
||||
})
|
||||
|
||||
|
||||
return {
|
||||
'critics': metacritics,
|
||||
'id': get_id(url),
|
@@ -1,121 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from datetime import datetime
|
||||
import re
|
||||
import socket
|
||||
from six.moves.urllib.parse import quote
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, int_value, normalize_newlines
|
||||
from ox.normalize import normalize_imdbid
|
||||
import ox
|
||||
|
||||
from torrent import Torrent
|
||||
|
||||
|
||||
def _parse_results_page(data, max_results=10):
|
||||
results=[]
|
||||
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
|
||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||
torrentDate = row[0]
|
||||
torrentExtra = row[1]
|
||||
torrentId = row[2]
|
||||
torrentTitle = decode_html(row[3]).strip()
|
||||
torrentLink = "http://www.mininova.org/tor/" + torrentId
|
||||
privateTracker = 'priv.gif' in torrentExtra
|
||||
if not privateTracker:
|
||||
results.append((torrentTitle, torrentLink, ''))
|
||||
return results
|
||||
|
||||
def find_movie(query=None, imdb=None, max_results=10):
|
||||
'''search for torrents on mininova
|
||||
'''
|
||||
if imdb:
|
||||
url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
|
||||
else:
|
||||
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
|
||||
data = read_url(url, unicode=True)
|
||||
return _parse_results_page(data, max_results)
|
||||
|
||||
def get_id(mininovaId):
|
||||
mininovaId = unicode(mininovaId)
|
||||
d = find_re(mininovaId, "/(\d+)")
|
||||
if d:
|
||||
return d
|
||||
mininovaId = mininovaId.split('/')
|
||||
if len(mininovaId) == 1:
|
||||
return mininovaId[0]
|
||||
else:
|
||||
return mininovaId[-1]
|
||||
|
||||
def exists(mininovaId):
|
||||
mininovaId = get_id(mininovaId)
|
||||
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
|
||||
if not data or 'Torrent not found...' in data:
|
||||
return False
|
||||
if 'tracker</a> of this torrent requires registration.' in data:
|
||||
return False
|
||||
return True
|
||||
|
||||
def get_data(mininovaId):
|
||||
_key_map = {
|
||||
'by': u'uploader',
|
||||
}
|
||||
mininovaId = get_id(mininovaId)
|
||||
torrent = dict()
|
||||
torrent[u'id'] = mininovaId
|
||||
torrent[u'domain'] = 'mininova.org'
|
||||
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
|
||||
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
|
||||
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
|
||||
|
||||
data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
|
||||
if '<h1>Torrent not found...</h1>' in data:
|
||||
return None
|
||||
|
||||
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
|
||||
key = d[0].lower().strip()
|
||||
key = _key_map.get(key, key)
|
||||
value = decode_html(strip_tags(d[1].strip()))
|
||||
torrent[key] = value
|
||||
|
||||
torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
|
||||
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
|
||||
torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
|
||||
if torrent['description']:
|
||||
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
|
||||
t = read_url(torrent[u'torrent_link'])
|
||||
torrent[u'torrent_info'] = get_torrent_info(t)
|
||||
return torrent
|
||||
|
||||
class Mininova(Torrent):
|
||||
'''
|
||||
>>> Mininova('123')
|
||||
{}
|
||||
>>> Mininova('1072195')['infohash']
|
||||
'72dfa59d2338e4a48c78cec9de25964cddb64104'
|
||||
'''
|
||||
def __init__(self, mininovaId):
|
||||
self.data = get_data(mininovaId)
|
||||
if not self.data:
|
||||
return
|
||||
Torrent.__init__(self)
|
||||
ratio = self.data['share ratio'].split(',')
|
||||
self['seeder'] = -1
|
||||
self['leecher'] = -1
|
||||
if len(ratio) == 2:
|
||||
val = int_value(ratio[0].replace(',','').strip())
|
||||
if val:
|
||||
self['seeder'] = int(val)
|
||||
val = int_value(ratio[1].replace(',','').strip())
|
||||
if val:
|
||||
self['leecher'] = int(val)
|
||||
val = int_value(self.data['downloads'].replace(',','').strip())
|
||||
if val:
|
||||
self['downloaded'] = int(val)
|
||||
else:
|
||||
self['downloaded'] = -1
|
||||
published = self.data['added on']
|
||||
published = published.split(' +')[0]
|
||||
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
@@ -2,12 +2,12 @@
# vi:si:et:sw=4:sts=4:ts=4
import re

import feedparser
from ox.cache import read_url
from ox import find_re, strip_tags
from ox.iso import langCode2To3, langTo3Code

def find_subtitles(imdb, parts = 1, language = "eng"):
import feedparser
if len(language) == 2:
language = langCode2To3(language)
elif len(language) != 3:
@@ -32,7 +32,7 @@ def get_data(url):
r['summary'] = get_og(data, 'description')

meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
meter = filter(lambda m: m[1].isdigit(), meter)
meter = [m for m in meter if m[1].isdigit()]
if meter:
r['tomatometer'] = meter[0][1]
r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
@@ -33,7 +33,7 @@ class SiteParser(dict):
return "%s%s" % (self.baseUrl, page)

def read_url(self, url, timeout):
if not url in self._cache:
if url not in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url]
@@ -95,7 +95,7 @@ def format_subsection(string):
|
|||
'ussports': 'US-Sports',
|
||||
'wunderbar': 'wunderBAR'
|
||||
}
|
||||
if subsection.has_key(string):
|
||||
if string in subsection:
|
||||
return subsection[string].replace(u'\xc3', 'ae')
|
||||
return string[:1].upper() + string[1:]
|
||||
|
||||
|
|
@@ -219,8 +219,8 @@ def archive_news():
|
|||
else:
|
||||
dMax = days[m]
|
||||
for d in range(dMax, 0, -1):
|
||||
print('getNews(%d, %d, %d)' % (y, m, d))
|
||||
news = getNews(y, m ,d)
|
||||
print('get_news(%d, %d, %d)' % (y, m, d))
|
||||
news = get_news(y, m, d)
|
||||
for new in news:
|
||||
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
|
||||
if not os.path.exists(dirname):
|
||||
|
|
@@ -230,7 +230,7 @@ def archive_news():
|
|||
else:
|
||||
filename = dirname + '/' + new['url'] + '.json'
|
||||
if not os.path.exists(filename) or True:
|
||||
data = json.dumps(new, ensure_ascii = False)
|
||||
data = json.dumps(new, ensure_ascii=False)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
|
|
@@ -253,7 +253,7 @@ def archive_news():
|
|||
string = strings[3]
|
||||
if len(strings) == 6:
|
||||
string += '/' + strings[4]
|
||||
if not count.has_key(string):
|
||||
if string not in count:
|
||||
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
|
||||
else:
|
||||
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
|
||||
|
|
@@ -269,12 +269,12 @@ if __name__ == '__main__':
|
|||
# spiegel = Spiegel(2008, 8)
|
||||
# print(spiegel.getContents())
|
||||
# news = News(2001, 9, 10)
|
||||
# output(news.getNews())
|
||||
# output(news.get_news())
|
||||
'''
|
||||
x = []
|
||||
for d in range(10, 30):
|
||||
print('2/%d' % d)
|
||||
news = getNews(2008, 2, d)
|
||||
news = get_news(2008, 2, d)
|
||||
for new in news:
|
||||
strings = new['url'].split('/')
|
||||
string = format_section(strings[3])
@@ -21,10 +21,10 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
Return max_results tuples with title, url, description

>>> find("The Matrix site:imdb.com", 1)[0][0]
u'The Matrix (1999) - IMDb'
'The Matrix (1999) - IMDb'

>>> find("The Matrix site:imdb.com", 1)[0][1]
u'http://www.imdb.com/title/tt0133093/'
'http://www.imdb.com/title/tt0133093/'
"""
results = []
url = 'https://eu1.startpage.com/do/search?nosteeraway=1&abp=1&language=english&cmd=process_search&query=%s&x=0&y=0&cat=web&engine0=v1all' % quote_plus(query)
@@ -9,11 +9,10 @@ from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normal
|
|||
from ox.normalize import normalize_imdbid
|
||||
import ox
|
||||
|
||||
from torrent import Torrent
|
||||
|
||||
cache_timeout = 24*60*60 # cache search only for 24 hours
|
||||
|
||||
season_episode = re.compile("S..E..", re.IGNORECASE)
|
||||
baseurl = "https://thepiratebay.org/"
|
||||
|
||||
|
||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||
|
|
@@ -25,7 +24,7 @@ def find_movies(query=None, imdb=None, max_results=10):
if imdb:
query = "tt" + normalize_imdbid(imdb)
results = []
next = ["https://thepiratebay.se/search/%s/0/3/200" % quote(query), ]
next = [baseurl + "hsearch/%s/0/3/200" % quote(query), ]
page_count = 1
while next and page_count < 4:
page_count += 1
@ -33,12 +32,12 @@ def find_movies(query=None, imdb=None, max_results=10):
if not url.startswith('http'):
if not url.startswith('/'):
url = "/" + url
url = "https://thepiratebay.se" + url
url = baseurl + url
data = read_url(url, timeout=cache_timeout, unicode=True)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
torrentLink = "https://thepiratebay.se" + row[1]
torrentLink = baseurl + row[1]
torrentTitle = decode_html(row[2])
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
if torrentType in ['201']:
@ -61,7 +60,7 @@ def get_id(piratebayId):

def exists(piratebayId):
piratebayId = get_id(piratebayId)
return ox.net.exists("https://thepiratebay.se/torrent/%s" % piratebayId)
return ox.net.exists(baseurl + "torrent/%s" % piratebayId)

def get_data(piratebayId):
_key_map = {
@ -75,7 +74,7 @@ def get_data(piratebayId):
torrent = dict()
torrent[u'id'] = piratebayId
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'https://thepiratebay.se/torrent/%s' % piratebayId
torrent[u'comment_link'] = baseurl + 'torrent/%s' % piratebayId

data = read_url(torrent['comment_link'], unicode=True)
torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
@ -84,33 +83,15 @@ def get_data(piratebayId):
torrent[u'title'] = decode_html(torrent[u'title']).strip()
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
title = quote(torrent['title'].encode('utf-8'))
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
torrent[u'magent_link']= find_re(data, '"(magnet:.*?)"')
torrent[u'infohash'] = find_re(torrent[u'magent_link'], "btih:(.*?)&")
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
if not '<' in key:
torrent[key] = value
torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = get_torrent_info(t)
return torrent

class Thepiratebay(Torrent):
'''
>>> Thepiratebay('123')
{}

>>> Thepiratebay('3951349')['infohash']
'4e84415d36ed7b54066160c05a0b0f061898d12b'
'''
def __init__(self, piratebayId):
self.data = get_data(piratebayId)
if not self.data:
return
Torrent.__init__(self)
published = self.data['uploaded']
published = published.replace(' GMT', '').split(' +')[0]
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
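The surviving code above keeps extracting the info hash from the magnet URI with a regular expression (and keeps the pre-existing `magent_link` typo in the key name). A self-contained sketch of the same extraction, using a minimal stand-in for `ox.find_re` and a typical magnet URI as the assumed input:

import re

magnet = "magnet:?xt=urn:btih:4e84415d36ed7b54066160c05a0b0f061898d12b&dn=example"

def find_re(data, regexp):
    # minimal stand-in for ox.find_re: first group of the first match, or ''
    matches = re.compile(regexp, re.DOTALL).findall(data)
    return matches[0].strip() if matches else ''

infohash = find_re(magnet, "btih:(.*?)&")
print(infohash)  # 4e84415d36ed7b54066160c05a0b0f061898d12b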
@ -1,37 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox import int_value


class Torrent(dict):
'''
>>> Torrent()
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
'''
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
_dict_keys = ('torrent_info', )
_list_keys = ()
data = {'torrent_info': {}}

def __init__(self):
for key in self._string_keys:
self[key] = self.data.get(key, u'')
for key in self._dict_keys:
self[key] = self.data.get(key, {})
for key in self._list_keys:
self[key] = self.data.get(key, [])
for key in self._int_keys:
value = self.data.get(key, -1)
if not isinstance(value, int):
value = int(int_value(value))
self[key] = value
self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1)
self['announce'] = self.data['torrent_info'].get('announce', '')
if 'files' in self.data['torrent_info']:
self['files'] = len(self.data['torrent_info']['files'])
else:
self['files'] = 1
@ -116,7 +116,7 @@ def get_movie_data(wikipedia_url):

def get_image_url(name):
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
data = read_url(url)
data = read_url(url).decode('utf-8')
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
if not url:
url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
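The recurring change in this and the following hunks is that `read_url` is now treated as returning bytes, decoded explicitly before any regex or string handling; on Python 3, mixing bytes with `str` patterns raises a `TypeError` rather than working silently. A minimal sketch of the pattern, with a hypothetical fetcher standing in for `ox.cache.read_url`:

import re

def read_url(url):
    # hypothetical stand-in: the real code goes through ox.cache.read_url and returns bytes
    return b'<a href="http://upload.wikimedia.org/example.jpg">img</a>'

data = read_url('http://en.wikipedia.org/wiki/Image:Example.jpg').decode('utf-8')
# decoding first keeps the regex in str/str territory on Python 3
match = re.search(r'href="(http://upload.wikimedia.org/.*?)"', data)
print(match.group(1) if match else None)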
@ -145,7 +145,7 @@ def find(query, max_results=10):
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
data = read_url(url)
if not data:
data = read_url(url, timeout=0)
data = read_url(url, timeout=0)
result = json.loads(data.decode('utf-8'))
results = []
if result and 'query' in result:
@ -7,7 +7,6 @@ import re
from xml.dom.minidom import parseString
import json

import feedparser
import ox
from ox.cache import read_url, cache_timeout
@ -27,15 +26,15 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout):
"""
fmt = None
if format == '4k':
fmt=38
fmt = 38
elif format == '1080p':
fmt=37
fmt = 37
elif format == '720p':
fmt=22
fmt = 22
elif format == 'mp4':
fmt=18
fmt = 18
elif format == 'high':
fmt=35
fmt = 35
elif format == 'webm':
streams = videos(youtubeId, 'webm')
return streams[max(streams.keys())]['url']
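The whitespace fixes above touch a chain of `elif` branches that maps human-readable format names to YouTube's legacy `fmt` codes (38, 37, 22, 18, 35). The same relationship can be sketched as a lookup table; `FMT_CODES` is purely illustrative, not a change the commit makes:

# format name -> legacy YouTube fmt code, as encoded by the elif chain above
FMT_CODES = {
    '4k': 38,
    '1080p': 37,
    '720p': 22,
    'mp4': 18,
    'high': 35,
}

fmt = FMT_CODES.get('720p')  # 22; 'webm' is handled separately via videos()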
@ -46,14 +45,14 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout):

def get_video_info(id):
eurl = get_url(id)
data = read_url(eurl)
data = read_url(eurl).decode('utf-8')
t = re.compile('\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
if t:
t = t[0]
else:
raise IOError
url = "http://www.youtube.com/get_video_info?&video_id=%s&el=$el&ps=default&eurl=%s&hl=en_US&t=%s" % (id, quote(eurl), quote(t))
data = read_url(url)
data = read_url(url).decode('utf-8')
info = {}
for part in data.split('&'):
key, value = part.split('=')
@ -61,6 +60,7 @@ def get_video_info(id):
return info

def find(query, max_results=10, offset=1, orderBy='relevance'):
import feedparser
query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
data = read_url(url)
@ -104,14 +104,20 @@ def info(id, timeout=cache_timeout):
info['license'] = match[0].strip()
info['license'] = re.sub('<.+?>', '', info['license']).strip()

subs = subtitles(id, timeout)
if subs:
info['subtitles'] = subs
return info

def subtitles(id, timeout=cache_timeout):
url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
data = read_url(url, timeout=timeout)
xml = parseString(data)
languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
subtitles = {}
if languages:
info['subtitles'] = {}
for language in languages:
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind" % (id, language)
data = read_url(url, timeout=timeout)
xml = parseString(data)
subs = []
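Taken together, these hunks split subtitle fetching out of `info()` into a standalone `subtitles()` helper that returns a dict keyed by language code, which `info()` attaches only when it is non-empty. A rough sketch of the resulting call shape; the bodies and the `data` name are placeholders, only `subtitles` and `info` come from the diff:

def subtitles(id, timeout=None):
    # stand-in body: the real helper parses the timedtext track list per language
    return {}

def info(id, timeout=None):
    data = {'id': id}
    # ... license and other metadata gathered as before ...
    subs = subtitles(id, timeout)
    if subs:                      # only attach the key when something was found
        data['subtitles'] = subs
    return data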
@ -128,8 +134,8 @@ def info(id, timeout=cache_timeout):
'out': end,
'value': ox.decode_html(text),
})
info['subtitles'][language] = subs
return info
subtitles[language] = subs
return subtitles

def videos(id, format=''):
stream_type = {
@ -154,7 +160,7 @@ def videos(id, format=''):
return streams

def playlist(url):
data = read_url(url)
data = read_url(url).decode('utf-8')
items = []
for i in list(set(re.compile('<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
items.append({