run update

j 2018-12-15 01:08:54 +01:00
commit 6806bebb7c
607 changed files with 52543 additions and 31832 deletions

View file

@ -2,6 +2,7 @@ from __future__ import print_function
import json
import re
from six import text_type
from ox.cache import read_url
HEADERS = {
@ -16,9 +17,9 @@ USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
def get_movie_data(title, director):
if isinstance(title, unicode):
if isinstance(title, text_type):
title = title.encode('utf-8')
if isinstance(director, unicode):
if isinstance(director, text_type):
director = director.encode('utf-8')
data = {}
# itunes section (preferred source for link)
@ -45,7 +46,7 @@ def get_movie_data(title, director):
results = js['results']
if results:
url = host + results[0]['location']
if not 'link' in data:
if 'link' not in data:
data['link'] = url
headers = {
'User-Agent': USER_AGENT
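Aside: the hunk above replaces the Python 2-only unicode builtin with six.text_type, which is unicode on Python 2 and str on Python 3, so the same isinstance() check runs on both interpreters. A minimal sketch of that pattern (the sample value is illustrative):

from six import text_type

def ensure_bytes(value):
    # UTF-8 encode text values; byte strings pass through unchanged
    if isinstance(value, text_type):
        value = value.encode('utf-8')
    return value

print(ensure_bytes(u'Am\xe9lie'))  # b'Am\xc3\xa9lie'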

View file

@ -17,7 +17,7 @@ def get(key):
if key in auth:
return auth[key]
print("please add key %s to json file '%s'" % (key, user_auth))
raise Exception,"no key %s found" % key
raise Exception("no key %s found" % key)
def update(key, value):
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
@ -31,4 +31,3 @@ def update(key, value):
f = open(user_auth, "w")
f.write(json.dumps(auth, indent=2))
f.close()
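Aside: besides switching to the Python 3 raise syntax, the hunk leaves the auth helpers' behaviour unchanged: keys live in ~/.ox/auth.json (or the file named by the oxAUTH environment variable). A hedged usage sketch, assuming the module is importable as ox.web.auth and using a made-up key name:

import ox.web.auth as auth  # module path assumed from the repository layout

auth.update('example.apikey', 'YOUR-KEY')  # writes the key into auth.json
print(auth.get('example.apikey'))          # raises Exception("no key ... found") if absent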

View file

@ -8,13 +8,13 @@ from ox.cache import read_url
from ox.html import strip_tags, decode_html
from ox.text import find_re
import imdb
from . import imdb
def get_id(url):
return url.split("/")[-1]
def get_url(id):
return "http://www.criterion.com/films/%s" % id
return "https://www.criterion.com/films/%s" % id
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
'''
@ -28,23 +28,34 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
'''
data = {
"id": id,
"url": get_url(id)
}
try:
html = read_url(data["url"], timeout=timeout, unicode=True)
except:
html = ox.cache.read_url(data["url"], timeout=timeout)
data["number"] = find_re(html, "<li>Spine #(\d+)")
html = read_url(data["url"], timeout=timeout).decode('utf-8', 'ignore')
data["title"] = decode_html(find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>"))
data["number"] = find_re(html, "<b>Spine #(\d+)")
data["title"] = decode_html(find_re(html, "<h1 class=\"header__primarytitle\".*?>(.*?)</h1>"))
data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip()
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
results = find_re(html, '<div class="left_column">(.*?)</div>')
results = re.compile("<li>(.*?)</li>").findall(results)
data["country"] = results[0]
data["year"] = results[1]
results = find_re(html, '<ul class="film-meta-list">(.*?)</ul>')
info = re.compile('<li itemprop="(.*?)".*?>(.*?)</li>', re.DOTALL).findall(results)
info = {k: strip_tags(v).strip() for k, v in info}
if 'director' in info:
data['director'] = info['director']
if 'countryOfOrigin' in info:
data['country'] = [c.strip() for c in decode_html(info['countryOfOrigin']).split(', ')]
if 'inLanguage' in info:
data['language'] = [l.strip() for l in decode_html(info['inLanguage']).split(', ')]
for v in re.compile('<li>(.*?)</li>', re.DOTALL).findall(results):
if 'datePublished' in v:
data['year'] = strip_tags(v).strip()
elif 'duration' in v:
data['duration'] = strip_tags(v).strip()
data["synopsis"] = decode_html(strip_tags(find_re(html,
"<div class=\"content_block last\">.*?<p>(.*?)</p>")))
"<div class=\"product-summary\".*?>.*?<p>(.*?)</p>")))
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
@ -56,47 +67,46 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
data["posters"] = [result]
else:
html_ = read_url(result, unicode=True)
result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = find_re(html_, '//www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = find_re(result, "src=\"(.*?)\"")
if result:
data["posters"] = [result.replace("_w100", "")]
else:
data["posters"] = []
data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
data['posters'] = [p for p in data['posters'] if p]
posters = find_re(html, '<div class="product-box-art".*?>(.*?)</div>')
for poster in re.compile('<img src="(.*?)"').findall(posters):
data['posters'].append(poster)
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result:
data["stills"] = [result]
data["trailers"] = []
else:
data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
data["stills"] = list(filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")]))
data["trailers"] = list(filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")]))
if timeout == ox.cache.cache_timeout:
timeout = -1
if get_imdb:
if get_imdb and 'title' in data and 'director' in data:
# removed year, as "title (year)" may fail to match
data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
return data
def get_ids(page=None):
ids = []
if page:
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
html = read_url(url)
results = re.compile("films/(\d+)").findall(html)
html = read_url("https://www.criterion.com/shop/browse/list?sort=spine_number", unicode=True)
results = re.compile("films/(\d+)-").findall(html)
ids += results
results = re.compile("boxsets/(.*?)\"").findall(html)
for result in results:
html = read_url("https://www.criterion.com/boxsets/" + result, unicode=True)
results = re.compile("films/(\d+)-").findall(html)
ids += results
results = re.compile("boxsets/(.*?)\"").findall(html)
for result in results:
html = read_url("http://www.criterion.com/boxsets/" + result)
results = re.compile("films/(\d+)").findall(html)
ids += results
return set(ids)
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
results = re.compile("\&amp;p=(\d+)\&").findall(html)
pages = max(map(int, results))
for page in range(1, pages):
ids += get_ids(page)
return sorted(set(ids), key=int)
if __name__ == '__main__':
print(get_ids())
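Aside: the new Criterion parser reads the film-meta-list markup via itemprop attributes and a dict comprehension instead of positional <li> entries. A self-contained sketch of that parsing step; the sample HTML is made up, not live criterion.com output:

import re
from ox import strip_tags

html = '''<ul class="film-meta-list">
<li itemprop="director"><a href="/films/1-example">Agnes Varda</a></li>
<li itemprop="countryOfOrigin">France</li>
<li itemprop="inLanguage">French, English</li>
</ul>'''

items = re.compile('<li itemprop="(.*?)".*?>(.*?)</li>', re.DOTALL).findall(html)
info = {k: strip_tags(v).strip() for k, v in items}
print(info['director'])                                     # Agnes Varda
print([l.strip() for l in info['inLanguage'].split(', ')])  # ['French', 'English']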

View file

@ -1,21 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves.urllib.parse import unquote
from ox.cache import read_url
def get_video_url(url):
'''
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
'''
data = read_url(url)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
v = unquote(v).split('@@')[0]
return v
return ''

View file

@ -6,17 +6,25 @@ from six.moves import urllib
import ox
from ox import strip_tags, decode_html
from ox.cache import read_url
import lxml.html
def find(query, timeout=ox.cache.cache_timeout):
"""
Returns tuples with title, url, description
"""
if not isinstance(query, bytes):
query = query.encode('utf-8')
params = urllib.parse.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params
data = read_url(url, timeout=timeout).decode('utf-8')
doc = lxml.html.document_fromstring(data)
results = []
regex = '<a .*?class="large" href="(.+?)">(.*?)</a>.*?<div class="snippet">(.*?)</div>'
for r in re.compile(regex, re.DOTALL).findall(data):
results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
for e in doc.xpath("//a[contains(@class, 'result__a')]"):
url = e.attrib['href']
if 'uddg=' in url:
url = urllib.parse.unquote(url.split('&uddg=')[-1])
title = e.text_content()
description = ''
results.append((title, url, description))
return results
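Aside: the rewritten find() walks DuckDuckGo's HTML results with lxml instead of a regex, and unwraps the redirect URL stored in the uddg query parameter. A self-contained sketch of that extraction; the markup is illustrative, not live output:

import lxml.html
from six.moves import urllib

html = '<a class="result__a" href="/l/?kh=-1&amp;uddg=https%3A%2F%2Fwww.imdb.com%2Ftitle%2Ftt0133093%2F">The Matrix (1999) - IMDb</a>'
doc = lxml.html.document_fromstring(html)
for e in doc.xpath("//a[contains(@class, 'result__a')]"):
    url = e.attrib['href']
    if 'uddg=' in url:
        # the real target hides behind DuckDuckGo's /l/ redirect
        url = urllib.parse.unquote(url.split('&uddg=')[-1])
    print(e.text_content(), url)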

View file

@ -7,7 +7,7 @@ import time
from ox import strip_tags, find_re
from ox.cache import read_url
import google
from . import google
def get_show_url(title):

View file

@ -21,11 +21,11 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
"""
Return max_results tuples with title, url, description
>>> find("The Matrix site:imdb.com", 1)[0][0]
u'The Matrix (1999) - IMDb'
>>> str(find("The Matrix site:imdb.com", 1)[0][0])
'The Matrix (1999) - IMDb'
>>> find("The Matrix site:imdb.com", 1)[0][1]
u'http://www.imdb.com/title/tt0133093/'
>>> str(find("The Matrix site:imdb.com", 1)[0][1])
'http://www.imdb.com/title/tt0133093/'
"""
results = []
offset = 0
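Aside: the doctest rewrites here and in the other modules wrap results in str() because the scrapers return text_type values, whose repr is u'...' on Python 2 but '...' on Python 3; str() gives a single expected output for both. A sketch with an illustrative title:

from six import text_type

title = text_type(u'The Matrix (1999) - IMDb')
print(repr(title))       # u'...' on Python 2, '...' on Python 3
print(repr(str(title)))  # '...' on both (the value here is ASCII-only)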

View file

@ -7,7 +7,7 @@ import time
import unicodedata
from six.moves.urllib.parse import urlencode
from six import string_types
from six import text_type, string_types
from .. import find_re, strip_tags, decode_html
from .. import cache
@ -18,22 +18,95 @@ from . import duckduckgo
from ..utils import datetime
from ..geo import normalize_country_name
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
# https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
headers['X-Forwarded-For'] = '72.21.206.80'
return url, data, headers, timeout, unicode
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
url, data, headers, timeout, unicode = prepare_url(url, data, headers, timeout, valid, unicode)
return cache.read_url(url, data, headers, timeout, unicode=unicode)
def delete_url(url, data=None, headers=cache.DEFAULT_HEADERS):
url, data, headers, timeout, unicode = prepare_url(url, data, headers)
cache.store.delete(url, data, headers)
def get_url(id):
return "http://www.imdb.com/title/tt%s/" % id
def reference_section(id):
return {
'page': 'reference',
're': [
'<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
}
def zebra_list(label, more=None):
conditions = {
'page': 'reference',
're': [
'_label">' + label + '</td>.*?<ul(.*?)</ul>',
'<li.*?>(.*?)</li>'
],
'type': 'list',
}
if more:
conditions['re'] += more
return conditions
def zebra_table(label, more=None, type='string'):
conditions = {
'page': 'reference',
're': [
'_label">' + label + '</td>.*?<td>(.*?)</td>',
],
'type': type,
}
if more:
conditions['re'] += more
return conditions
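Aside: zebra_list() and zebra_table() just build the {page, re, type} condition dicts that SiteParser consumes, so the repeated label/value rows of the reference page need only one line each in the regex table. A small sketch, assuming the helpers are importable from ox.web.imdb as defined above:

from ox.web.imdb import zebra_list  # module path assumed

cond = zebra_list('Country', more=['<a.*?>(.*?)</a>'])
print(cond['page'])     # 'reference'
print(len(cond['re']))  # 3 patterns: the label row, each <li>, the link text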
def parse_aspectratio(value):
r = value
if ':' in value:
r = value.split(':')
n = r[0]
d = r[1].strip().split(' ')[0]
try:
if float(d):
value = str(float(n) / float(d))
else:
value = str(float(n))
except:
print('failed to parse aspect: %s' % value)
else:
value = '.'.join(value.strip().split('.')[:2])
return value
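Aside: parse_aspectratio() normalises the "n : d" strings on the reference page into a plain decimal string. Example values (illustrative), assuming the function is importable from ox.web.imdb:

from ox.web.imdb import parse_aspectratio  # module path assumed

print(parse_aspectratio('2.35 : 1'))  # '2.35'
print(parse_aspectratio('1.85 : 1'))  # '1.85'
print(parse_aspectratio('1.37'))      # '1.37'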
'''
'posterIds': {
'page': 'posters',
're': '/unknown-thumbnail/media/rm(.*?)/tt',
'type': 'list'
},
'''
class Imdb(SiteParser):
'''
>>> Imdb('0068646')['title']
u'The Godfather'
>>> Imdb('0068646')['title'] == text_type(u'The Godfather')
True
>>> Imdb('0133093')['title']
u'The Matrix'
>>> Imdb('0133093')['title'] == text_type(u'The Matrix')
True
'''
regex = {
regex = {
'alternativeTitles': {
'page': 'releaseinfo',
're': [
@ -41,98 +114,49 @@ class Imdb(SiteParser):
"td>(.*?)</td>.*?<td>(.*?)</td>"
],
'type': 'list'
},
'aspectratio': {
'page': 'combined',
're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
'page': 'reference',
're': [
'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
parse_aspectratio,
],
'type': 'float',
},
'budget': {
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
],
'type': 'int'
},
'budget': zebra_table('Budget', more=[
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
], type='int'),
'cast': {
'page': 'combined',
'page': 'reference',
're': [
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
lambda ll: [strip_tags(l) for l in ll]
],
'type': 'list'
},
'cinematographer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Cinematography by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
' <table class="cast_list">(.*?)</table>',
'<td.*?itemprop="actor".*?>.*?>(.*?)</a>.*?<td class="character">(.*?)</td>',
lambda ll: [strip_tags(l) for l in ll] if isinstance(ll, list) else strip_tags(ll)
],
'type': 'list'
},
'cinematographer': reference_section('cinematographers'),
'connections': {
'page': 'movieconnections',
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
'type': 'list'
},
'country': {
'page': 'combined',
're': [
'<div class="info"><h5>Country:</h5>.*?<div class="info">',
#'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
'<a.*?>(.*?)</a>',
],
'type': 'list'
},
'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
'creator': {
'page': 'combined',
'page': '',
're': [
'<h5>Creator.?:</h5>.*?<div class="info-content">(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'director': {
'page': 'combined',
're': [
lambda data: data.split('<b>Series Crew</b>')[0],
'Directed by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'_director': {
'page': 'combined',
're': [
'<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'editor': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Film Editing by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'composer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Original Music by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
'<div class="credit_summary_item">.*?<h4.*?>Creator.?:</h4>(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>',
lambda ll: strip_tags(ll)
],
'type': 'list'
},
'director': reference_section('directors'),
'editor': reference_section('editors'),
'composer': reference_section('composers'),
'episodeTitle': {
'page': 'combined',
're': '<div id="tn15title">.*?<em>(.*?)</em>',
'page': 'reference',
're': '<h3 itemprop="name">(.*?)<',
'type': 'string'
},
'filmingLocations': {
@ -143,71 +167,44 @@ class Imdb(SiteParser):
],
'type': 'list'
},
'genre': {
'page': 'combined',
're': [
'<h5>Genre:</h5>(.*?)<hr',
'<a href="/Sections/Genres/.*?/">(.*?)</a>'
],
'type': 'list'
},
'gross': {
'page': 'business',
're': [
'<h5>Gross</h5>\s*?\$(.*?)<br',
lambda data: find_re(data.replace(',', ''), '\d+')
],
'type': 'int'
},
'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
'gross': zebra_table('Cumulative Worldwide Gross', more=[
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
], type='int'),
'keyword': {
'page': 'keywords',
're': '<a href="/keyword/.*?>(.*?)</a>',
'type': 'list'
},
'language': {
'page': 'combined',
're': [
'<div class="info"><h5>Language:</h5>.*?<div class="info">',
#'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
'<a.*?>(.*?)</a>',
],
'type': 'list'
},
'summary': {
'page': 'plotsummary',
're': '<p class="plotSummary">(.*?)<\/p>',
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
'originalTitle': {
'page': 'releaseinfo',
're': '<td>\(original title\)</td>\s*<td>(.*?)</td>',
'type': 'string'
},
'summary': zebra_table('Plot Summary', more=[
'<p>(.*?)<em'
]),
'posterId': {
'page': 'combined',
're': '/primary-photo/media/rm(.*?)/tt',
'page': 'reference',
're': '<img.*?class="titlereference-primary-image".*?src="(.*?)".*?>',
'type': 'string'
},
'posterIds': {
'page': 'posters',
're': '/unknown-thumbnail/media/rm(.*?)/tt',
'type': 'list'
},
'producer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Produced by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'producer': reference_section('producers'),
'productionCompany': {
'page': 'combined',
'page': 'reference',
're': [
'Production Companies</b><ul>(.*?)</ul>',
'Production Companies.*?<ul(.*?)</ul>',
'<a href="/company/.*?/">(.*?)</a>'
],
'type': 'list'
},
'rating': {
'page': 'combined',
're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
'page': 'reference',
're': [
'<div class="ipl-rating-star ">(.*?)</div>',
'ipl-rating-star__rating">([\d,.]+?)</span>',
],
'type': 'float'
},
'releasedate': {
@ -218,64 +215,55 @@ class Imdb(SiteParser):
],
'type': 'list'
},
'reviews': {
'page': 'externalreviews',
're': [
'<ol>(.*?)</ol>',
'<li><a href="(http.*?)".*?>(.*?)</a></li>'
],
'type': 'list'
},
'runtime': {
'page': 'combined',
're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
'type': 'string'
},
'color': {
'page': 'combined',
're': [
'<h5>Color:</h5><div class="info-content">(.*?)</div>',
'<a.*?>(.*?)</a>'
],
'type': 'list'
},
'sound': {
'page': 'combined',
're': [
'<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
'<a.*?>(.*?)</a>'
],
'type': 'list'
},
#FIXME using some /offsite/ redirect now
#'reviews': {
# 'page': 'externalreviews',
# 're': [
# '<ul class="simpleList">(.*?)</ul>',
# '<li>.*?<a href="(http.*?)".*?>(.*?)</a>.*?</li>'
# ],
# 'type': 'list'
#},
'runtime': zebra_list('Runtime'),
'color': zebra_list('Color', more=[
'<a.*?>([^(<]+)',
lambda r: r[0] if isinstance(r, list) else r,
strip_tags
]),
'sound': zebra_list('Sound Mix', more=[
'<a.*?>([^(<]+)',
lambda r: r[0] if isinstance(r, list) else r,
strip_tags
]),
'season': {
'page': 'combined',
'page': 'reference',
're': [
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
'\(Season (\d+), Episode \d+\)',
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
'Season (\d+)',
],
'type': 'int'
},
'episode': {
'page': 'combined',
'page': 'reference',
're': [
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
'\(Season \d+, Episode (\d+)\)',
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
'Episode (\d+)',
],
'type': 'int'
},
'series': {
'page': 'combined',
're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
'page': 'reference',
're': '<h4 itemprop="name">.*?<a href="/title/tt(\d{7})',
'type': 'string'
},
'isSeries': {
'page': 'combined',
're': '<span class="tv-extra">(TV series|TV mini-series) ',
'page': 'reference',
're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
'type': 'string'
},
'title': {
'page': 'combined',
're': '<h1>(.*?) <span>',
'page': 'releaseinfo',
're': 'h3 itemprop="name">.*?>(.*?)</a>',
'type': 'string'
},
'trivia': {
@ -287,38 +275,45 @@ class Imdb(SiteParser):
'type': 'list',
},
'votes': {
'page': 'combined',
're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
'page': 'reference',
're': [
'class="ipl-rating-star__total-votes">\((.*?)\)',
lambda r: r.replace(',', '')
],
'type': 'string'
},
'writer': {
'page': 'combined',
'writer': reference_section('writers'),
'year': {
'page': 'reference',
're': [
lambda data: data.split('Series Crew')[0],
'Writing credits</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
'<span class="titlereference-title-year">(.*?)</span>',
'<a.*?>(\d+)',
],
'type': 'int'
},
'credits': {
'page': 'fullcredits',
're': [
lambda data: data.split('<h4'),
'>(.*?)</h4>.*?(<table.*?</table>)',
lambda data: [d for d in data if d]
],
'type': 'list'
},
'year': {
'page': 'combined',
're': '="og:title" content="[^"]*?\((\d{4}).*?"',
'type': 'int'
}
}
def read_url(self, url, timeout):
if not url in self._cache:
if url not in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url]
def __init__(self, id, timeout=-1):
#use akas.imdb.com to always get original title:
#http://www.imdb.com/help/show_leaf?titlelanguagedisplay
self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
# use akas.imdb.com to always get original title:
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
super(Imdb, self).__init__(timeout)
url = self.baseUrl + 'combined'
url = self.baseUrl + 'reference'
page = self.read_url(url, timeout=-1)
if '<title>IMDb: Page not found</title>' in page \
or 'The requested URL was not found on our server.' in page:
@ -332,119 +327,15 @@ class Imdb(SiteParser):
isinstance(self['alternativeTitles'][0], string_types):
self['alternativeTitles'] = [self['alternativeTitles']]
for key in ('country', 'genre', 'language', 'sound', 'color'):
if key in self:
self[key] = [x[0] if len(x) == 1 and isinstance(x, list) else x for x in self[key]]
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#normalize country names
if 'country' in self:
self['country'] = [normalize_country_name(c) or c for c in self['country']]
if 'sound' in self:
self['sound'] = list(set(self['sound']))
types = {}
stop_words = [
'alternative spelling',
'alternative title',
'alternative transliteration',
'closing credits title',
'complete title',
'IMAX version',
'informal short title',
'International (Spanish title)',
'Japan (imdb display title)',
'longer version',
'new title',
'original subtitled version',
'pre-release title',
'promotional abbreviation',
'recut version',
'reissue title',
'restored version',
'script title',
'short title',
'(subtitle)',
'TV title',
'working title',
'World-wide (Spanish title)',
]
#ignore english japanese titles
#for movies that are not only from japan
if ['Japan'] != self.get('country', []):
stop_words += [
'Japan (English title)'
]
for t in self.get('alternativeTitles', []):
for type in t[0].split('/'):
type = type.strip()
stop_word = False
for key in stop_words:
if key in type:
stop_word = True
break
if not stop_word:
if not type in types:
types[type] = []
types[type].append(t[1])
titles = {}
for type in types:
for title in types[type]:
if not title in titles:
titles[title] = []
titles[title].append(type)
def select_title(type):
title = types[type][0]
count = 0
if len(types[type]) > 1:
for t in types[type]:
if len(titles[t]) > count:
count = len(titles[t])
title = t
return title
#FIXME: does work in python2.6, possible to import from __future__?
#types = {type: select_title(type) for type in types}
_types = {}
for type in types:
_types[type] = select_title(type)
types = _types
regexps = [
"^.+ \(imdb display title\) \(English title\)$",
"^USA \(imdb display title\)$",
"^International \(English title\)$",
"^International \(English title\)$",
"^UK \(imdb display title\)$",
"^International \(.+\) \(English title\)$",
"^World-wide \(English title\)$",
]
if 'Hong Kong' in self.get('country', []):
regexps += [
"Hong Kong \(English title\)"
]
english_countries = (
'USA', 'UK', 'United States', 'United Kingdom',
'Australia', 'New Zealand'
)
if not filter(lambda c: c in english_countries, self.get('country', [])):
regexps += [
"^[^(]+ \(English title\)$",
"^.+ \(.+\) \(English title\)$",
"^USA$",
"^UK$",
"^USA \(.+\)$",
"^UK \(.+\)$",
"^Australia \(.+\)$",
"World-wide \(English title\)",
"\(literal English title\)",
"^International \(.+ title\)$",
"^International \(.+\) \(.+ title\)$",
]
for regexp in regexps:
for type in types:
if re.compile(regexp).findall(type):
#print types[type], type
self['internationalTitle'] = types[type]
break
if 'internationalTitle' in self:
break
def cleanup_title(title):
if title.startswith('"') and title.endswith('"'):
@ -454,44 +345,43 @@ class Imdb(SiteParser):
title = re.sub('\(\#[.\d]+\)', '', title)
return title.strip()
for t in ('title', 'internationalTitle'):
for t in ('title', 'originalTitle'):
if t in self:
self[t] = cleanup_title(self[t])
if 'internationalTitle' in self and \
self.get('title', '').lower() == self['internationalTitle'].lower():
del self['internationalTitle']
if 'alternativeTitles' in self:
alt = {}
for t in self['alternativeTitles']:
title = cleanup_title(t[1])
if title not in (self.get('title'), self.get('internationalTitle')):
if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
if title not in alt:
alt[title] = []
for c in t[0].split('/'):
if not '(working title)' in c:
c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
if c:
alt[title].append(c)
for cleanup in ('International', '(working title)', 'World-wide'):
c = c.replace(cleanup, '')
c = c.split('(')[0].strip()
if c:
alt[title].append(c)
self['alternativeTitles'] = []
for t in sorted(alt, key=lambda a: sorted(alt[a])):
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
countries = sorted(set([normalize_country_name(c) or c for c in alt[t]]))
self['alternativeTitles'].append((t, countries))
if not self['alternativeTitles']:
del self['alternativeTitles']
if 'internationalTitle' in self:
self['originalTitle'] = self['title']
self['title'] = self.pop('internationalTitle')
if 'runtime' in self and self['runtime']:
if 'min' in self['runtime']: base=60
else: base=1
if isinstance(self['runtime'], list):
self['runtime'] = self['runtime'][0]
if 'min' in self['runtime']:
base = 60
else:
base = 1
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
if 'runtime' in self and not self['runtime']:
del self['runtime']
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'sound' in self:
self['sound'] = list(sorted(set(self['sound'])))
if 'cast' in self:
if isinstance(self['cast'][0], string_types):
@ -499,6 +389,7 @@ class Imdb(SiteParser):
self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c):
c = c.replace('(uncredited)', '').strip()
c = re.sub('\s+', ' ', c)
return c
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
for x in self['cast']]
@ -522,18 +413,8 @@ class Imdb(SiteParser):
return r
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
for key in ('country', 'genre'):
if key in self:
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999
if '_director' in self:
if 'series' in self or 'isSeries' in self:
self['creator'] = self.pop('_director')
else:
del self['_director']
if 'isSeries' in self:
del self['isSeries']
self['isSeries'] = True
@ -555,7 +436,7 @@ class Imdb(SiteParser):
if 'director' in self:
self['episodeDirector'] = self['director']
if not 'creator' in series and 'director' in series:
if 'creator' not in series and 'director' in series:
series['creator'] = series['director']
if len(series['creator']) > 10:
series['creator'] = series['director'][:1]
@ -566,7 +447,7 @@ class Imdb(SiteParser):
if 'year' in series:
self['seriesYear'] = series['year']
if not 'year' in self:
if 'year' not in self:
self['year'] = series['year']
if 'year' in self:
@ -620,11 +501,48 @@ class Imdb(SiteParser):
self['summary'] = self['summary'][0]
self['summary'] = self['summary'].split('</p')[0].strip()
if 'credits' in self:
credits = [
[
strip_tags(d[0].replace(' by', '')).strip(),
[
[
strip_tags(x[0]).strip(),
[t.strip().split(' (')[0].strip() for t in x[2].split(' / ')]
]
for x in
re.compile('<td class="name">(.*?)</td>.*?<td>(.*?)</td>.*?<td class="credit">(.*?)</td>', re.DOTALL).findall(d[1])
]
] for d in self['credits'] if d
]
credits = [c for c in credits if c[1]]
self['credits'] = []
self['lyricist'] = []
self['singer'] = []
for department, crew in credits:
department = department.replace('(in alphabetical order)', '').strip()
for c in crew:
name = c[0]
roles = c[1]
self['credits'].append({
'name': name,
'roles': roles,
'deparment': department
})
if department == 'Music Department':
if 'lyricist' in roles:
self['lyricist'].append(name)
if 'playback singer' in roles:
self['singer'].append(name)
if not self['credits']:
del self['credits']
class ImdbCombined(Imdb):
def __init__(self, id, timeout=-1):
_regex = {}
for key in self.regex:
if self.regex[key]['page'] in ('combined', 'releaseinfo'):
if self.regex[key]['page'] in ('releaseinfo', 'reference'):
_regex[key] = self.regex[key]
self.regex = _regex
super(ImdbCombined, self).__init__(id, timeout)
@ -640,25 +558,25 @@ def get_movie_by_title(title, timeout=-1):
If there is more than one film with that title for the year
Title (Year/I)
>>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')
u'1602860'
>>> str(get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}'))
'1602860'
>>> get_movie_by_title(u'The Matrix (1999)')
u'0133093'
>>> str(get_movie_by_title(u'The Matrix (1999)'))
'0133093'
>>> get_movie_by_title(u'Little Egypt (1951)')
u'0043748'
>>> str(get_movie_by_title(u'Little Egypt (1951)'))
'0043748'
>>> str(get_movie_by_title(u'Little Egypt (1897/I)'))
'0214882'
>>> get_movie_by_title(u'Little Egypt (1897/I)')
u'0214882'
>>> get_movie_by_title(u'Little Egypt')
None
>>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
u'0866567'
>>> str(get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}'))
'0866567'
'''
params = {'s':'tt','q': title}
params = {'s': 'tt', 'q': title}
if not isinstance(title, bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
@ -676,20 +594,21 @@ def get_movie_by_title(title, timeout=-1):
def get_movie_id(title, director='', year='', timeout=-1):
'''
>>> get_movie_id('The Matrix')
u'0133093'
>>> str(get_movie_id('The Matrix'))
'0133093'
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
u'0060304'
>>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard'))
'0060304'
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
u'0060304'
>>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967'))
'0060304'
>>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
u'0179214'
>>> str(get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", u'Jean-Luc Godard'))
'0179214'
>>> str(get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", u'Jean-Luc Godard'))
'0179214'
>>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
u'0179214'
'''
imdbId = {
(u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
@ -729,7 +648,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
}.get((title, director), None)
if imdbId:
return imdbId
params = {'s':'tt','q': title}
params = {'s': 'tt', 'q': title}
if director:
params['q'] = u'"%s" %s' % (title, director)
if year:
@ -756,8 +675,8 @@ def get_movie_id(title, director='', year='', timeout=-1):
if results:
return results[0]
#print (title, director), ": '',"
#print google_query
#print((title, director), ": '',")
#print(google_query)
#results = google.find(google_query, timeout=timeout)
results = duckduckgo.find(google_query, timeout=timeout)
if results:
@ -772,15 +691,12 @@ def get_movie_poster(imdbId):
'''
>>> get_movie_poster('0133093')
'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
>>> get_movie_poster('0994352')
'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
'''
info = ImdbCombined(imdbId)
if 'posterId' in info:
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url).decode('utf-8', 'ignore')
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
poster = info['posterId']
if '@._V' in poster:
poster = poster.split('@._V')[0] + '@.jpg'
return poster
elif 'series' in info:
return get_movie_poster(info['series'])
@ -793,7 +709,7 @@ def get_episodes(imdbId, season=None):
url += '?season=%d' % season
data = cache.read_url(url)
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
else:
data = cache.read_url(url)
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
@ -804,9 +720,11 @@ def get_episodes(imdbId, season=None):
def max_votes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = cache.read_url(url)
votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
data = cache.read_url(url).decode('utf-8', 'ignore')
votes = max([
int(v.replace(',', ''))
for v in re.compile('<span name="nv" data-value="(\d+)"').findall(data)
])
return votes
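Aside: max_votes() now reads the raw count from the data-value attribute of the advanced-search rows instead of a formatted table cell. A self-contained sketch of that extraction; the markup is illustrative, not live IMDb output:

import re

data = '<span class="text-muted">Votes:</span> <span name="nv" data-value="1856996">1,856,996</span>'
votes = max(int(v) for v in re.compile(r'<span name="nv" data-value="(\d+)"').findall(data))
print(votes)  # 1856996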
def guess(title, director='', timeout=-1):

View file

@ -3,26 +3,34 @@
from __future__ import print_function
import re
from ox.cache import read_url
import ox.cache
from ox.html import strip_tags
from ox.text import find_re
def read_url(url, timeout=ox.cache.cache_timeout):
data = ox.cache.read_url(url, timeout=timeout)
try:
data = data.decode('utf-8')
except UnicodeDecodeError:
data = data.decode('latin-1')
return data
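Aside: the new read_url wrapper decodes with UTF-8 first and falls back to Latin-1, which never raises, so a page with a mislabelled charset still comes back as text. The pattern in isolation, with illustrative byte strings:

def decode_best_effort(raw):
    # try UTF-8 first; Latin-1 accepts any byte sequence
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        return raw.decode('latin-1')

print(decode_best_effort(b'Am\xc3\xa9lie'))  # UTF-8 input
print(decode_best_effort(b'Am\xe9lie'))      # Latin-1 input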
def get_data(id):
'''
>>> get_data('1991/silence_of_the_lambs')['imdbId']
u'0102926'
>>> str(get_data('1991/silence_of_the_lambs')['imdbId'])
'0102926'
>>> get_data('1991/silence_of_the_lambs')['posters'][0]
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
>>> str(get_data('1991/silence_of_the_lambs')['posters'][0])
'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
>>> get_data('1991/silence_of_the_lambs')['url']
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
>>> str(get_data('1991/silence_of_the_lambs')['url'])
'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
'''
data = {
'url': get_url(id)
}
html = read_url(data['url'], unicode=True)
html = read_url(data['url'])
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
if not data['imdbId']:
data['imdbId'] = _id_map.get(id, '')
@ -37,16 +45,15 @@ def get_data(id):
for result in results:
result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
html = read_url(url)
result = find_re(html, '<a href = (\w*?_xlg.html)')
if result:
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
html = read_url(url)
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
data['posters'].append(poster)
return data
def get_id(url):
@ -60,27 +67,29 @@ def get_id(url):
id = '%s/%s' % (year, '_'.join(split))
return id
def get_ids(page=None):
ids = []
if page:
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1)
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
for result in results:
url = 'http://impawards.com/%s' % result
ids.append(get_id(url))
return set(ids)
#get all
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
# get all
html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60)
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
for page in range(pages, 0, -1):
for id in get_ids(page):
if not id in ids:
if id not in ids:
ids.append(id)
return ids
def get_url(id):
url = u"http://www.impawards.com/%s.html" % id
html = read_url(url, unicode=True)
html = read_url(url)
if find_re(html, "No Movie Posters on This Page"):
url = u"http://www.impawards.com/%s_ver1.html" % id
return url

View file

@ -28,22 +28,32 @@ def get_show_url(title):
def get_data(url):
data = read_url(url, unicode=True)
doc = document_fromstring(data)
score = filter(lambda s: s.attrib.get('property') == 'v:average',
doc.xpath('//span[@class="score_value"]'))
score = [s for s in doc.xpath('//span[@class="score_value"]')
if s.attrib.get('property') == 'v:average']
if score:
score = int(score[0].text)
else:
score = -1
authors = [a.text
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')]
sources = [d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')]
reviews = [d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')]
scores = [int(d.text.strip())
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')]
urls = [a.attrib['href']
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')]
authors = [
a.text
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')
]
sources = [
d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')
]
reviews = [
d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')
]
scores = [
int(d.text.strip())
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')
]
urls = [
a.attrib['href']
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')
]
metacritics = []
for i in range(len(authors)):
@ -54,7 +64,7 @@ def get_data(url):
'quote': strip_tags(reviews[i]).strip(),
'score': scores[i],
})
return {
'critics': metacritics,
'id': get_id(url),

View file

@ -1,121 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from six.moves.urllib.parse import quote
from ox.cache import read_url
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, int_value, normalize_newlines
from ox.normalize import normalize_imdbid
import ox
from torrent import Torrent
def _parse_results_page(data, max_results=10):
results=[]
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentDate = row[0]
torrentExtra = row[1]
torrentId = row[2]
torrentTitle = decode_html(row[3]).strip()
torrentLink = "http://www.mininova.org/tor/" + torrentId
privateTracker = 'priv.gif' in torrentExtra
if not privateTracker:
results.append((torrentTitle, torrentLink, ''))
return results
def find_movie(query=None, imdb=None, max_results=10):
'''search for torrents on mininova
'''
if imdb:
url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
else:
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
data = read_url(url, unicode=True)
return _parse_results_page(data, max_results)
def get_id(mininovaId):
mininovaId = unicode(mininovaId)
d = find_re(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
if len(mininovaId) == 1:
return mininovaId[0]
else:
return mininovaId[-1]
def exists(mininovaId):
mininovaId = get_id(mininovaId)
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
if not data or 'Torrent not found...' in data:
return False
if 'tracker</a> of this torrent requires registration.' in data:
return False
return True
def get_data(mininovaId):
_key_map = {
'by': u'uploader',
}
mininovaId = get_id(mininovaId)
torrent = dict()
torrent[u'id'] = mininovaId
torrent[u'domain'] = 'mininova.org'
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
if '<h1>Torrent not found...</h1>' in data:
return None
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = get_torrent_info(t)
return torrent
class Mininova(Torrent):
'''
>>> Mininova('123')
{}
>>> Mininova('1072195')['infohash']
'72dfa59d2338e4a48c78cec9de25964cddb64104'
'''
def __init__(self, mininovaId):
self.data = get_data(mininovaId)
if not self.data:
return
Torrent.__init__(self)
ratio = self.data['share ratio'].split(',')
self['seeder'] = -1
self['leecher'] = -1
if len(ratio) == 2:
val = int_value(ratio[0].replace(',','').strip())
if val:
self['seeder'] = int(val)
val = int_value(ratio[1].replace(',','').strip())
if val:
self['leecher'] = int(val)
val = int_value(self.data['downloads'].replace(',','').strip())
if val:
self['downloaded'] = int(val)
else:
self['downloaded'] = -1
published = self.data['added on']
published = published.split(' +')[0]
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")

View file

@ -2,12 +2,12 @@
# vi:si:et:sw=4:sts=4:ts=4
import re
import feedparser
from ox.cache import read_url
from ox import find_re, strip_tags
from ox.iso import langCode2To3, langTo3Code
def find_subtitles(imdb, parts = 1, language = "eng"):
import feedparser
if len(language) == 2:
language = langCode2To3(language)
elif len(language) != 3:

View file

@ -32,7 +32,7 @@ def get_data(url):
r['summary'] = get_og(data, 'description')
meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
meter = filter(lambda m: m[1].isdigit(), meter)
meter = [m for m in meter if m[1].isdigit()]
if meter:
r['tomatometer'] = meter[0][1]
r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
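Aside: the filter() to list-comprehension change is not only cosmetic. On Python 3, filter() returns a lazy object that is always truthy, so the following "if meter:" check would pass even with zero matches; a list behaves the same on both versions. A tiny sketch:

empty = filter(lambda m: m[1].isdigit(), [])
print(bool(empty))        # True on Python 3 (filter object), False on Python 2 (empty list)
print(bool(list(empty)))  # False on both once materialised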

View file

@ -33,7 +33,7 @@ class SiteParser(dict):
return "%s%s" % (self.baseUrl, page)
def read_url(self, url, timeout):
if not url in self._cache:
if url not in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url]

View file

@ -95,7 +95,7 @@ def format_subsection(string):
'ussports': 'US-Sports',
'wunderbar': 'wunderBAR'
}
if subsection.has_key(string):
if string in subsection:
return subsection[string].replace(u'\xc3', 'ae')
return string[:1].upper() + string[1:]
@ -219,8 +219,8 @@ def archive_news():
else:
dMax = days[m]
for d in range(dMax, 0, -1):
print('getNews(%d, %d, %d)' % (y, m, d))
news = getNews(y, m ,d)
print('get_news(%d, %d, %d)' % (y, m, d))
news = get_news(y, m, d)
for new in news:
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
if not os.path.exists(dirname):
@ -230,7 +230,7 @@ def archive_news():
else:
filename = dirname + '/' + new['url'] + '.json'
if not os.path.exists(filename) or True:
data = json.dumps(new, ensure_ascii = False)
data = json.dumps(new, ensure_ascii=False)
f = open(filename, 'w')
f.write(data)
f.close()
@ -253,7 +253,7 @@ def archive_news():
string = strings[3]
if len(strings) == 6:
string += '/' + strings[4]
if not count.has_key(string):
if string not in count:
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
else:
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
@ -269,12 +269,12 @@ if __name__ == '__main__':
# spiegel = Spiegel(2008, 8)
# print(spiegel.getContents())
# news = News(2001, 9, 10)
# output(news.getNews())
# output(news.get_news())
'''
x = []
for d in range(10, 30):
print('2/%d' % d)
news = getNews(2008, 2, d)
news = get_news(2008, 2, d)
for new in news:
strings = new['url'].split('/')
string = format_section(strings[3])

View file

@ -21,10 +21,10 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
Return max_results tuples with title, url, description
>>> find("The Matrix site:imdb.com", 1)[0][0]
u'The Matrix (1999) - IMDb'
'The Matrix (1999) - IMDb'
>>> find("The Matrix site:imdb.com", 1)[0][1]
u'http://www.imdb.com/title/tt0133093/'
'http://www.imdb.com/title/tt0133093/'
"""
results = []
url = 'https://eu1.startpage.com/do/search?nosteeraway=1&abp=1&language=english&cmd=process_search&query=%s&x=0&y=0&cat=web&engine0=v1all' % quote_plus(query)

View file

@ -9,11 +9,10 @@ from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normal
from ox.normalize import normalize_imdbid
import ox
from torrent import Torrent
cache_timeout = 24*60*60 # cache search only for 24 hours
season_episode = re.compile("S..E..", re.IGNORECASE)
baseurl = "https://thepiratebay.org/"
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
@ -25,7 +24,7 @@ def find_movies(query=None, imdb=None, max_results=10):
if imdb:
query = "tt" + normalize_imdbid(imdb)
results = []
next = ["https://thepiratebay.se/search/%s/0/3/200" % quote(query), ]
next = [baseurl + "hsearch/%s/0/3/200" % quote(query), ]
page_count = 1
while next and page_count < 4:
page_count += 1
@ -33,12 +32,12 @@ def find_movies(query=None, imdb=None, max_results=10):
if not url.startswith('http'):
if not url.startswith('/'):
url = "/" + url
url = "https://thepiratebay.se" + url
url = baseurl + url
data = read_url(url, timeout=cache_timeout, unicode=True)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
torrentLink = "https://thepiratebay.se" + row[1]
torrentLink = baseurl + row[1]
torrentTitle = decode_html(row[2])
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
if torrentType in ['201']:
@ -61,7 +60,7 @@ def get_id(piratebayId):
def exists(piratebayId):
piratebayId = get_id(piratebayId)
return ox.net.exists("https://thepiratebay.se/torrent/%s" % piratebayId)
return ox.net.exists(baseurl + "torrent/%s" % piratebayId)
def get_data(piratebayId):
_key_map = {
@ -75,7 +74,7 @@ def get_data(piratebayId):
torrent = dict()
torrent[u'id'] = piratebayId
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'https://thepiratebay.se/torrent/%s' % piratebayId
torrent[u'comment_link'] = baseurl + 'torrent/%s' % piratebayId
data = read_url(torrent['comment_link'], unicode=True)
torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
@ -84,33 +83,15 @@ def get_data(piratebayId):
torrent[u'title'] = decode_html(torrent[u'title']).strip()
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
title = quote(torrent['title'].encode('utf-8'))
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
torrent[u'magent_link']= find_re(data, '"(magnet:.*?)"')
torrent[u'infohash'] = find_re(torrent[u'magent_link'], "btih:(.*?)&")
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
if not '<' in key:
torrent[key] = value
torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = get_torrent_info(t)
return torrent
class Thepiratebay(Torrent):
'''
>>> Thepiratebay('123')
{}
>>> Thepiratebay('3951349')['infohash']
'4e84415d36ed7b54066160c05a0b0f061898d12b'
'''
def __init__(self, piratebayId):
self.data = get_data(piratebayId)
if not self.data:
return
Torrent.__init__(self)
published = self.data['uploaded']
published = published.replace(' GMT', '').split(' +')[0]
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")

View file

@ -1,37 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox import int_value
class Torrent(dict):
'''
>>> Torrent()
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
'''
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
_dict_keys = ('torrent_info', )
_list_keys = ()
data = {'torrent_info': {}}
def __init__(self):
for key in self._string_keys:
self[key] = self.data.get(key, u'')
for key in self._dict_keys:
self[key] = self.data.get(key, {})
for key in self._list_keys:
self[key] = self.data.get(key, [])
for key in self._int_keys:
value = self.data.get(key, -1)
if not isinstance(value, int):
value = int(int_value(value))
self[key] = value
self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1)
self['announce'] = self.data['torrent_info'].get('announce', '')
if 'files' in self.data['torrent_info']:
self['files'] = len(self.data['torrent_info']['files'])
else:
self['files'] = 1

View file

@ -116,7 +116,7 @@ def get_movie_data(wikipedia_url):
def get_image_url(name):
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
data = read_url(url)
data = read_url(url).decode('utf-8')
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
if not url:
url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
@ -145,7 +145,7 @@ def find(query, max_results=10):
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
data = read_url(url)
if not data:
data = read_url(url, timeout=0)
data = read_url(url, timeout=0)
result = json.loads(data.decode('utf-8'))
results = []
if result and 'query' in result:

View file

@ -7,7 +7,6 @@ import re
from xml.dom.minidom import parseString
import json
import feedparser
import ox
from ox.cache import read_url, cache_timeout
@ -27,15 +26,15 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout):
"""
fmt = None
if format == '4k':
fmt=38
fmt = 38
elif format == '1080p':
fmt=37
fmt = 37
elif format == '720p':
fmt=22
fmt = 22
elif format == 'mp4':
fmt=18
fmt = 18
elif format == 'high':
fmt=35
fmt = 35
elif format == 'webm':
streams = videos(youtubeId, 'webm')
return streams[max(streams.keys())]['url']
@ -46,14 +45,14 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout):
def get_video_info(id):
eurl = get_url(id)
data = read_url(eurl)
data = read_url(eurl).decode('utf-8')
t = re.compile('\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
if t:
t = t[0]
else:
raise IOError
url = "http://www.youtube.com/get_video_info?&video_id=%s&el=$el&ps=default&eurl=%s&hl=en_US&t=%s" % (id, quote(eurl), quote(t))
data = read_url(url)
data = read_url(url).decode('utf-8')
info = {}
for part in data.split('&'):
key, value = part.split('=')
@ -61,6 +60,7 @@ def get_video_info(id):
return info
def find(query, max_results=10, offset=1, orderBy='relevance'):
import feedparser
query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
data = read_url(url)
@ -104,14 +104,20 @@ def info(id, timeout=cache_timeout):
info['license'] = match[0].strip()
info['license'] = re.sub('<.+?>', '', info['license']).strip()
subs = subtitles(id, timeout)
if subs:
info['subtitles'] = subs
return info
def subtitles(id, timeout=cache_timeout):
url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
data = read_url(url, timeout=timeout)
xml = parseString(data)
languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
subtitles = {}
if languages:
info['subtitles'] = {}
for language in languages:
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind" % (id, language)
data = read_url(url, timeout=timeout)
xml = parseString(data)
subs = []
@ -128,8 +134,8 @@ def info(id, timeout=cache_timeout):
'out': end,
'value': ox.decode_html(text),
})
info['subtitles'][language] = subs
return info
subtitles[language] = subs
return subtitles
def videos(id, format=''):
stream_type = {
@ -154,7 +160,7 @@ def videos(id, format=''):
return streams
def playlist(url):
data = read_url(url)
data = read_url(url).decode('utf-8')
items = []
for i in list(set(re.compile('<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
items.append({