run update

This commit is contained in:
j 2018-12-15 01:08:54 +01:00
commit 6806bebb7c
607 changed files with 52543 additions and 31832 deletions

View file

@ -1 +1 @@
VERSION="2.3.b'786'"
VERSION="2.3.895"

View file

@ -1,13 +1,18 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2011
from __future__ import with_statement
from __future__ import print_function
from types import MethodType
import gzip
import os
import shutil
import sys
import time
from six.moves import http_cookiejar as cookielib
import gzip
from six import BytesIO, PY2
from six.moves import urllib
from types import MethodType
from six.moves.urllib.parse import urlparse
from . import __version__
from .utils import json
@ -15,6 +20,8 @@ from .form import MultiPartForm
__all__ = ['getAPI', 'API']
CHUNK_SIZE = 1024*1024*5
def getAPI(url, cj=None):
    """Return an API client for *url*.

    *cj* is an optional cookie jar handed through to the client.
    """
    client = API(url, cj)
    return client
@ -101,7 +108,7 @@ class API(object):
result = result.decode('utf-8')
result = json.loads(result)
except:
result = {'status':{}}
result = {'status': {}}
result['status']['code'] = e.code
result['status']['text'] = str(e)
return result
@ -123,3 +130,112 @@ class API(object):
form.add_field('data', json.dumps(data))
return self._json_request(self.url, form)
def save_url(self, url, filename, overwrite=False):
    """Download *url* to *filename* using the API's opener.

    Skips the download when *filename* already exists unless
    *overwrite* is true.  The body is streamed to ``filename + '.tmp'``
    and only moved into place after it was fully read, so an aborted
    download never leaves a truncated target file behind.
    """
    # Stream in 16 KiB chunks to keep memory bounded.
    chunk_size = 16 * 1024
    if not os.path.exists(filename) or overwrite:
        dirname = os.path.dirname(filename)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        # NOTE(review): Request(..., method=...) exists only on Python 3;
        # this module otherwise supports Python 2 via six — confirm.
        request = urllib.request.Request(url, method='GET')
        tmpname = filename + '.tmp'
        with open(tmpname, 'wb') as fd:
            # self._opener carries the client's cookie/auth state.
            u = self._opener.open(request)
            for chunk in iter(lambda: u.read(chunk_size), b''):
                fd.write(chunk)
        shutil.move(tmpname, filename)
def upload_chunks(self, url, filename, data=None):
    """Upload *filename* to the server in CHUNK_SIZE pieces.

    First POSTs *data* (extra form fields) to *url*; the response is
    expected to contain an 'uploadUrl' and, when resuming an earlier
    upload, an 'offset'.  Chunks are then POSTed to the upload URL,
    retrying failed chunks every 5 seconds, until the file was sent.

    Returns the server-side id of the upload (or True when none is
    returned) on success, False on failure.
    """
    form = MultiPartForm()
    if data:
        for key in data:
            form.add_field(key, data[key])
    # Initial request; the response tells us where to send the chunks.
    data = self._json_request(url, form)

    def full_url(path):
        # Resolve a server-relative upload path against *url*.
        if path.startswith('/'):
            u = urlparse(url)
            path = '%s://%s%s' % (u.scheme, u.netloc, path)
        return path

    if 'uploadUrl' in data:
        uploadUrl = full_url(data['uploadUrl'])
        f = open(filename, 'rb')  # NOTE(review): never closed explicitly
        fsize = os.stat(filename).st_size
        done = 0
        if 'offset' in data and data['offset'] < fsize:
            # Resume: skip the part the server already has.
            done = data['offset']
            f.seek(done)
            resume_offset = done  # NOTE(review): assigned but never read
        else:
            resume_offset = 0
        chunk = f.read(CHUNK_SIZE)
        fname = os.path.basename(filename)
        if not isinstance(fname, bytes):
            fname = fname.encode('utf-8')
        while chunk:
            form = MultiPartForm()
            form.add_file('chunk', fname, chunk)
            if len(chunk) < CHUNK_SIZE or f.tell() == fsize:
                # Final chunk: tell the server the upload is complete.
                form.add_field('done', '1')
            form.add_field('offset', str(done))
            try:
                data = self._json_request(uploadUrl, form)
            except KeyboardInterrupt:
                print("\ninterrupted by user.")
                sys.exit(1)
            except:
                # Best-effort retry: mark this attempt failed and loop
                # again with the same chunk after a short pause.
                print("uploading chunk failed, will try again in 5 seconds\r", end='')
                sys.stdout.flush()
                data = {'result': -1}
                time.sleep(5)
            if data and 'status' in data:
                if data['status']['code'] == 403:
                    print("login required")
                    return False
                if data['status']['code'] != 200:
                    print("request returned error, will try again in 5 seconds")
                    # NOTE(review): DEBUG is not defined in this fragment —
                    # confirm it exists at module level or this raises NameError.
                    if DEBUG:
                        print(data)
                    time.sleep(5)
            if data and data.get('result') == 1:
                # Chunk accepted: advance, resyncing with the server's
                # offset when it disagrees with ours.
                done += len(chunk)
                if data.get('offset') not in (None, done):
                    print('server offset out of sync, continue from', data['offset'])
                    done = data['offset']
                    f.seek(done)
                chunk = f.read(CHUNK_SIZE)
        if data and 'result' in data and data.get('result') == 1:
            return data.get('id', True)
        else:
            return False
    return False
def signin(url):
    """Interactively sign in to the API at *url* and return the client.

    *url* may be a full API URL or a bare hostname, in which case
    'https://<host>/api/' is used.  Credentials are read from the
    ox.web.auth store; when that lookup fails, the user is prompted on
    the terminal and the entered credentials are saved after a
    successful sign-in.  Exits the process when the server reports
    errors.
    """
    import sys
    from getpass import getpass
    from .web import auth
    if not url.startswith('http'):
        site = url
        url = 'https://%s/api/' % url
    else:
        site = url.split('/')[2]
    api = API(url)
    update = False
    try:
        credentials = auth.get(site)
    except:
        # No stored credentials: prompt and remember to save them.
        credentials = {}
        print('Please provide your username and password for %s:' % site)
        credentials['username'] = input('Username: ')
        credentials['password'] = getpass('Password: ')
        update = True
    r = api.signin(**credentials)
    if 'errors' in r.get('data', {}):
        for kv in r['data']['errors'].items():
            print('%s: %s' % kv)
        sys.exit(1)
    if update:
        auth.update(site, credentials)
    return api

View file

@ -1,17 +1,23 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2011
from __future__ import with_statement, print_function
from __future__ import print_function
import gzip
import zlib
import hashlib
import os
from six import BytesIO
import sqlite3
import time
import zlib
from six import BytesIO
from six.moves import urllib
from six import PY2
import sqlite3
try:
import requests
USE_REQUESTS = True
except:
USE_REQUESTS = False
from .utils import json
from .file import makedirs
@ -19,12 +25,14 @@ from .file import makedirs
from . import net
from .net import DEFAULT_HEADERS, detect_encoding
cache_timeout = 30*24*60*60 # default is 30 days
cache_timeout = 30*24*60*60 # default is 30 days
COMPRESS_TYPES = (
'text/html',
'text/plain',
'text/xml',
'text/x-wiki',
'application/json',
'application/xhtml+xml',
'application/x-javascript',
@ -33,7 +41,7 @@ COMPRESS_TYPES = (
'application/rss+xml'
)
def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
def status(url, data=None, headers=None, timeout=cache_timeout):
'''
>>> status('http://google.com')
200
@ -43,7 +51,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
headers = get_headers(url, data, headers)
return int(headers['status'])
def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
def exists(url, data=None, headers=None, timeout=cache_timeout):
'''
>>> exists('http://google.com')
True
@ -55,14 +63,14 @@ def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
return True
return False
def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
def get_headers(url, data=None, headers=None, timeout=cache_timeout):
url_headers = store.get(url, data, headers, timeout, "headers")
if not url_headers:
url_headers = net.get_headers(url, data, headers)
store.set(url, data, -1, url_headers)
return url_headers
def get_json(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
def get_json(url, data=None, headers=None, timeout=cache_timeout):
return json.loads(read_url(url, data, headers, timeout).decode('utf-8'))
class InvalidResult(Exception):
@ -76,7 +84,7 @@ def _fix_unicode_url(url):
url = url.encode('utf-8')
return url
def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, unicode=False):
'''
url - url to load
data - possible post data
@ -87,24 +95,35 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
'''
if net.DEBUG:
print('ox.cache.read_url', url)
#FIXME: send last-modified / etag from cache and only update if needed
#url = _fix_unicode_url(url)
# FIXME: send last-modified / etag from cache and only update if needed
# url = _fix_unicode_url(url)
result = store.get(url, data, headers, timeout)
url_headers = {}
if not result:
try:
url_headers, result = net.read_url(url, data, headers, return_headers=True)
except urllib.error.HTTPError as e:
e.headers['Status'] = "%s" % e.code
for key in e.headers:
url_headers[key.lower()] = e.headers[key]
result = e.read()
if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
if not valid or valid(result, url_headers):
store.set(url, post_data=data, data=result, headers=url_headers)
if USE_REQUESTS:
r = requests.get(url, headers=headers)
for key in r.headers:
url_headers[key.lower()] = r.headers[key]
result = r.content
url_headers['Status'] = "%s" % r.status_code
if not valid or valid(result, url_headers):
store.set(url, post_data=data, data=result, headers=url_headers)
else:
raise InvalidResult(result, url_headers)
else:
raise InvalidResult(result, url_headers)
try:
url_headers, result = net.read_url(url, data, headers, return_headers=True)
except urllib.error.HTTPError as e:
e.headers['Status'] = "%s" % e.code
for key in e.headers:
url_headers[key.lower()] = e.headers[key]
result = e.read()
if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
if not valid or valid(result, url_headers):
store.set(url, post_data=data, data=result, headers=url_headers)
else:
raise InvalidResult(result, url_headers)
if unicode:
ctype = url_headers.get('content-type', '').lower()
if 'charset' in ctype:
@ -116,13 +135,13 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
result = result.decode(encoding)
return result
get_url=read_url
get_url = read_url
def save_url(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename)
if dirname and not os.path.exists(dirname):
os.makedirs(dirname)
makedirs(dirname)
data = read_url(url)
with open(filename, 'wb') as f:
f.write(data)
@ -134,7 +153,7 @@ class Cache:
def __init__(self):
pass
def get(self, url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
def get(self, url, data, headers=None, timeout=-1, value="data"):
'''
if value == 'data' return data of url if its in the cache else None
if value == 'headers' return headers for url
@ -159,7 +178,7 @@ class SQLiteCache(Cache):
def __init__(self):
path = cache_path()
if not os.path.exists(path):
os.makedirs(path)
makedirs(path)
self.db = os.path.join(path, "cache.sqlite")
self.create()
@ -192,7 +211,7 @@ class SQLiteCache(Cache):
def set_setting(self, c, key, value):
c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))
def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
def get(self, url, data={}, headers=None, timeout=-1, value="data"):
r = None
if timeout == 0:
return r
@ -225,7 +244,7 @@ class SQLiteCache(Cache):
conn.close()
return r
def delete(self, url, data=None, headers=DEFAULT_HEADERS):
def delete(self, url, data=None, headers=None):
url_hash = self.get_url_hash(url, data)
conn = self.connect()
c = conn.cursor()
@ -244,7 +263,8 @@ class SQLiteCache(Cache):
c = conn.cursor()
# Insert a row of data
if not post_data: post_data=""
if not post_data:
post_data = ""
only_headers = 0
if data == -1:
only_headers = 1
@ -280,11 +300,11 @@ class FileCache(Cache):
def files(self, domain, h):
prefix = os.path.join(self.root, domain, h[:2], h[2:4], h[4:6], h[6:8])
i = os.path.join(prefix, '%s.json'%h)
f = os.path.join(prefix, '%s.dat'%h)
i = os.path.join(prefix, '%s.json' % h)
f = os.path.join(prefix, '%s.dat' % h)
return prefix, i, f
def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
def get(self, url, data={}, headers=None, timeout=-1, value="data"):
r = None
if timeout == 0:
return r
@ -308,13 +328,13 @@ class FileCache(Cache):
if value == 'headers':
r = info['headers']
else:
with open(f, 'rb') as data:
r = data.read()
with open(f, 'rb') as fd:
r = fd.read()
if info['compressed']:
r = zlib.decompress(r)
return r
def delete(self, url, data=None, headers=DEFAULT_HEADERS):
def delete(self, url, data=None, headers=None):
url_hash = self.get_url_hash(url, data)
domain = self.get_domain(url)
@ -344,15 +364,104 @@ class FileCache(Cache):
if not info['only_headers']:
if info['compressed']:
data = zlib.compress(data)
elif not isinstance(data, str):
elif not isinstance(data, bytes):
data = data.encode('utf-8')
with open(f, 'wb') as _f:
_f.write(data)
with open(i, 'wb') as _i:
with open(i, 'w') as _i:
json.dump(info, _i)
class KVCache(Cache):
    """Cache backend on top of a simple key/value store.

    Subclasses supply ``self.backend`` with get/set/delete methods
    (e.g. memcached or redis).  Two keys are used per request: one for
    the info record (headers + metadata) and one for the raw body.
    """

    # True when the backend stores bytes only (redis); the info record
    # is then kept JSON-encoded.
    _bytes_only = False

    def _keys(self, url, data, headers=None):
        # Derive the (info, data) key pair for a request.
        url_hash = self.get_url_hash(url, data)
        domain = self.get_domain(url)
        key = 'ox:%s:%s' % (domain, url_hash)
        return key, key + ':data'

    def get(self, url, data={}, headers=None, timeout=-1, value="data"):
        # Return cached headers or body, or None when missing/expired.
        # timeout == 0 disables the cache; timeout < 0 never expires.
        # NOTE(review): mutable default for ``data`` kept for interface
        # compatibility with the other Cache backends.
        if timeout == 0:
            return None
        r = None
        info_key, data_key = self._keys(url, data, headers)
        info = self.backend.get(info_key)
        if info:
            if self._bytes_only:
                info = json.loads(info.decode())
            now = time.mktime(time.localtime())
            expired = now-timeout
            if value != 'headers' and info['only_headers']:
                # Only headers were cached; there is no body to serve.
                return None
            if timeout < 0 or info['created'] > expired:
                if value == 'headers':
                    r = info['headers']
                else:
                    r = self.backend.get(data_key)
                    if r and info['compressed']:
                        r = zlib.decompress(r)
        return r

    def delete(self, url, data=None, headers=None):
        # Drop both the info record and the cached body.
        for key in self._keys(url, data, headers):
            self.backend.delete(key)

    def set(self, url, post_data, data, headers):
        # Store headers (and, unless data == -1, the body) for the URL.
        info_key, data_key = self._keys(url, post_data, headers)
        created = time.mktime(time.localtime())
        content_type = headers.get('content-type', '').split(';')[0].strip()
        info = {
            'compressed': content_type in COMPRESS_TYPES,
            'only_headers': data == -1,
            'created': created,
            'headers': headers,
            'url': url,
        }
        if post_data:
            info['post_data'] = post_data
        if not info['only_headers']:
            # Compress known-compressible types; otherwise make sure the
            # stored payload is bytes.
            if info['compressed']:
                data = zlib.compress(data)
            elif not isinstance(data, bytes):
                data = data.encode('utf-8')
            self.backend.set(data_key, data)
        if self._bytes_only:
            info = json.dumps(info, ensure_ascii=False).encode('utf-8')
        self.backend.set(info_key, info)
class MemCache(KVCache):
    """KVCache backed by memcached via pylibmc."""

    # pylibmc can store the info dict as-is, no JSON round trip needed.
    _bytes_only = False

    def __init__(self):
        import pylibmc
        # cache_path() is "memcache:<host>"; everything after the first
        # colon is the memcached address.
        f, self.host = cache_path().split(':', 1)
        self.backend = pylibmc.Client([self.host])
        self.backend.behaviors['connect_timeout'] = 60000
class RedisCache(KVCache):
    """KVCache backed by redis."""

    # redis returns bytes, so the info record is stored JSON-encoded.
    _bytes_only = True

    def __init__(self):
        import redis
        # cache_path() is "redis:<url>"; the part after the first colon
        # is passed to redis.from_url().
        f, self.url = cache_path().split(':', 1)
        self.backend = redis.from_url(self.url)
# Pick the module-wide cache backend from the cache_path() scheme:
# "fs:" -> files on disk, "redis:" -> redis, "memcache:" -> memcached,
# anything else -> the SQLite database backend.
if cache_path().startswith('fs:'):
    store = FileCache()
elif cache_path().startswith('redis:'):
    store = RedisCache()
elif cache_path().startswith('memcache:'):
    store = MemCache()
else:
    store = SQLiteCache()

View file

@ -1,36 +1,37 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import division, with_statement, print_function
import os
from __future__ import division, print_function
from distutils.spawn import find_executable
from glob import glob
import hashlib
import os
import re
import shutil
import sqlite3
import struct
import subprocess
import sqlite3
from distutils.spawn import find_executable
from .utils import json
__all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs']
__all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs', 'iexists']
EXTENSIONS = {
'audio': [
'aac', 'aif', 'aiff',
'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma'
'aac', 'aif', 'aiff', 'amr',
'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus'
],
'image': [
'bmp', 'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp'
],
'subtitle': [
'idx', 'srt', 'sub'
'idx', 'srt', 'sub', 'vtt'
],
'video': [
'3gp',
'avi', 'divx', 'dv', 'flv', 'm2t', 'm4v', 'mkv', 'mov', 'mp4',
'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv',
'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD
'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm4v', 'mkv', 'mov', 'mp4',
'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf',
'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD
'mxf', 'ts'
],
}
@ -131,25 +132,25 @@ def oshash(filename, cached=True):
if filesize < 65536:
for x in range(int(filesize/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
(l_value,) = struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
hash = hash & 0xFFFFFFFFFFFFFFFF # to remain as 64bit number
else:
for x in range(int(65536/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
(l_value,) = struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
f.seek(max(0,filesize-65536),0)
hash = hash & 0xFFFFFFFFFFFFFFFF # to remain as 64bit number
f.seek(max(0, filesize-65536), 0)
for x in range(int(65536/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
(l_value,) = struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF
f.close()
returnedhash = "%016x" % hash
returnedhash = "%016x" % hash
return returnedhash
except(IOError):
except IOError:
return "IOError"
def avinfo(filename, cached=True):
@ -160,23 +161,25 @@ def avinfo(filename, cached=True):
return ffprobe(filename)
ffmpeg2theora = cmd('ffmpeg2theora')
p = subprocess.Popen([ffmpeg2theora], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
info, error = p.communicate()
version = info.split('\n')[0].split(' - ')[0].split(' ')[-1]
stdout, error = p.communicate()
stdout = stdout.decode('utf-8')
version = stdout.split('\n')[0].split(' - ')[0].split(' ')[-1]
if version < '0.27':
raise EnvironmentError('version of ffmpeg2theora needs to be 0.27 or later, found %s' % version)
p = subprocess.Popen([ffmpeg2theora, '--info', filename],
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
info, error = p.communicate()
stdout, error = p.communicate()
stdout = stdout.decode('utf-8')
try:
info = json.loads(info)
info = json.loads(stdout)
except:
#remove metadata, can be broken
# remove metadata, can be broken
reg = re.compile('"metadata": {.*?},', re.DOTALL)
info = re.sub(reg, '', info)
info = json.loads(info)
stdout = re.sub(reg, '', stdout)
info = json.loads(stdout)
if 'video' in info:
for v in info['video']:
if not 'display_aspect_ratio' in v and 'width' in v:
if 'display_aspect_ratio' not in v and 'width' in v:
v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height'])
v['pixel_aspect_ratio'] = '1:1'
if len(info.get('audio', [])) > 1:
@ -189,12 +192,14 @@ def avinfo(filename, cached=True):
ffmpeg = cmd('ffmpeg')
p = subprocess.Popen([ffmpeg, '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
stderr = stderr.decode('utf-8')
languages = [re.compile('\((.+?)\):').findall(l) for l in stderr.split('\n') if 'Stream' in l and 'Audio' in l]
if len(languages) == len(info['audio']):
for i, stream in enumerate(info['audio']):
language = languages[i]
if language and language[0] != 'und':
stream['language'] = language[0]
fix_coverart(info)
return info
return {'path': filename, 'size': 0}
@ -203,6 +208,7 @@ def ffprobe(filename):
p = subprocess.Popen([
cmd('ffprobe'),
'-show_format',
'-show_chapters',
'-show_streams',
'-print_format',
'json',
@ -210,6 +216,7 @@ def ffprobe(filename):
], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
info, error = p.communicate()
info = info.decode('utf-8')
ffinfo = json.loads(info)
def fix_value(key, value):
@ -224,7 +231,7 @@ def ffprobe(filename):
return value
info = {}
if not 'format' in ffinfo:
if 'format' not in ffinfo:
info['error'] = 'badfile'
else:
for key in ('duration', 'size', 'bit_rate'):
@ -235,8 +242,18 @@ def ffprobe(filename):
info['audio'] = []
info['video'] = []
info['metadata'] = ffinfo['format'].get('tags', {})
chapters = ffinfo.get('chapters', [])
if chapters:
info['chapters'] = [
{
'in': float(chapter['start_time']),
'out': float(chapter['end_time']),
'value': chapter.get('tags', {}).get('title')
}
for chapter in chapters if chapter.get('tags', {}).get('title')
]
for s in ffinfo['streams']:
tags = s.pop('tags', {})
tags = s.pop('tags', {})
language = None
for t in tags:
if t == 'language':
@ -278,17 +295,29 @@ def ffprobe(filename):
info[s['codec_type']].append(stream)
else:
pass
#print s
# print s
for v in info['video']:
if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-180, -90, 90, 180):
v['width'], v['height'] = v['height'], v['width']
k = 'display_aspect_ratio'
if not k in v and 'width' in v \
if k not in v and 'width' in v \
or (k in v and v[k] == '0:1'):
v[k] = '%d:%d' % (v['width'], v['height'])
v['pixel_aspect_ratio'] = '1:1'
info['oshash'] = oshash(filename)
info['path'] = filename
if not 'size' in info:
if 'size' not in info:
info['size'] = os.path.getsize(filename)
fix_coverart(info)
return info
def fix_coverart(info):
    """Reclassify embedded cover art in audio files.

    When *info* describes an audio file whose only "video" stream is
    really a still image (image codec or mjpeg), move that stream list
    to info['cover'] and leave info['video'] empty.  Mutates and
    returns *info*.
    """
    streams = info.get('video')
    if streams and info['path'].split('.')[-1] in EXTENSIONS['audio']:
        if streams[0]['codec'] in EXTENSIONS['image'] + ['mjpeg']:
            info['cover'] = info.pop('video')
            info['video'] = []
    return info
def makedirs(path):
@ -353,3 +382,17 @@ def write_path(file):
path = os.path.split(file)[0]
if path and not os.path.exists(path):
os.makedirs(path)
def iexists(path):
    """Case-insensitively check whether *path* exists.

    Compares the lowercased final component of *path* against the
    lowercased entries of its parent directory ('.' for a bare name).
    Returns False when the parent directory itself does not exist.
    """
    components = path.split(os.sep)
    target = components[-1].lower()
    folder = '.' if len(components) == 1 else os.path.dirname(path)
    try:
        entries = os.listdir(folder)
    except FileNotFoundError:
        return False
    lowered = {os.path.basename(entry).lower() for entry in entries}
    return target in lowered

View file

@ -6,7 +6,7 @@ from __future__ import print_function
import unicodedata
from six import unichr, PY2
from six import unichr, text_type
__all__ = ['fix_bad_unicode']
@ -151,10 +151,7 @@ def text_badness(text):
- Improbable single-byte characters, such as ƒ or ¬
- Letters in somewhat rare scripts
'''
if PY2:
assert isinstance(text, unicode)
else:
assert isinstance(text, str)
assert isinstance(text, text_type)
errors = 0
very_weird_things = 0
weird_things = 0

View file

@ -1,11 +1,12 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2014
from __future__ import with_statement, print_function
from __future__ import print_function
import itertools
import mimetypes
import random
import os
import hashlib
import sys
from six import PY2
@ -20,8 +21,7 @@ _fmt = '%%0%dd' % _width
def _make_boundary():
# Craft a random boundary.
token = random.randrange(sys.maxsize)
boundary = ('=' * 15) + (_fmt % token) + '=='
boundary = ('=' * 15) + hashlib.sha1(os.urandom(32)).hexdigest() + '=='
return boundary
class MultiPartForm(object):
@ -75,7 +75,7 @@ class MultiPartForm(object):
# line is separated by '\r\n'.
parts = []
part_boundary = '--' + self.boundary
# Add the form fields
parts.extend(
[ part_boundary,
@ -85,7 +85,7 @@ class MultiPartForm(object):
]
for name, value in self.form_fields
)
# Add the files to upload
parts.extend(
[ part_boundary,
@ -97,7 +97,7 @@ class MultiPartForm(object):
]
for field_name, filename, content_type, body in self.files
)
# Flatten the list and add closing boundary marker,
# then return CR+LF separated data
flattened = list(itertools.chain(*parts))

View file

@ -4,13 +4,14 @@ import math
import re
import string
from six import text_type
def toAZ(num):
"""
Converts an integer to bijective base 26 string using A-Z
>>> for i in range(1, 1000): assert fromAZ(toAZ(i)) == i
>>> toAZ(1)
'A'
@ -20,7 +21,8 @@ def toAZ(num):
>>> toAZ(1234567890)
'CYWOQVJ'
"""
if num < 1: raise ValueError("must supply a positive integer")
if num < 1:
raise ValueError("must supply a positive integer")
digits = string.ascii_uppercase
az = ''
while num != 0:
@ -30,7 +32,7 @@ def toAZ(num):
az = digits[r] + az
return az
encode_base26=toAZ
encode_base26 = toAZ
def fromAZ(num):
"""
@ -45,7 +47,7 @@ def fromAZ(num):
>>> fromAZ('FOO')
4461
"""
num = num.replace('-','')
num = num.replace('-', '')
digits = string.ascii_uppercase
r = 0
for exp, char in enumerate(reversed(num)):
@ -64,7 +66,8 @@ def to26(q):
>>> to26(347485647)
'BDGKMAP'
"""
if q < 0: raise ValueError("must supply a positive integer")
if q < 0:
raise ValueError("must supply a positive integer")
base26 = string.ascii_uppercase
converted = []
while q != 0:
@ -73,7 +76,7 @@ def to26(q):
converted.insert(0, l)
return "".join(converted) or 'A'
decode_base26=toAZ
decode_base26 = toAZ
def from26(q):
"""
@ -82,7 +85,7 @@ def from26(q):
0
"""
base26 = string.ascii_uppercase
q = q.replace('-','')
q = q.replace('-', '')
r = 0
for i in q:
r = r * 26 + base26.index(i.upper())
@ -123,7 +126,8 @@ def to32(q):
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError("must supply a positive integer")
if q < 0:
raise ValueError("must supply a positive integer")
letters = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
converted = []
while q != 0:
@ -188,7 +192,7 @@ def from32(q):
'Z': 31,
}
base32 = ('0123456789' + string.ascii_uppercase)[:32]
q = q.replace('-','')
q = q.replace('-', '')
q = ''.join([base32[_32map[i.upper()]] for i in q])
return int(q, 32)
@ -210,7 +214,8 @@ def to36(q):
...
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError("must supply a positive integer")
if q < 0:
raise ValueError("must supply a positive integer")
letters = "0123456789abcdefghijklmnopqrstuvwxyz"
converted = []
while q != 0:
@ -233,7 +238,7 @@ def int_value(strValue, default=u''):
u''
"""
try:
val = re.compile('(\d+)').findall(unicode(strValue).strip())[0]
val = re.compile('(\d+)').findall(text_type(strValue).strip())[0]
except:
val = default
return val
@ -250,7 +255,7 @@ def float_value(strValue, default=u''):
u''
"""
try:
val = re.compile('([\d.]+)').findall(unicode(strValue).strip())[0]
val = re.compile('([\d.]+)').findall(text_type(strValue).strip())[0]
except:
val = default
return val
@ -286,7 +291,7 @@ def format_number(number, longName, shortName):
n = number / math.pow(1024, i + 1)
return '%s %s%s' % (format_thousands('%.*f' % (i, n)), prefix[i], shortName)
def format_thousands(number, separator = ','):
def format_thousands(number, separator=','):
"""
Return the number with separators (1,000,000)
@ -316,18 +321,18 @@ def format_pixels(number):
return format_number(number, 'pixel', 'px')
def format_currency(amount, currency="$"):
if amount:
temp = "%.2f" % amount
profile=re.compile(r"(\d)(\d\d\d[.,])")
while 1:
temp, count = re.subn(profile,r"\1,\2",temp)
if not count:
break
if temp.startswith('-'):
return "-"+ currency + temp[1:-3]
return currency + temp[:-3]
else:
return ""
if amount:
temp = "%.2f" % amount
profile = re.compile(r"(\d)(\d\d\d[.,])")
while 1:
temp, count = re.subn(profile, r"\1,\2", temp)
if not count:
break
if temp.startswith('-'):
return "-" + currency + temp[1:-3]
return currency + temp[:-3]
else:
return ""
def plural(amount, unit, plural='s'):
'''
@ -339,7 +344,8 @@ def plural(amount, unit, plural='s'):
if abs(amount) != 1:
if plural == 's':
unit = unit + plural
else: unit = plural
else:
unit = plural
return "%s %s" % (format_thousands(amount), unit)
def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
@ -390,14 +396,14 @@ def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
duration += ".%03d" % ms
else:
if verbosity == 1:
durations = ["%sd" % d, "%sh" % h, "%sm" % m, "%ss" % s]
durations = ["%sd" % d, "%sh" % h, "%sm" % m, "%ss" % s]
if years:
durations.insert(0, "%sy" % y)
if milliseconds:
durations.append("%sms" % ms)
else:
durations = [plural(d, 'day'), plural(h,'hour'),
plural(m, 'minute'), plural(s, 'second')]
durations = [plural(d, 'day'), plural(h, 'hour'),
plural(m, 'minute'), plural(s, 'second')]
if years:
durations.insert(0, plural(y, 'year'))
if milliseconds:
@ -434,7 +440,7 @@ def parse_timecode(string):
'''
timecode = 0
for i, v in enumerate(list(reversed(string.split(':')))[:4]):
timecode += float(v) * ( 86400 if i == 3 else pow(60, i))
timecode += float(v) * (86400 if i == 3 else pow(60, i))
return timecode
def ms2runtime(ms, shortenLong=False):
@ -482,7 +488,8 @@ def time2ms(timeString):
p = timeString.split(':')
for i in range(len(p)):
_p = p[i]
if _p.endswith('.'): _p =_p[:-1]
if _p.endswith('.'):
_p = _p[:-1]
ms = ms * 60 + float(_p)
return int(ms * 1000)

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import with_statement, print_function
from __future__ import print_function
import math

View file

@ -10,7 +10,7 @@ letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
# Configuration for add_links() function
LEADING_PUNCTUATION = ['(', '<', '&lt;']
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;', "'", '"']
# list of possible strings used for bullets in bulleted lists
@ -18,16 +18,16 @@ DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % (
'|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
if PY2:
del x # Temporary variable
del x # Temporary variable
def escape(html):
'''
@ -44,7 +44,7 @@ def linebreaks(value):
'''
Converts newlines into <p> and <br />
'''
value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
paras = re.split('\n{2,}', value)
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
return '\n\n'.join(paras)
@ -83,21 +83,23 @@ def add_links(text, trim_url_limit=None, nofollow=False):
If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
"""
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >= limit and '...' or '')) or x
words = word_split_re.split(text)
nofollow_attr = nofollow and ' rel="nofollow"' or ''
for i, word in enumerate(words):
match = punctuation_re.match(word)
if match:
lead, middle, trail = match.groups()
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
len(middle) > 0 and middle[0] in letters + string.digits and \
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and
len(middle) > 0 and middle[0] in letters + string.digits and
(middle.endswith('.org') or
middle.endswith('.net') or
middle.endswith('.com'))):
middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if middle.startswith('http://') or middle.startswith('https://'):
middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if '@' in middle and not middle.startswith('www.') and not ':' in middle \
and simple_email_re.match(middle):
if '@' in middle and not middle.startswith('www.') and ':' not in middle \
and simple_email_re.match(middle):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
if lead + middle + trail != word:
words[i] = lead + middle + trail
@ -127,6 +129,7 @@ def clean_html(text):
# Trim stupid HTML such as <br clear="all">.
text = html_gunk_re.sub('', text)
# Convert hard-coded bullets into HTML unordered lists.
def replace_p_tags(match):
s = match.group().replace('</p>', '</li>')
for d in DOTS:
@ -153,6 +156,7 @@ def decode_html(html):
if isinstance(html, bytes):
html = html.decode('utf-8')
uchr = unichr
def entitydecode(match, uchr=uchr):
entity = match.group(1)
if entity == '#x80':
@ -282,7 +286,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
{'name': 'thead'},
{'name': 'tr'},
# other
{'name': '[]'},
{'name': '[]'},
{
'name': 'a',
'required': ['href'],
@ -328,15 +332,14 @@ def sanitize_html(html, tags=None, global_attributes=[]):
for tag in tags:
valid_attributes[tag['name']] = tag.get('required', []) \
+ tag.get('optional', []) \
+ global_attributes
+ tag.get('optional', []) + global_attributes
required_attributes[tag['name']] = tag.get('required', [])
validation[tag['name']] = tag.get('validation', {})
if '[]' in validation:
html = re.sub(
re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
'<a href="\\1">\\3</a>', html);
'<a href="\\1">\\3</a>', html)
parts = split_tags(html)
for i, part in enumerate(parts):
@ -351,17 +354,17 @@ def sanitize_html(html, tags=None, global_attributes=[]):
a = attr_re.findall(attributes)
attrs = dict(a)
if not closing and not name in non_closing_tags:
if not closing and name not in non_closing_tags:
level += 1
if not attrs and attributes or name not in valid_tags:
if not attrs and attributes or name not in valid_tags:
valid = False
else:
valid = True
for key in set(attrs) - set(valid_attributes[name]):
del attrs[key]
for key in required_attributes[tag['name']]:
if not key in attrs:
if key not in attrs:
valid = False
if valid:
@ -395,6 +398,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
def split_tags(string):
tags = []
def collect(match):
tags.append(match.group(0))
return '\0'

View file

@ -14,12 +14,13 @@ except:
import ImageFont
ZONE_INDEX = []
for pixel_index in range(64):
x, y = pixel_index % 8, int(pixel_index / 8)
ZONE_INDEX.append(int(x / 2) + int(y / 4) * 4)
del x
del y
ZONE_INDEX = [
(int(x / 2) + int(y / 4) * 4)
for x, y in [
(pixel_index % 8, int(pixel_index / 8))
for pixel_index in range(64)
]
]
def drawText(image, position, text, font_file, font_size, color):
draw = ImageDraw.Draw(image)
@ -165,8 +166,10 @@ def wrapText(text, max_width, max_lines, font_file, font_size):
if width <= max_width and width > min_width:
min_width = width
return min_width
def get_width(string):
return draw.textsize(string, font=font)[0]
image = Image.new('RGB', (1, 1))
draw = ImageDraw.Draw(image)
font = ImageFont.truetype(font_file, font_size, encoding='unic')

View file

@ -208,7 +208,7 @@ def langTo3Code(lang):
if lang:
lang = langEnglishName(lang)
if lang:
lang=lang.lower()
lang = lang.lower()
for l in _iso639_languages:
if l[0].lower() == lang:
return l[3]
@ -218,7 +218,7 @@ def langTo2Code(lang):
if lang:
lang = langEnglishName(lang)
if lang:
lang=lang.lower()
lang = lang.lower()
for l in _iso639_languages:
if l[0].lower() == lang:
return l[2]

View file

@ -11,9 +11,9 @@ def minify(source, comment=''):
pass
# python2 performance with unicode string is terrible
if PY2:
if isinstance(source, unicode):
if isinstance(source, unicode): # pylint: disable=undefined-variable
source = source.encode('utf-8')
if isinstance(comment, unicode):
if isinstance(comment, unicode): # pylint: disable=undefined-variable
comment = comment.encode('utf-8')
tokens = tokenize(source)
length = len(tokens)
@ -30,20 +30,20 @@ def minify(source, comment=''):
# numbers or strings or unary operators or grouping operators
# with a single newline, otherwise remove it
if prevToken and nextToken\
and (prevToken['type'] in ['identifier', 'number', 'string']\
or prevToken['value'] in ['++', '--', ')', ']', '}'])\
and (nextToken['type'] in ['identifier', 'number', 'string']\
or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']):
and (prevToken['type'] in ['identifier', 'number', 'string']
or prevToken['value'] in ['++', '--', ')', ']', '}']) \
and (nextToken['type'] in ['identifier', 'number', 'string']
or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']):
minified += '\n'
elif token['type'] == 'whitespace':
# replace whitespace between two tokens that are identifiers or
# numbers, or between a token that ends with "+" or "-" and one that
# begins with "+" or "-", with a single space, otherwise remove it
if prevToken and nextToken\
and ((prevToken['type'] in ['identifier', 'number']\
and nextToken['type'] in ['identifier', 'number'])
or (prevToken['value'] in ['+', '-', '++', '--']
and nextToken['value'] in ['+', '-', '++', '--'])):
if prevToken and nextToken \
and ((prevToken['type'] in ['identifier', 'number'] and
nextToken['type'] in ['identifier', 'number']) or
(prevToken['value'] in ['+', '-', '++', '--'] and
nextToken['value'] in ['+', '-', '++', '--'])):
minified += ' '
elif token['type'] != 'comment':
# remove comments and leave all other tokens untouched
@ -178,7 +178,7 @@ def tokenize(source):
'value': value
})
if type == 'comment':
lines = value.split('\n');
lines = value.split('\n')
column = len(lines[-1])
line += len(lines) - 1
elif type == 'linebreak':

View file

@ -1,7 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement, print_function
from __future__ import print_function
import re
@ -23,11 +23,11 @@ def loads(source):
try:
m = re.search(r'line (\d+) column (\d+)', msg)
if m:
(lineno, colno) = map(int, m.groups())
(lineno, colno) = [int(n) for n in m.groups()]
except:
pass
if lineno and colno:
s = minified.split('\n')
context = s[lineno-1][max(0, colno-30):colno+30]
msg += ' at:\n\n %s\n %s\033[1m^\033[0m' %(context, ' ' * (colno - max(0, colno-30) - 2))
msg += ' at:\n\n %s\n %s\033[1m^\033[0m' % (context, ' ' * (colno - max(0, colno-30) - 2))
raise ValueError(msg)

View file

@ -29,7 +29,7 @@ def format_path(data, directory_key='director'):
director = data['directorSort'] or ['Unknown Director']
title = data['seriesTitle' if data['isEpisode'] else 'title'] or 'Untitled'
year = data['seriesYear' if data['isEpisode'] else 'year'] or None
parts = list(map(format_underscores, filter(lambda x: x != None, [
parts = list(map(format_underscores, filter(lambda x: x is not None, [
u'; '.join(director[:10]),
u'%s%s' % (title, u' (%s)' % year if year else ''),
u'%s%s%s%s%s%s' % (
@ -60,7 +60,7 @@ def parse_item_files(files):
def get_version_key(file, extension=True):
return '%s/%s-part/%s' % (
file['version'] or '',
'single' if file['part'] == None else 'multi',
'single' if file['part'] is None else 'multi',
file['extension'] if extension else ''
)
# filter out duplicate files (keep shortest path, sorted alphabetically)
@ -70,7 +70,7 @@ def parse_item_files(files):
duplicate_files = []
for key in [get_file_key(file) for file in files]:
key_files = sorted(
sorted([file for file in files if get_file_key(file) == key]),
[file for file in files if get_file_key(file) == key],
key=lambda x: len(x['path'])
)
unique_files.append(key_files[0])
@ -114,10 +114,8 @@ def parse_item_files(files):
# determine preferred subtitle language
language[version_key] = None
subtitle_files = [file for file in version_files[version_key] if file['extension'] == 'srt']
for subtitle_language in sorted(
list(set([file['language'] for file in subtitle_files])),
key=lambda x: LANGUAGES.index(x) if x in LANGUAGES else x
):
subtitle_languages = list(set([file['language'] for file in subtitle_files]))
for subtitle_language in sorted(subtitle_languages, key=subtitle_sort):
language_files = [file for file in subtitle_files if file['language'] == subtitle_language]
if len(subtitle_files) == len(parts):
language[version_key] = subtitle_language
@ -188,25 +186,30 @@ def parse_path(path, directory_key='director'):
# TODO: '.com.avi'
'''
def parse_type(string):
for type in EXTENSIONS:
if string in EXTENSIONS[type]:
return type
return None
def parse_underscores(string):
string = unicodedata.normalize('NFC', string)
# '^_' or '_$' is '.'
string = re.sub('^_', '.', string)
string = re.sub('_$', '.', string)
# '_.foo$' or '_ (' is '?'
string = re.sub('_(?=(\.\w+$| \())', '?', string)
string = re.sub(re.compile('_(?=(\.\w+$| \())', re.U), '?', string)
# ' _..._ ' is '<...>'
string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string)
# 'foo_bar' or 'foo _ bar' is '/'
string = re.sub('(?<=\w)_(?=\w)', '/', string)
string = re.sub(re.compile('(?<=\w)_(?=\w)', re.U), '/', string)
string = re.sub(' _ ', ' / ', string)
# 'foo_ ' is ':'
string = re.sub('(?<=\w)_ ', ': ', string)
string = re.sub(re.compile('(?<=\w)_ ', re.U), ': ', string)
string = unicodedata.normalize('NFD', string)
return string
data = {}
parts = list(map(lambda x: parse_underscores(x.strip()), unicodedata.normalize('NFD', path).split('/')))
# subdirectory
@ -269,12 +272,12 @@ def parse_path(path, directory_key='director'):
# isEpisode, seriesTitle, seriesYear
data['isEpisode'] = False
data['seriesTitle'] = data['seriesYear'] = None
if data['season'] != None or data['episode'] != None or data['episodes']:
if data['season'] is not None or data['episode'] is not None or data['episodes']:
data['isEpisode'] = True
data['seriesTitle'] = data['title']
season = 'S%02d' % data['season'] if data['season'] != None else ''
season = 'S%02d' % data['season'] if data['season'] is not None else ''
episode = ''
if data['episode'] != None:
if data['episode'] is not None:
episode = 'E%02d' % data['episode']
elif data['episodes']:
episode = 'E%02d%s%02d' % (
@ -356,7 +359,7 @@ def parse_movie_path(path):
director = "%s." % director[:-1]
director = director.split('; ')
director = [normalize_name(d).strip() for d in director]
director = filter(lambda d: d not in ('Unknown Director', 'Various Directors'), director)
director = list(filter(lambda d: d not in ('Unknown Director', 'Various Directors'), director))
else:
director = []
@ -376,9 +379,9 @@ def parse_movie_path(path):
season = match.group(3)
episode = match.group(5)
episodeTitle = (match.group(6) or '').strip()
if episode != None:
if episode is not None:
episode = int(episode)
if season != None:
if season is not None:
season = int(season)
if episode and not season:
season = 1
@ -396,7 +399,7 @@ def parse_movie_path(path):
else:
episode = None
if episode and 'Episode %d'%episode in fileparts:
if episode and 'Episode %d' % episode in fileparts:
episodeTitle = fileparts.index('Episode %d' % episode) + 1
episodeTitle = fileparts[episodeTitle]
if episodeTitle == extension or episodeTitle.startswith('Part'):
@ -482,3 +485,11 @@ def get_oxid(title, director=[], year='',
oxid = get_hash('\n'.join([director, title, str(year), str(season)]))[:8] + \
get_hash('\n'.join([str(episode), episode_director, episode_title, str(episode_year)]))[:8]
return u'0x' + oxid
def subtitle_sort(language):
if language in LANGUAGES:
return str(LANGUAGES.index(language))
elif language is None:
return str(len(LANGUAGES))
else:
return language

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import with_statement, print_function
from __future__ import print_function
import gzip
import json
import os
@ -16,14 +16,14 @@ from chardet.universaldetector import UniversalDetector
DEBUG = False
# Default headers for HTTP requests.
DEFAULT_HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-us,en;q=0.5',
'Accept-Encoding': 'gzip'
'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
'Accept-Encoding': 'gzip',
}
def status(url, data=None, headers=DEFAULT_HEADERS):
def status(url, data=None, headers=None):
try:
f = open_url(url, data, headers)
s = f.code
@ -31,13 +31,13 @@ def status(url, data=None, headers=DEFAULT_HEADERS):
s = e.code
return s
def exists(url, data=None, headers=DEFAULT_HEADERS):
def exists(url, data=None, headers=None):
s = status(url, data, headers)
if s >= 200 and s < 400:
return True
return False
def get_headers(url, data=None, headers=DEFAULT_HEADERS):
def get_headers(url, data=None, headers=None):
try:
f = open_url(url, data, headers)
f.headers['Status'] = "%s" % f.code
@ -48,10 +48,12 @@ def get_headers(url, data=None, headers=DEFAULT_HEADERS):
headers = e.headers
return dict(headers)
def get_json(url, data=None, headers=DEFAULT_HEADERS):
return json.loads(read_url(url, data, headers).decode('utf-8'))
def get_json(url, data=None, headers=None):
return json.loads(read_url(url, data, headers).decode('utf-8')) # pylint: disable=no-member
def open_url(url, data=None, headers=DEFAULT_HEADERS):
def open_url(url, data=None, headers=None):
if headers is None:
headers = DEFAULT_HEADERS.copy()
if PY2:
if not isinstance(url, bytes):
url = url.encode('utf-8')
@ -64,7 +66,7 @@ def open_url(url, data=None, headers=DEFAULT_HEADERS):
req = urllib.request.Request(url, data, headers)
return urllib.request.urlopen(req)
def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
def read_url(url, data=None, headers=None, return_headers=False, unicode=False):
if DEBUG:
print('ox.net.read_url', url)
f = open_url(url, data, headers)
@ -108,7 +110,7 @@ def detect_encoding(data):
detector.close()
return detector.result['encoding']
get_url=read_url
get_url = read_url
def save_url(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
@ -119,51 +121,50 @@ def save_url(url, filename, overwrite=False):
with open(filename, 'wb') as f:
f.write(data)
def _get_size(url):
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
req.get_method = lambda: 'HEAD'
u = urllib.request.urlopen(req)
if u.code != 200 or 'Content-Length' not in u.headers:
raise IOError
return int(u.headers['Content-Length'])
def _get_range(url, start, end):
headers = DEFAULT_HEADERS.copy()
headers['Range'] = 'bytes=%s-%s' % (start, end)
req = urllib.request.Request(url, headers=headers)
u = urllib.request.urlopen(req)
return u.read()
def oshash(url):
def get_size(url):
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
req.get_method = lambda : 'HEAD'
u = urllib.request.urlopen(req)
if u.code != 200 or not 'Content-Length' in u.headers:
raise IOError
return int(u.headers['Content-Length'])
def get_range(url, start, end):
headers = DEFAULT_HEADERS.copy()
headers['Range'] = 'bytes=%s-%s' % (start, end)
req = urllib.request.Request(url, headers=headers)
u = urllib.request.urlopen(req)
return u.read()
try:
longlongformat = 'q' # long long
bytesize = struct.calcsize(longlongformat)
filesize = get_size(url)
hash = filesize
head = get_range(url, 0, min(filesize, 65536))
filesize = _get_size(url)
hash_ = filesize
head = _get_range(url, 0, min(filesize, 65536))
if filesize > 65536:
tail = get_range(url, filesize-65536, filesize)
tail = _get_range(url, filesize-65536, filesize)
if filesize < 65536:
f = BytesIO(head)
for x in range(int(filesize/bytesize)):
for _ in range(int(filesize/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #cut off 64bit overflow
(l_value,) = struct.unpack(longlongformat, buffer)
hash_ += l_value
hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut off 64bit overflow
else:
for offset in range(0, 65536, bytesize):
buffer = head[offset:offset+bytesize]
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #cut of 64bit overflow
(l_value,) = struct.unpack(longlongformat, buffer)
hash_ += l_value
hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut of 64bit overflow
for offset in range(0, 65536, bytesize):
buffer = tail[offset:offset+bytesize]
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF
returnedhash = "%016x" % hash
(l_value,) = struct.unpack(longlongformat, buffer)
hash_ += l_value
hash_ = hash_ & 0xFFFFFFFFFFFFFFFF
returnedhash = "%016x" % hash_
return returnedhash
except(IOError):
except IOError:
return "IOError"

View file

@ -18,7 +18,8 @@ _articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
_articlesDict = dict([(x, x) for x in _articles])
_spArticles = []
for article in _articles:
if article[-1] not in ("'", '-'): article += ' '
if article[-1] not in ("'", '-'):
article += ' '
_spArticles.append(article)
_noarticles = (
@ -50,8 +51,10 @@ def canonical_title(title):
'Los Angeles Plays Itself'
"""
try:
if _articlesDict.has_key(title.split(', ')[-1].lower()): return title
except IndexError: pass
if title.split(', ')[-1].lower() in _articlesDict:
return title
except IndexError:
pass
ltitle = title.lower()
for start in _noarticles:
if ltitle.startswith(start):
@ -60,7 +63,8 @@ def canonical_title(title):
if ltitle.startswith(article):
lart = len(article)
title = '%s, %s' % (title[lart:], title[:lart])
if article[-1] == ' ': title = title[:-1]
if article[-1] == ' ':
title = title[:-1]
break
## XXX: an attempt using a dictionary lookup.
##for artSeparator in (' ', "'", '-'):
@ -82,9 +86,10 @@ def normalize_title(title):
'The Movie Title'
"""
stitle = title.split(', ')
if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()):
if len(stitle) > 1 and stitle[-1].lower() in _articlesDict:
sep = ' '
if stitle[-1][-1] in ("'", '-'): sep = ''
if stitle[-1][-1] in ("'", '-'):
sep = ''
title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
return title
@ -139,7 +144,8 @@ def canonical_name(name):
# Don't convert names already in the canonical format.
if name in ('Unknown Director', ):
return name
if name.find(', ') != -1: return name
if name.find(', ') != -1:
return name
sname = name.split(' ')
snl = len(sname)
if snl == 2:
@ -147,11 +153,14 @@ def canonical_name(name):
name = '%s, %s' % (sname[1], sname[0])
elif snl > 2:
lsname = [x.lower() for x in sname]
if snl == 3: _indexes = (0, snl-2)
else: _indexes = (0, snl-2, snl-3)
if snl == 3:
_indexes = (0, snl-2)
else:
_indexes = (0, snl-2, snl-3)
# Check for common surname prefixes at the beginning and near the end.
for index in _indexes:
if lsname[index] not in _sname_suffixes: continue
if lsname[index] not in _sname_suffixes:
continue
try:
# Build the surname.
surn = '%s %s' % (sname[index], sname[index+1])
@ -194,11 +203,12 @@ def normalize_name(name):
def normalize_path(path):
path = path.replace(':', '_').replace('/', '_')
if path.endswith('.'): path = path[:-1] + '_'
if path.endswith('.'):
path = path[:-1] + '_'
return path
def strip_accents(s):
if isinstance(s, str):
s = unicode(s)
s = s.decode('utf-8')
return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))

View file

@ -6,13 +6,16 @@ from . import cache
from .text import find_re
from .utils import json, ET
def get_embed_code(url, maxwidth=None, maxheight=None):
embed = {}
header = cache.get_headers(url)
if header.get('content-type', '').startswith('text/html'):
html = cache.read_url(url)
json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('<link.*?>').findall(html))
xml_oembed = filter(lambda l: 'xml+oembed' in l, re.compile('<link.*?>').findall(html))
links = re.compile('<link.*?>').findall(html)
json_oembed = [l for l in links if 'json+oembed' in l]
xml_oembed = [l for l in links if 'xml+oembed' in l]
if json_oembed:
oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
if maxwidth:
@ -21,7 +24,7 @@ def get_embed_code(url, maxwidth=None, maxheight=None):
oembed_url += '&maxheight=%d' % maxheight
embed = json.loads(cache.read_url(oembed_url))
elif xml_oembed:
oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
oembed_url = find_re(xml_oembed[0], 'href="(.*?)"')
if maxwidth:
oembed_url += '&maxwidth=%d' % maxwidth
if maxheight:

View file

@ -1,10 +1,11 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement, division, print_function
import chardet
import re
from __future__ import division, print_function
import codecs
import re
import chardet
from six import PY2
import ox
@ -12,18 +13,21 @@ __all__ = []
def _detect_encoding(fp):
bomDict={ # bytepattern : name
(0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
(0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
(0xFE, 0xFF, None, None): "utf_16_be",
(0xFF, 0xFE, None, None): "utf_16_le",
(0xEF, 0xBB, 0xBF, None): "utf_8",
}
bomDict = { # bytepattern : name
(0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
(0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
(0xFE, 0xFF, None, None): "utf_16_be",
(0xFF, 0xFE, None, None): "utf_16_le",
(0xEF, 0xBB, 0xBF, None): "utf_8",
}
# go to beginning of file and get the first 4 bytes
oldFP = fp.tell()
fp.seek(0)
(byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
if PY2:
(byte1, byte2, byte3, byte4) = [ord(b) for b in fp.read(4)]
else:
(byte1, byte2, byte3, byte4) = fp.read(4)
# try bom detection using 4 bytes, 3 bytes, or 2 bytes
bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
@ -31,18 +35,18 @@ def _detect_encoding(fp):
bomDetection = bomDict.get((byte1, byte2, byte3, None))
if not bomDetection:
bomDetection = bomDict.get((byte1, byte2, None, None))
## if BOM detected, we're done :-)
# if BOM detected, we're done :-)
fp.seek(oldFP)
if bomDetection:
return bomDetection
encoding = 'latin-1'
#more character detecting magick using http://chardet.feedparser.org/
# more character detecting magick using http://chardet.feedparser.org/
fp.seek(0)
rawdata = fp.read()
#if data can be decoded as utf-8 use that, try chardet otherwise
#chardet detects utf-8 as ISO-8859-2 most of the time
# if data can be decoded as utf-8 use that, try chardet otherwise
# chardet detects utf-8 as ISO-8859-2 most of the time
try:
data = unicode(rawdata, 'utf-8')
rawdata.decode('utf-8')
encoding = 'utf-8'
except:
encoding = chardet.detect(rawdata)['encoding']
@ -63,26 +67,30 @@ def load(filename, offset=0):
def parse_time(t):
return offset + ox.time2ms(t.replace(',', '.')) / 1000
with open(filename) as f:
with open(filename, 'rb') as f:
encoding = _detect_encoding(f)
data = f.read()
try:
data = unicode(data, encoding)
data = data.decode(encoding)
except:
try:
data = unicode(data, 'latin-1')
data = data.decode('latin-1')
except:
print("failed to detect encoding, giving up")
return srt
data = data.replace('\r\n', '\n')
srts = re.compile('(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n', re.DOTALL)
if not data.endswith('\n\n'):
data += '\n\n'
regexp = r'(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n'
srts = re.compile(regexp, re.DOTALL)
i = 0
for s in srts.findall(data):
_s = {'id': str(i),
'in': parse_time(s[0]),
'out': parse_time(s[1]),
'value': s[2].strip()
_s = {
'id': str(i),
'in': parse_time(s[0]),
'out': parse_time(s[1]),
'value': s[2].strip()
}
srt.append(_s)
i += 1

View file

@ -5,20 +5,67 @@ import math
import re
import unicodedata
from six.moves import reduce
ARTICLES = list(set([
# def sg, def pl, indef sg, indef pl (each m/f/n)
'der', 'die', 'das', 'ein', 'eine', # de
'the', 'a', 'an', # en
'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas', # es
'le', "l'", 'la', 'les', 'un', 'une', 'des', # fr
'il', 'lo', "l'" 'la', '_i', 'gli', 'le', # it
'de', 'het', 'een', # nl
'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas' # pt
# some _disabled because of collisions
'der', 'die', 'das', 'ein', 'eine', # de
'the', 'a', 'an', # en
'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas', # es
'le', "l'", 'la', 'les', 'un', 'une', 'des', # fr
'il', 'lo', "l'" 'la', '_i', 'gli', 'le', # it
'de', 'het', 'een', # nl
'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas' # pt
# some _disabled because of collisions
]))
# every given name in 0xDB that matches Xxxx-yyyy Lastname
ASIAN_FIRST_NAMES = [
'a', 'ae', 'aeng', 'ah', 'ai', 'an', 'back', 'bae', 'ban', 'bang', 'bao',
'beom', 'bi', 'bin', 'bo', 'bok', 'bon', 'bong', 'bu', 'bum', 'byeong',
'byoung', 'byung', 'cai', 'chae', 'chan', 'chang', 'chao', 'cheal', 'chen',
'cheng', 'cheol', 'cheon', 'cheong', 'cheul', 'chi', 'chia', 'chiao',
'chieh', 'chien', 'chih', 'chin', 'ching', 'cho', 'choi', 'chong', 'choo',
'chu', 'chuan', 'chuen', 'chul', 'chun', 'chung', 'chuo', 'chyi', 'da',
'dae', 'dah', 'dal', 'dan', 'deok', 'do', 'dong', 'doo', 'duek', 'duk',
'e', 'el', 'en', 'eui', 'eul', 'eun', 'eung', 'fai', 'fan', 'fang', 'fei',
'fen', 'feng', 'fo', 'foo', 'fu', 'ga', 'gae', 'gam', 'gang', 'ge', 'gen',
'geon', 'geun', 'gi', 'gil', 'gin', 'gnad', 'gok', 'goo', 'gook', 'gu',
'gun', 'gwan', 'gye', 'gyeong', 'gyu', 'gyun', 'ha', 'hae', 'hak', 'han',
'hang', 'hao', 'he', 'hee', 'heng', 'heon', 'hie', 'ho', 'hoi', 'hong',
'hoo', 'hoon', 'hou', 'hsi', 'hsiang', 'hsiao', 'hsieh', 'hsien', 'hsin',
'hsing', 'hsiung', 'hu', 'hua', 'huai', 'huang', 'hue', 'hui', 'hun',
'hung', 'hwa', 'hwan', 'hwang', 'hye', 'hyeok', 'hyeon', 'hyeong', 'hyo',
'hyuk', 'hyun', 'hyung', 'i', 'ik', 'il', 'in', 'ja', 'jae', 'jan', 'jang',
'je', 'jee', 'jen', 'jeok', 'jeong', 'jeung', 'ji', 'jia', 'jian', 'jik',
'jin', 'jing', 'jo', 'jong', 'joo', 'joon', 'ju', 'juan', 'jun', 'jung',
'ka', 'kai', 'kam', 'kan', 'kang', 'kap', 'kar', 'ke', 'kee', 'kei',
'keng', 'keum', 'keung', 'ki', 'kil', 'kin', 'kit', 'kot', 'ku', 'kua',
'kuan', 'kuang', 'kuen', 'kun', 'kuo', 'kwang', 'kwok', 'kwon', 'kwong',
'kyeong', 'kyo', 'kyoon', 'kyou', 'kyoung', 'kyu', 'kyun', 'kyung', 'lai',
'lau', 'lee', 'lei', 'leng', 'leung', 'li', 'liang', 'lien', 'lin', 'ling',
'lock', 'long', 'lun', 'lung', 'maeng', 'man', 'mei', 'mi', 'miao', 'min',
'ming', 'mo', 'mok', 'moo', 'mook', 'moon', 'mu', 'mun', 'myeong',
'myoeng', 'myong', 'myung', 'na', 'nae', 'nai', 'nam', 'nan', 'neung',
'ngaru', 'ni', 'no', 'nyeo', 'oh', 'ok', 'ou', 'pai', 'pei', 'pen', 'peng',
'pi', 'pil', 'pin', 'ping', 'po', 'pui', 'pyo', 'pyung', 'qing', 'qun',
'ra', 'rak', 'ram', 'ran', 'reum', 'ri', 'rim', 'rin', 'roe', 'rok', 'ru',
'rui', 'ryeon', 'ryol', 'ryong', 'sa', 'sae', 'san', 'sang', 'se', 'seo',
'seob', 'seok', 'seol', 'seon', 'seong', 'seung', 'shan', 'shen', 'sheng',
'shi', 'shia', 'shiang', 'shih', 'shik', 'shim', 'shin', 'shing', 'shou',
'shu', 'shun', 'si', 'sik', 'sin', 'siu', 'so', 'song', 'soo', 'sook',
'soon', 'su', 'suk', 'sun', 'sung', 'sup', 'szu', "t'ien", 'ta', 'tae',
'taek', 'tai', 'tak', 'te', 'ti', 'tian', 'ting', 'to', 'toa', 'tsai',
'tsan', 'tse', 'tso', 'tsui', 'tung', 'tzu', 'ua', 'ui', 'un', 'wah',
'wai', 'wan', 'wei', 'wen', 'weon', 'wing', 'wit', 'wol', 'won', 'woo',
'wook', 'woon', 'woong', 'wuk', 'xiao', 'ya', 'yan', 'yang', 'yao', 'ye',
'yea', 'yee', 'yeh', 'yen', 'yeo', 'yeol', 'yeon', 'yeong', 'yeop', 'yi',
'yin', 'ying', 'yiu', 'yoeng', 'yong', 'yoo', 'yoon', 'you', 'young', 'yu',
'yuan', 'yue', 'yuen', 'yuk', 'yull', 'yun', 'yune', 'yung', 'zhi',
'zhong', 'zhu'
]
# see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames
# and http://en.wikipedia.org/wiki/List_of_Korean_family_names
ASIAN_NAMES = [
ASIAN_LAST_NAMES = [
'chan', 'chang', 'chao',
'chen', 'cheong', 'cheung',
'chong', 'choo',
@ -88,8 +135,8 @@ UA_REGEXPS = {
'(Chimera)\/(\d+)',
'(chromeframe)\/(\d+)',
'(Edge)\/(\d+)',
'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
'(Chromium)\/(\d+)', # before Chrome
'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
'(Chromium)\/(\d+)', # before Chrome
'(Chrome)\/(\d+)',
'(FBForIPhone)',
'(Firefox)\/(\d+)',
@ -107,7 +154,7 @@ UA_REGEXPS = {
'(OviBrowser)\/(\d+)',
'Version\/(\d+).+(Safari)',
'(WebKit)\/(\d+)',
'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
'(Trident)\/.*?rv:(\d+)',
'(Gecko)',
'(Mozilla)\/(3|4)'
@ -117,7 +164,9 @@ UA_REGEXPS = {
'(Google Web Preview).+Chrome\/(\d+)',
'(Googlebot)\/(\d+)',
'(WebCrawler)\/(\d+)',
'(Yahoo! Slurp)\/(\d+)'
'(Yahoo! Slurp)\/(\d+)',
'(YandexBot)\/([\d\.]+)',
'(YandexMobileBot)\/([\d\.]+)',
],
'system': [
'(Android) (\d+)',
@ -130,7 +179,7 @@ UA_REGEXPS = {
'(BSD) (FreeBSD|NetBSD|OpenBSD)',
'(CPU OS) (\d+)',
'(iPhone OS) (\d+)',
'(iPhone)', # Opera
'(iPhone)', # Opera
'(J2ME\/MIDP)',
'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
@ -155,12 +204,12 @@ UA_REGEXPS = {
'(Windows) (NT \d\.\d)',
'(Windows Phone) (\d+)',
'(Windows Phone OS) (\d+)',
'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
'(Win) (9x 4\.90)', # Firefox
'(Win)(16)', # Firefox
'(Win)(9\d)', # Firefox
'(Win)(NT)', # Firefox
'(Win)(NT4\.0)', # Firefox
'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
'(Win) (9x 4\.90)', # Firefox
'(Win)(16)', # Firefox
'(Win)(9\d)', # Firefox
'(Win)(NT)', # Firefox
'(Win)(NT4\.0)', # Firefox
'(X11)'
]
}
@ -244,15 +293,41 @@ def get_sort_name(name):
>>> get_sort_name('Scorsese, Martin')
'Scorsese, Martin'
"""
if not ' ' in name or ', ' in name:
if ' ' not in name or ', ' in name:
return name
if name.lower().startswith('the '):
return get_sort_title(name)
def add_name():
if len(first_names):
last_names.insert(0, first_names.pop())
def find_name(names):
return len(first_names) and first_names[-1].lower() in names
if is_asian_name(name):
names = name.replace('-', ' ').split(' ')
if len(names) == 2:
if names[0].lower() in ASIAN_LAST_NAMES:
lastname, firstname = names
else:
firstname, lastname = names
else:
names_ = name.split(' ')
if '-' in names_[0]:
lastname, firstname = [names[2], names[0] + '-' + names[1].lower()]
elif '-' in names_[1]:
lastname, firstname = [names[0], names[1] + '-' + names[2].lower()]
elif names[0].lower() in ASIAN_FIRST_NAMES and names[2].lower() not in ASIAN_FIRST_NAMES:
lastname, firstname = [names[2], names[0] + ' ' + names[1]]
elif names[0].lower() not in ASIAN_FIRST_NAMES and names[2].lower() in ASIAN_FIRST_NAMES:
lastname, firstname = [names[0], names[1] + ' ' + names[2]]
elif names[0].lower() in ASIAN_LAST_NAMES:
lastname, firstname = [names[0], names[1] + ' ' + names[2]]
else:
lastname, firstname = [names[2], names[0] + ' ' + names[1]]
return lastname + ' ' + firstname
first_names = name.split(' ')
last_names = []
if re.search('^[0-9]+$', first_names[-1]):
@ -269,7 +344,7 @@ def get_sort_name(name):
add_name()
name = ' '.join(last_names)
if len(first_names):
separator = ' ' if last_names[0].lower() in ASIAN_NAMES else ', '
separator = ' ' if last_names[0].lower() in ASIAN_LAST_NAMES else ', '
name += separator + ' '.join(first_names)
return name
@ -299,8 +374,8 @@ def find_re(string, regexp):
return result[0].strip()
return ''
def find_string(string, string0='', string1 = ''):
"""Return the string between string0 and string1.
def find_string(string, string0='', string1=''):
"""Return the string between string0 and string1.
If string0 or string1 is left out, begining or end of string is used.
@ -324,12 +399,23 @@ def find_string(string, string0='', string1 = ''):
string1 = '$'
return find_re(string, string0 + '(.*?)' + string1)
def is_asian_name(name):
names = name.replace('-', ' ').lower().split(' ')
return (len(names) == 2 and not '-' in name and (
(names[0] in ASIAN_FIRST_NAMES and names[1] in ASIAN_LAST_NAMES) or
(names[0] in ASIAN_LAST_NAMES and names[1] in ASIAN_FIRST_NAMES)
)) or (
len(names) == 3 and names[1] in ASIAN_FIRST_NAMES and (
names[0] in ASIAN_FIRST_NAMES or names[2] in ASIAN_FIRST_NAMES
)
)
def parse_useragent(useragent):
data = {}
for key in UA_REGEXPS:
for alias, regexp in UA_ALIASES[key].items():
alias = alias if key == 'browser' else alias + ' \\1'
useragent = re.sub(regexp, alias, useragent)
useragent = re.sub(regexp, alias, useragent)
for regexp in UA_REGEXPS[key]:
data[key] = {'name': '', 'version': '', 'string': ''}
match = re.compile(regexp).search(useragent)
@ -352,7 +438,7 @@ def parse_useragent(useragent):
'version': version,
'string': string
}
break;
break
return data
def remove_special_characters(text):
@ -373,14 +459,17 @@ def wrap(text, width):
the text. Expects that existing line breaks are posix newlines (\n).
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
"""
return reduce(lambda line, word, width=width: '%s%s%s' %
(line,
' \n'[(len(line[line.rfind('\n')+1:])
+ len(word.split('\n',1)[0]
) >= width)],
word),
text.split(' ')
)
def reduce_line(line, word):
return '%s%s%s' % (
line,
' \n'[
(len(line[line.rfind('\n')+1:]) + len(word.split('\n', 1)[0]) >= width)
],
word
)
return reduce(reduce_line, text.split(' '))
def wrap_string(string, length=80, separator='\n', balance=False):
'''
@ -404,7 +493,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
for word in words:
if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
# word fits in current line
lines[len(lines) - 1] += word + u' ';
lines[len(lines) - 1] += word + u' '
else:
if len(word) <= length:
# word fits in next line
@ -414,7 +503,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
position = length - len(lines[len(lines) - 1])
lines[len(lines) - 1] += word[0:position]
for i in range(position, len(word), length):
lines.append(word[i:i+length]);
lines.append(word[i:i+length])
lines[len(lines) - 1] += u' '
return separator.join(lines).strip()
@ -425,7 +514,7 @@ def truncate_string(string, length, padding='...', position='right'):
# 'anticon...lement'
# >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
# 'anticonstitut...'
stringLength = len(string);
stringLength = len(string)
paddingLength = len(padding)
if stringLength > length:
if position == 'left':
@ -436,7 +525,7 @@ def truncate_string(string, length, padding='...', position='right'):
string = '%s%s%s' % (string[:left], padding, string[right:])
elif position == 'right':
string = '%s%s' % (string[:length - paddingLength], padding)
return string;
return string
def truncate_words(s, num):
"""Truncates a string after a certain number of chacters, but ends with a word
@ -473,7 +562,7 @@ def trim_string(string, num):
def get_valid_filename(s):
"""
Returns the given string converted to a string that can be used for a clean
filename. Specifically, leading and trailing spaces are removed;
filename. Specifically, leading and trailing spaces are removed;
all non-filename-safe characters are removed.
>>> get_valid_filename("john's portrait in 2004.jpg")
@ -498,9 +587,11 @@ def get_text_list(list_, last_word='or'):
>>> get_text_list([])
''
"""
if len(list_) == 0: return ''
if len(list_) == 1: return list_[0]
return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1])
if len(list_) == 0:
return ''
if len(list_) == 1:
return list_[0]
return u'%s %s %s' % (u', '.join([i for i in list_][:-1]), last_word, list_[-1])
def get_list_text(text, last_word='or'):
"""
@ -519,7 +610,7 @@ def get_list_text(text, last_word='or'):
if text:
list_ = text.split(u', ')
if list_:
i=len(list_)-1
i = len(list_)-1
last = list_[i].split(last_word)
if len(last) == 2:
list_[i] = last[0].strip()
@ -531,11 +622,11 @@ def normalize_newlines(text):
def recapitalize(text):
"Recapitalizes text, placing caps after end-of-sentence punctuation."
#capwords = ()
# capwords = ()
text = text.lower()
capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
text = capsRE.sub(lambda x: x.group(1).upper(), text)
#for capword in capwords:
# for capword in capwords:
# capwordRE = re.compile(r'\b%s\b' % capword, re.I)
# text = capwordRE.sub(capword, text)
return text
@ -543,22 +634,28 @@ def recapitalize(text):
def phone2numeric(phone):
"Converts a phone number with letters into its numeric equivalent."
letters = re.compile(r'[A-PR-Y]', re.I)
char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
'y': '9', 'x': '9'}.get(m.group(0).lower())
def char2number(m):
return {
'a': '2', 'c': '2', 'b': '2', 'e': '3',
'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
'y': '9', 'x': '9'
}.get(m.group(0).lower())
return letters.sub(char2number, phone)
def compress_string(s):
import cStringIO, gzip
zbuf = cStringIO.StringIO()
import gzip
from six import BytesIO
zbuf = BytesIO()
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
zfile.write(s)
zfile.close()
return zbuf.getvalue()
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smart_split(text):
"""
Generator that splits a string by spaces, leaving quoted phrases together.
@ -582,17 +679,17 @@ def words(text):
returns words in text, removing punctuation
"""
text = text.split()
return map(lambda x: re.sub("(([.!?:-_]|'s)$)", '', x), text)
return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text]
def sort_string(string):
string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th')
#pad numbered titles
# pad numbered titles
string = re.sub('(\d),(\d{3})', '\\1\\2', string)
string = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), string)
return unicodedata.normalize('NFKD', string)
def sorted_strings(strings, key=None):
if not key:
key = lambda k: sort_string(k)
key = sort_string
return sorted(strings, key=key)

View file

@ -14,8 +14,8 @@ else:
__all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size']
def create_torrent(file, url, params = {}, flag = Event(),
progress = lambda x: None, progress_percent = 1):
def create_torrent(file, url, params={}, flag=Event(),
progress=lambda x: None, progress_percent=1):
"Creates a torrent for a given file, using url as tracker url"
from .makemetafile import make_meta_file
return make_meta_file(file, url, params, flag, progress, progress_percent)

View file

@ -1,5 +1,6 @@
# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman
# see LICENSE.txt for license information
from __future__ import print_function
from types import IntType, LongType, StringType, ListType, TupleType, DictType
try:
@ -53,8 +54,8 @@ def decode_dict(x, f):
lastkey = None
while x[f] != 'e':
k, f = decode_string(x, f)
#why is this needed
#if lastkey >= k:
# why is this needed
# if lastkey >= k:
# raise ValueError
lastkey = k
r[k], f = decode_func[x[f]](x, f)
@ -81,9 +82,9 @@ def bdecode(x, sloppy = 1):
r, l = decode_func[x[0]](x, 0)
# except (IndexError, KeyError):
except (IndexError, KeyError, ValueError):
raise ValueError, "bad bencoded data"
raise ValueError("bad bencoded data")
if not sloppy and l != len(x):
raise ValueError, "bad bencoded data"
raise ValueError("bad bencoded data")
return r
def test_bdecode():
@ -102,10 +103,10 @@ def test_bdecode():
assert 0
except ValueError:
pass
assert bdecode('i4e') == 4L
assert bdecode('i0e') == 0L
assert bdecode('i123456789e') == 123456789L
assert bdecode('i-10e') == -10L
assert bdecode('i4e') == 4
assert bdecode('i0e') == 0
assert bdecode('i123456789e') == 123456789
assert bdecode('i-10e') == -10
try:
bdecode('i-0e')
assert 0
@ -287,7 +288,7 @@ def bencode(x):
try:
encode_func[type(x)](x, r)
except:
print "*** error *** could not encode type %s (value: %s)" % (type(x), x)
print("*** error *** could not encode type %s (value: %s)" % (type(x), x))
assert 0
return ''.join(r)
@ -295,7 +296,7 @@ def test_bencode():
assert bencode(4) == 'i4e'
assert bencode(0) == 'i0e'
assert bencode(-10) == 'i-10e'
assert bencode(12345678901234567890L) == 'i12345678901234567890e'
assert bencode(12345678901234567890) == 'i12345678901234567890e'
assert bencode('') == '0:'
assert bencode('abc') == '3:abc'
assert bencode('1234567890') == '10:1234567890'

View file

@ -4,139 +4,151 @@
#
##
def _decode_int(data):
"""
decode integer from bytearray
return int, remaining data
"""
data = data[1:]
end = data.index(b'e')
return int(data[:end],10), data[end+1:]
class Decoder(object):
def _decode_str(data):
"""
decode string from bytearray
return string, remaining data
"""
start = data.index(b':')
l = int(data[:start].decode(),10)
if l <= 0:
raise Exception('invalid string size: %d'%d)
start += 1
ret = bytes(data[start:start+l])
data = data[start+l:]
return ret, data
def _decode_int(self):
"""
decode integer from bytearray
return int
"""
self.idx += 1
start = self.idx
end = self.data.index(b'e', self.idx)
self.idx = end + 1
return int(self.data[start:end])
def _decode_list(data):
"""
decode list from bytearray
return list, remaining data
"""
ls = []
data = data[1:]
while data[0] != ord(b'e'):
elem, data = _decode(data)
ls.append(elem)
return ls, data[1:]
def _decode_str(self):
"""
decode string from bytearray
return string
"""
start = self.data.index(b':', self.idx)
l = int(self.data[self.idx:start].decode(), 10)
if l < 0:
raise Exception('invalid string size: %d' % l)
start += 1
ret = self.data[start:start+l]
try:
ret = ret.decode('utf-8')
except:
pass
self.idx = start + l
return ret
def _decode_dict(data):
"""
decode dict from bytearray
return dict, remaining data
"""
d = {}
data = data[1:]
while data[0] != ord(b'e'):
k, data = _decode_str(data)
v, data = _decode(data)
d[k.decode()] = v
return d, data[1:]
def _decode_list(self):
"""
decode list from bytearray
return list
"""
ls = []
self.idx += 1
while self.data[self.idx] != ord(b'e'):
ls.append(self._decode())
self.idx += 1
return ls
def _decode(data):
"""
decode a bytearray
return deserialized object, remaining data
"""
ch = chr(data[0])
if ch == 'l':
return _decode_list(data)
elif ch == 'i':
return _decode_int(data)
elif ch == 'd':
return _decode_dict(data)
elif ch.isdigit():
return _decode_str(data)
else:
raise Exception('could not deserialize data: %s'%data)
def _decode_dict(self):
"""
decode dict from bytearray
return dict
"""
d = {}
self.idx += 1
while self.data[self.idx] != ord(b'e'):
k = self._decode_str()
v = self._decode()
d[k] = v
self.idx += 1
return d
def _decode(self):
ch = chr(self.data[self.idx])
if ch == 'l':
return self._decode_list()
elif ch == 'i':
return self._decode_int()
elif ch == 'd':
return self._decode_dict()
elif ch.isdigit():
return self._decode_str()
else:
raise Exception('could not decode data: %s' % data)
def decode(self, data):
self.idx = 0
self.data = data
obj = self._decode()
if len(data) != self.idx:
raise Exception('failed to decode, extra data: %s' % data)
return obj
def bdecode(data):
"""
decode a bytearray
return deserialized object
return decoded object
"""
obj , data = _decode(data)
if len(data) > 0:
raise Exception('failed to deserialize, extra data: %s'%data)
return obj
return Decoder().decode(data)
def _encode_str(s,buff):
def _encode_str(s, buff):
"""
encode string to a buffer
"""
s = bytearray(s)
l = len(s)
buff.append(bytearray(str(l)+':','utf-8'))
buff.append(bytearray(str(l)+':', 'utf-8'))
buff.append(s)
def _encode_int(i,buff):
def _encode_int(i, buff):
"""
encode integer to a buffer
"""
buff.append(b'i')
buff.append(bytearray(str(i),'ascii'))
buff.append(bytearray(str(i), 'ascii'))
buff.append(b'e')
def _encode_list(l,buff):
def _encode_list(l, buff):
"""
encode list of elements to a buffer
"""
buff.append(b'l')
for i in l:
_encode(i,buff)
_encode(i, buff)
buff.append(b'e')
def _encode_dict(d,buff):
def _encode_dict(d, buff):
"""
encode dict
"""
buff.append(b'd')
l = list(d.keys())
l.sort()
for k in l:
_encode(str(k),buff)
_encode(d[k],buff)
for k in sorted(d):
if not isinstance(k, (bytes, str)):
k = str(k)
_encode(k, buff)
_encode(d[k], buff)
buff.append(b'e')
def _encode(obj,buff):
def _encode(obj, buff):
"""
encode element obj to a buffer buff
"""
if isinstance(obj,str):
_encode_str(bytearray(obj,'utf-8'),buff)
elif isinstance(obj,bytes):
_encode_str(bytearray(obj),buff)
elif isinstance(obj,bytearray):
_encode_str(obj,buff)
if isinstance(obj, str):
_encode_str(bytearray(obj, 'utf-8'), buff)
elif isinstance(obj, bytes):
_encode_str(bytearray(obj), buff)
elif isinstance(obj, bytearray):
_encode_str(obj, buff)
elif str(obj).isdigit():
_encode_int(obj,buff)
elif isinstance(obj,list):
_encode_list(obj,buff)
elif hasattr(obj,'keys') and hasattr(obj,'values'):
_encode_dict(obj,buff)
elif str(obj) in ['True','False']:
_encode_int(int(obj and '1' or '0'),buff)
_encode_int(obj, buff)
elif isinstance(obj, int):
_encode_int(obj, buff)
elif isinstance(obj, list):
_encode_list(obj, buff)
elif hasattr(obj, 'keys') and hasattr(obj, 'values'):
_encode_dict(obj, buff)
elif str(obj) in ['True', 'False']:
_encode_int(int(obj and '1' or '0'), buff)
else:
raise Exception('non serializable object: %s'%obj)
raise Exception('non serializable object: %s [%s]' % (obj, type(obj)))
def bencode(obj):
@ -144,8 +156,8 @@ def bencode(obj):
bencode element, return bytearray
"""
buff = []
_encode(obj,buff)
ret = bytearray()
_encode(obj, buff)
ret = bytearray()
for ba in buff:
ret += ba
ret += ba
return bytes(ret)

View file

@ -3,7 +3,7 @@
import codecs
import ox
from . import srt
def _webvtt_timecode(t):
return ox.format_duration(t * 1000, years=False)
@ -30,3 +30,13 @@ def encode(data, webvtt=False):
)
return codecs.BOM_UTF8 + srt.encode('utf-8')
def load(filename, offset=0):
'''Parses vtt file
filename: path to an vtt file
offset (float, seconds): shift all in/out points by offset
Returns list with dicts that have in, out, value and id
'''
return srt.load(filename, offset)

View file

@ -2,6 +2,7 @@ from __future__ import print_function
import json
import re
from six import text_type
from ox.cache import read_url
HEADERS = {
@ -16,9 +17,9 @@ USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
def get_movie_data(title, director):
if isinstance(title, unicode):
if isinstance(title, text_type):
title = title.encode('utf-8')
if isinstance(director, unicode):
if isinstance(director, text_type):
director = director.encode('utf-8')
data = {}
# itunes section (preferred source for link)
@ -45,7 +46,7 @@ def get_movie_data(title, director):
results = js['results']
if results:
url = host + results[0]['location']
if not 'link' in data:
if 'link' not in data:
data['link'] = url
headers = {
'User-Agent': USER_AGENT

View file

@ -17,7 +17,7 @@ def get(key):
if key in auth:
return auth[key]
print("please add key %s to json file '%s'" % (key, user_auth))
raise Exception,"no key %s found" % key
raise Exception("no key %s found" % key)
def update(key, value):
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
@ -31,4 +31,3 @@ def update(key, value):
f = open(user_auth, "w")
f.write(json.dumps(auth, indent=2))
f.close()

View file

@ -8,13 +8,13 @@ from ox.cache import read_url
from ox.html import strip_tags, decode_html
from ox.text import find_re
import imdb
from . import imdb
def get_id(url):
return url.split("/")[-1]
def get_url(id):
return "http://www.criterion.com/films/%s" % id
return "https://www.criterion.com/films/%s" % id
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
'''
@ -28,23 +28,34 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
'''
data = {
"id": id,
"url": get_url(id)
}
try:
html = read_url(data["url"], timeout=timeout, unicode=True)
except:
html = ox.cache.read_url(data["url"], timeout=timeout)
data["number"] = find_re(html, "<li>Spine #(\d+)")
html = read_url(data["url"], timeout=timeout).decode('utf-8', 'ignore')
data["title"] = decode_html(find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>"))
data["number"] = find_re(html, "<b>Spine #(\d+)")
data["title"] = decode_html(find_re(html, "<h1 class=\"header__primarytitle\".*?>(.*?)</h1>"))
data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip()
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
results = find_re(html, '<div class="left_column">(.*?)</div>')
results = re.compile("<li>(.*?)</li>").findall(results)
data["country"] = results[0]
data["year"] = results[1]
results = find_re(html, '<ul class="film-meta-list">(.*?)</ul>')
info = re.compile('<li itemprop="(.*?)".*?>(.*?)</li>', re.DOTALL).findall(results)
info = {k: strip_tags(v).strip() for k, v in info}
if 'director' in info:
data['director'] = info['director']
if 'countryOfOrigin' in info:
data['country'] = [c.strip() for c in decode_html(info['countryOfOrigin']).split(', ')]
if 'inLanguage' in info:
data['language'] = [l.strip() for l in decode_html(info['inLanguage']).split(', ')]
for v in re.compile('<li>(.*?)</li>', re.DOTALL).findall(results):
if 'datePublished' in v:
data['year'] = strip_tags(v).strip()
elif 'duration' in v:
data['duration'] = strip_tags(v).strip()
data["synopsis"] = decode_html(strip_tags(find_re(html,
"<div class=\"content_block last\">.*?<p>(.*?)</p>")))
"<div class=\"product-summary\".*?>.*?<p>(.*?)</p>")))
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
@ -56,47 +67,46 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
data["posters"] = [result]
else:
html_ = read_url(result, unicode=True)
result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = find_re(html_, '//www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = find_re(result, "src=\"(.*?)\"")
if result:
data["posters"] = [result.replace("_w100", "")]
else:
data["posters"] = []
data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
data['posters'] = [p for p in data['posters'] if p]
posters = find_re(html, '<div class="product-box-art".*?>(.*?)</div>')
for poster in re.compile('<img src="(.*?)"').findall(posters):
data['posters'].append(poster)
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result:
data["stills"] = [result]
data["trailers"] = []
else:
data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
data["stills"] = list(filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")]))
data["trailers"] = list(filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")]))
if timeout == ox.cache.cache_timeout:
timeout = -1
if get_imdb:
if get_imdb and 'title' in data and 'director' in data:
# removed year, as "title (year)" may fail to match
data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
return data
def get_ids(page=None):
ids = []
if page:
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
html = read_url(url)
results = re.compile("films/(\d+)").findall(html)
html = read_url("https://www.criterion.com/shop/browse/list?sort=spine_number", unicode=True)
results = re.compile("films/(\d+)-").findall(html)
ids += results
results = re.compile("boxsets/(.*?)\"").findall(html)
for result in results:
html = read_url("https://www.criterion.com/boxsets/" + result, unicode=True)
results = re.compile("films/(\d+)-").findall(html)
ids += results
results = re.compile("boxsets/(.*?)\"").findall(html)
for result in results:
html = read_url("http://www.criterion.com/boxsets/" + result)
results = re.compile("films/(\d+)").findall(html)
ids += results
return set(ids)
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
results = re.compile("\&amp;p=(\d+)\&").findall(html)
pages = max(map(int, results))
for page in range(1, pages):
ids += get_ids(page)
return sorted(set(ids), key=int)
if __name__ == '__main__':
print(get_ids())

View file

@ -1,21 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves.urllib.parse import unquote
from ox.cache import read_url
def get_video_url(url):
'''
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
'''
data = read_url(url)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
v = unquote(v).split('@@')[0]
return v
return ''
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves.urllib.parse import unquote
from ox.cache import read_url
def get_video_url(url):
'''
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
'''
data = read_url(url)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
v = unquote(v).split('@@')[0]
return v
return ''

View file

@ -6,17 +6,25 @@ from six.moves import urllib
import ox
from ox import strip_tags, decode_html
from ox.cache import read_url
import lxml.html
def find(query, timeout=ox.cache.cache_timeout):
"""
Returns tuples with title, url, description
"""
if not isinstance(query, bytes):
query = query.encode('utf-8')
params = urllib.parse.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params
data = read_url(url, timeout=timeout).decode('utf-8')
doc = lxml.html.document_fromstring(data)
results = []
regex = '<a .*?class="large" href="(.+?)">(.*?)</a>.*?<div class="snippet">(.*?)</div>'
for r in re.compile(regex, re.DOTALL).findall(data):
results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
for e in doc.xpath("//a[contains(@class, 'result__a')]"):
url = e.attrib['href']
if 'uddg=' in url:
url = urllib.parse.unquote(url.split('&uddg=')[-1])
title = e.text_content()
description = ''
results.append((title, url, description))
return results

View file

@ -7,7 +7,7 @@ import time
from ox import strip_tags, find_re
from ox.cache import read_url
import google
from . import google
def get_show_url(title):

View file

@ -21,11 +21,11 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
"""
Return max_results tuples with title, url, description
>>> find("The Matrix site:imdb.com", 1)[0][0]
u'The Matrix (1999) - IMDb'
>>> str(find("The Matrix site:imdb.com", 1)[0][0])
'The Matrix (1999) - IMDb'
>>> find("The Matrix site:imdb.com", 1)[0][1]
u'http://www.imdb.com/title/tt0133093/'
>>> str(find("The Matrix site:imdb.com", 1)[0][1])
'http://www.imdb.com/title/tt0133093/'
"""
results = []
offset = 0

View file

@ -7,7 +7,7 @@ import time
import unicodedata
from six.moves.urllib.parse import urlencode
from six import string_types
from six import text_type, string_types
from .. import find_re, strip_tags, decode_html
from .. import cache
@ -18,22 +18,95 @@ from . import duckduckgo
from ..utils import datetime
from ..geo import normalize_country_name
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
# https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
headers['X-Forwarded-For'] = '72.21.206.80'
return url, data, headers, timeout, unicode
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
url, data, headers, timeout, unicode = prepare_url(url, data, headers, timeout, valid, unicode)
return cache.read_url(url, data, headers, timeout, unicode=unicode)
def delete_url(url, data=None, headers=cache.DEFAULT_HEADERS):
url, data, headers, timeout, unicode = prepare_url(url, data, headers)
cache.store.delete(url, data, headers)
def get_url(id):
return "http://www.imdb.com/title/tt%s/" % id
def reference_section(id):
return {
'page': 'reference',
're': [
'<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id),
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
}
def zebra_list(label, more=None):
conditions = {
'page': 'reference',
're': [
'_label">' + label + '</td>.*?<ul(.*?)</ul>',
'<li.*?>(.*?)</li>'
],
'type': 'list',
}
if more:
conditions['re'] += more
return conditions
def zebra_table(label, more=None, type='string'):
conditions = {
'page': 'reference',
're': [
'_label">' + label + '</td>.*?<td>(.*?)</td>',
],
'type': type,
}
if more:
conditions['re'] += more
return conditions
def parse_aspectratio(value):
r = value
if ':' in value:
r = value.split(':')
n = r[0]
d = r[1].strip().split(' ')[0]
try:
if float(d):
value = str(float(n) / float(d))
else:
value = str(float(n))
except:
print('failed to parse aspect: %s' % value)
else:
value = '.'.join(value.strip().split('.')[:2])
return value
'''
'posterIds': {
'page': 'posters',
're': '/unknown-thumbnail/media/rm(.*?)/tt',
'type': 'list'
},
'''
class Imdb(SiteParser):
'''
>>> Imdb('0068646')['title']
u'The Godfather'
>>> Imdb('0068646')['title'] == text_type(u'The Godfather')
True
>>> Imdb('0133093')['title']
u'The Matrix'
>>> Imdb('0133093')['title'] == text_type(u'The Matrix')
True
'''
regex = {
regex = {
'alternativeTitles': {
'page': 'releaseinfo',
're': [
@ -41,98 +114,49 @@ class Imdb(SiteParser):
"td>(.*?)</td>.*?<td>(.*?)</td>"
],
'type': 'list'
},
'aspectratio': {
'page': 'combined',
're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
'page': 'reference',
're': [
'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
parse_aspectratio,
],
'type': 'float',
},
'budget': {
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
],
'type': 'int'
},
'budget': zebra_table('Budget', more=[
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
], type='int'),
'cast': {
'page': 'combined',
'page': 'reference',
're': [
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
lambda ll: [strip_tags(l) for l in ll]
],
'type': 'list'
},
'cinematographer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Cinematography by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
' <table class="cast_list">(.*?)</table>',
'<td.*?itemprop="actor".*?>.*?>(.*?)</a>.*?<td class="character">(.*?)</td>',
lambda ll: [strip_tags(l) for l in ll] if isinstance(ll, list) else strip_tags(ll)
],
'type': 'list'
},
'cinematographer': reference_section('cinematographers'),
'connections': {
'page': 'movieconnections',
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
'type': 'list'
},
'country': {
'page': 'combined',
're': [
'<div class="info"><h5>Country:</h5>.*?<div class="info">',
#'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
'<a.*?>(.*?)</a>',
],
'type': 'list'
},
'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
'creator': {
'page': 'combined',
'page': '',
're': [
'<h5>Creator.?:</h5>.*?<div class="info-content">(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'director': {
'page': 'combined',
're': [
lambda data: data.split('<b>Series Crew</b>')[0],
'Directed by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'_director': {
'page': 'combined',
're': [
'<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'editor': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Film Editing by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'composer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Original Music by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
'<div class="credit_summary_item">.*?<h4.*?>Creator.?:</h4>(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>',
lambda ll: strip_tags(ll)
],
'type': 'list'
},
'director': reference_section('directors'),
'editor': reference_section('editors'),
'composer': reference_section('composers'),
'episodeTitle': {
'page': 'combined',
're': '<div id="tn15title">.*?<em>(.*?)</em>',
'page': 'reference',
're': '<h3 itemprop="name">(.*?)<',
'type': 'string'
},
'filmingLocations': {
@ -143,71 +167,44 @@ class Imdb(SiteParser):
],
'type': 'list'
},
'genre': {
'page': 'combined',
're': [
'<h5>Genre:</h5>(.*?)<hr',
'<a href="/Sections/Genres/.*?/">(.*?)</a>'
],
'type': 'list'
},
'gross': {
'page': 'business',
're': [
'<h5>Gross</h5>\s*?\$(.*?)<br',
lambda data: find_re(data.replace(',', ''), '\d+')
],
'type': 'int'
},
'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
'gross': zebra_table('Cumulative Worldwide Gross', more=[
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
], type='int'),
'keyword': {
'page': 'keywords',
're': '<a href="/keyword/.*?>(.*?)</a>',
'type': 'list'
},
'language': {
'page': 'combined',
're': [
'<div class="info"><h5>Language:</h5>.*?<div class="info">',
#'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
'<a.*?>(.*?)</a>',
],
'type': 'list'
},
'summary': {
'page': 'plotsummary',
're': '<p class="plotSummary">(.*?)<\/p>',
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
'originalTitle': {
'page': 'releaseinfo',
're': '<td>\(original title\)</td>\s*<td>(.*?)</td>',
'type': 'string'
},
'summary': zebra_table('Plot Summary', more=[
'<p>(.*?)<em'
]),
'posterId': {
'page': 'combined',
're': '/primary-photo/media/rm(.*?)/tt',
'page': 'reference',
're': '<img.*?class="titlereference-primary-image".*?src="(.*?)".*?>',
'type': 'string'
},
'posterIds': {
'page': 'posters',
're': '/unknown-thumbnail/media/rm(.*?)/tt',
'type': 'list'
},
'producer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Produced by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'producer': reference_section('producers'),
'productionCompany': {
'page': 'combined',
'page': 'reference',
're': [
'Production Companies</b><ul>(.*?)</ul>',
'Production Companies.*?<ul(.*?)</ul>',
'<a href="/company/.*?/">(.*?)</a>'
],
'type': 'list'
},
'rating': {
'page': 'combined',
're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
'page': 'reference',
're': [
'<div class="ipl-rating-star ">(.*?)</div>',
'ipl-rating-star__rating">([\d,.]+?)</span>',
],
'type': 'float'
},
'releasedate': {
@ -218,64 +215,55 @@ class Imdb(SiteParser):
],
'type': 'list'
},
'reviews': {
'page': 'externalreviews',
're': [
'<ol>(.*?)</ol>',
'<li><a href="(http.*?)".*?>(.*?)</a></li>'
],
'type': 'list'
},
'runtime': {
'page': 'combined',
're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
'type': 'string'
},
'color': {
'page': 'combined',
're': [
'<h5>Color:</h5><div class="info-content">(.*?)</div>',
'<a.*?>(.*?)</a>'
],
'type': 'list'
},
'sound': {
'page': 'combined',
're': [
'<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
'<a.*?>(.*?)</a>'
],
'type': 'list'
},
#FIXME using some /offsite/ redirect now
#'reviews': {
# 'page': 'externalreviews',
# 're': [
# '<ul class="simpleList">(.*?)</ul>',
# '<li>.*?<a href="(http.*?)".*?>(.*?)</a>.*?</li>'
# ],
# 'type': 'list'
#},
'runtime': zebra_list('Runtime'),
'color': zebra_list('Color', more=[
'<a.*?>([^(<]+)',
lambda r: r[0] if isinstance(r, list) else r,
strip_tags
]),
'sound': zebra_list('Sound Mix', more=[
'<a.*?>([^(<]+)',
lambda r: r[0] if isinstance(r, list) else r,
strip_tags
]),
'season': {
'page': 'combined',
'page': 'reference',
're': [
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
'\(Season (\d+), Episode \d+\)',
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
'Season (\d+)',
],
'type': 'int'
},
'episode': {
'page': 'combined',
'page': 'reference',
're': [
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
'\(Season \d+, Episode (\d+)\)',
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
'Episode (\d+)',
],
'type': 'int'
},
'series': {
'page': 'combined',
're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
'page': 'reference',
're': '<h4 itemprop="name">.*?<a href="/title/tt(\d{7})',
'type': 'string'
},
'isSeries': {
'page': 'combined',
're': '<span class="tv-extra">(TV series|TV mini-series) ',
'page': 'reference',
're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
'type': 'string'
},
'title': {
'page': 'combined',
're': '<h1>(.*?) <span>',
'page': 'releaseinfo',
're': 'h3 itemprop="name">.*?>(.*?)</a>',
'type': 'string'
},
'trivia': {
@ -287,38 +275,45 @@ class Imdb(SiteParser):
'type': 'list',
},
'votes': {
'page': 'combined',
're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
'page': 'reference',
're': [
'class="ipl-rating-star__total-votes">\((.*?)\)',
lambda r: r.replace(',', '')
],
'type': 'string'
},
'writer': {
'page': 'combined',
'writer': reference_section('writers'),
'year': {
'page': 'reference',
're': [
lambda data: data.split('Series Crew')[0],
'Writing credits</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
'<span class="titlereference-title-year">(.*?)</span>',
'<a.*?>(\d+)',
],
'type': 'int'
},
'credits': {
'page': 'fullcredits',
're': [
lambda data: data.split('<h4'),
'>(.*?)</h4>.*?(<table.*?</table>)',
lambda data: [d for d in data if d]
],
'type': 'list'
},
'year': {
'page': 'combined',
're': '="og:title" content="[^"]*?\((\d{4}).*?"',
'type': 'int'
}
}
def read_url(self, url, timeout):
if not url in self._cache:
if url not in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url]
def __init__(self, id, timeout=-1):
#use akas.imdb.com to always get original title:
#http://www.imdb.com/help/show_leaf?titlelanguagedisplay
self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
# use akas.imdb.com to always get original title:
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
super(Imdb, self).__init__(timeout)
url = self.baseUrl + 'combined'
url = self.baseUrl + 'reference'
page = self.read_url(url, timeout=-1)
if '<title>IMDb: Page not found</title>' in page \
or 'The requested URL was not found on our server.' in page:
@ -332,119 +327,15 @@ class Imdb(SiteParser):
isinstance(self['alternativeTitles'][0], string_types):
self['alternativeTitles'] = [self['alternativeTitles']]
for key in ('country', 'genre', 'language', 'sound', 'color'):
if key in self:
self[key] = [x[0] if len(x) == 1 and isinstance(x, list) else x for x in self[key]]
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#normalize country names
if 'country' in self:
self['country'] = [normalize_country_name(c) or c for c in self['country']]
if 'sound' in self:
self['sound'] = list(set(self['sound']))
types = {}
stop_words = [
'alternative spelling',
'alternative title',
'alternative transliteration',
'closing credits title',
'complete title',
'IMAX version',
'informal short title',
'International (Spanish title)',
'Japan (imdb display title)',
'longer version',
'new title',
'original subtitled version',
'pre-release title',
'promotional abbreviation',
'recut version',
'reissue title',
'restored version',
'script title',
'short title',
'(subtitle)',
'TV title',
'working title',
'World-wide (Spanish title)',
]
#ignore english japanese titles
#for movies that are not only from japan
if ['Japan'] != self.get('country', []):
stop_words += [
'Japan (English title)'
]
for t in self.get('alternativeTitles', []):
for type in t[0].split('/'):
type = type.strip()
stop_word = False
for key in stop_words:
if key in type:
stop_word = True
break
if not stop_word:
if not type in types:
types[type] = []
types[type].append(t[1])
titles = {}
for type in types:
for title in types[type]:
if not title in titles:
titles[title] = []
titles[title].append(type)
def select_title(type):
title = types[type][0]
count = 0
if len(types[type]) > 1:
for t in types[type]:
if len(titles[t]) > count:
count = len(titles[t])
title = t
return title
#FIXME: does work in python2.6, possible to import from __future__?
#types = {type: select_title(type) for type in types}
_types = {}
for type in types:
_types[type] = select_title(type)
types = _types
regexps = [
"^.+ \(imdb display title\) \(English title\)$",
"^USA \(imdb display title\)$",
"^International \(English title\)$",
"^International \(English title\)$",
"^UK \(imdb display title\)$",
"^International \(.+\) \(English title\)$",
"^World-wide \(English title\)$",
]
if 'Hong Kong' in self.get('country', []):
regexps += [
"Hong Kong \(English title\)"
]
english_countries = (
'USA', 'UK', 'United States', 'United Kingdom',
'Australia', 'New Zealand'
)
if not filter(lambda c: c in english_countries, self.get('country', [])):
regexps += [
"^[^(]+ \(English title\)$",
"^.+ \(.+\) \(English title\)$",
"^USA$",
"^UK$",
"^USA \(.+\)$",
"^UK \(.+\)$",
"^Australia \(.+\)$",
"World-wide \(English title\)",
"\(literal English title\)",
"^International \(.+ title\)$",
"^International \(.+\) \(.+ title\)$",
]
for regexp in regexps:
for type in types:
if re.compile(regexp).findall(type):
#print types[type], type
self['internationalTitle'] = types[type]
break
if 'internationalTitle' in self:
break
def cleanup_title(title):
if title.startswith('"') and title.endswith('"'):
@ -454,44 +345,43 @@ class Imdb(SiteParser):
title = re.sub('\(\#[.\d]+\)', '', title)
return title.strip()
for t in ('title', 'internationalTitle'):
for t in ('title', 'originalTitle'):
if t in self:
self[t] = cleanup_title(self[t])
if 'internationalTitle' in self and \
self.get('title', '').lower() == self['internationalTitle'].lower():
del self['internationalTitle']
if 'alternativeTitles' in self:
alt = {}
for t in self['alternativeTitles']:
title = cleanup_title(t[1])
if title not in (self.get('title'), self.get('internationalTitle')):
if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
if title not in alt:
alt[title] = []
for c in t[0].split('/'):
if not '(working title)' in c:
c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
if c:
alt[title].append(c)
for cleanup in ('International', '(working title)', 'World-wide'):
c = c.replace(cleanup, '')
c = c.split('(')[0].strip()
if c:
alt[title].append(c)
self['alternativeTitles'] = []
for t in sorted(alt, key=lambda a: sorted(alt[a])):
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
countries = sorted(set([normalize_country_name(c) or c for c in alt[t]]))
self['alternativeTitles'].append((t, countries))
if not self['alternativeTitles']:
del self['alternativeTitles']
if 'internationalTitle' in self:
self['originalTitle'] = self['title']
self['title'] = self.pop('internationalTitle')
if 'runtime' in self and self['runtime']:
if 'min' in self['runtime']: base=60
else: base=1
if isinstance(self['runtime'], list):
self['runtime'] = self['runtime'][0]
if 'min' in self['runtime']:
base = 60
else:
base = 1
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
if 'runtime' in self and not self['runtime']:
del self['runtime']
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'sound' in self:
self['sound'] = list(sorted(set(self['sound'])))
if 'cast' in self:
if isinstance(self['cast'][0], string_types):
@ -499,6 +389,7 @@ class Imdb(SiteParser):
self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c):
c = c.replace('(uncredited)', '').strip()
c = re.sub('\s+', ' ', c)
return c
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
for x in self['cast']]
@ -522,18 +413,8 @@ class Imdb(SiteParser):
return r
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
for key in ('country', 'genre'):
if key in self:
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999
if '_director' in self:
if 'series' in self or 'isSeries' in self:
self['creator'] = self.pop('_director')
else:
del self['_director']
if 'isSeries' in self:
del self['isSeries']
self['isSeries'] = True
@ -555,7 +436,7 @@ class Imdb(SiteParser):
if 'director' in self:
self['episodeDirector'] = self['director']
if not 'creator' in series and 'director' in series:
if 'creator' not in series and 'director' in series:
series['creator'] = series['director']
if len(series['creator']) > 10:
series['creator'] = series['director'][:1]
@ -566,7 +447,7 @@ class Imdb(SiteParser):
if 'year' in series:
self['seriesYear'] = series['year']
if not 'year' in self:
if 'year' not in self:
self['year'] = series['year']
if 'year' in self:
@ -620,11 +501,48 @@ class Imdb(SiteParser):
self['summary'] = self['summary'][0]
self['summary'] = self['summary'].split('</p')[0].strip()
if 'credits' in self:
credits = [
[
strip_tags(d[0].replace(' by', '')).strip(),
[
[
strip_tags(x[0]).strip(),
[t.strip().split(' (')[0].strip() for t in x[2].split(' / ')]
]
for x in
re.compile('<td class="name">(.*?)</td>.*?<td>(.*?)</td>.*?<td class="credit">(.*?)</td>', re.DOTALL).findall(d[1])
]
] for d in self['credits'] if d
]
credits = [c for c in credits if c[1]]
self['credits'] = []
self['lyricist'] = []
self['singer'] = []
for department, crew in credits:
department = department.replace('(in alphabetical order)', '').strip()
for c in crew:
name = c[0]
roles = c[1]
self['credits'].append({
'name': name,
'roles': roles,
'deparment': department
})
if department == 'Music Department':
if 'lyricist' in roles:
self['lyricist'].append(name)
if 'playback singer' in roles:
self['singer'].append(name)
if not self['credits']:
del self['credits']
class ImdbCombined(Imdb):
def __init__(self, id, timeout=-1):
_regex = {}
for key in self.regex:
if self.regex[key]['page'] in ('combined', 'releaseinfo'):
if self.regex[key]['page'] in ('releaseinfo', 'reference'):
_regex[key] = self.regex[key]
self.regex = _regex
super(ImdbCombined, self).__init__(id, timeout)
@ -640,25 +558,25 @@ def get_movie_by_title(title, timeout=-1):
If there is more than one film with that title for the year
Title (Year/I)
>>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')
u'1602860'
>>> str(get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}'))
'1602860'
>>> get_movie_by_title(u'The Matrix (1999)')
u'0133093'
>>> str(get_movie_by_title(u'The Matrix (1999)'))
'0133093'
>>> get_movie_by_title(u'Little Egypt (1951)')
u'0043748'
>>> str(get_movie_by_title(u'Little Egypt (1951)'))
'0043748'
>>> str(get_movie_by_title(u'Little Egypt (1897/I)'))
'0214882'
>>> get_movie_by_title(u'Little Egypt (1897/I)')
u'0214882'
>>> get_movie_by_title(u'Little Egypt')
None
>>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
u'0866567'
>>> str(get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}'))
'0866567'
'''
params = {'s':'tt','q': title}
params = {'s': 'tt', 'q': title}
if not isinstance(title, bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
@ -676,20 +594,21 @@ def get_movie_by_title(title, timeout=-1):
def get_movie_id(title, director='', year='', timeout=-1):
'''
>>> get_movie_id('The Matrix')
u'0133093'
>>> str(get_movie_id('The Matrix'))
'0133093'
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
u'0060304'
>>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard'))
'0060304'
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
u'0060304'
>>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967'))
'0060304'
>>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
u'0179214'
>>> str(get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", u'Jean-Luc Godard'))
'0179214'
>>> str(get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", u'Jean-Luc Godard'))
'0179214'
>>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
u'0179214'
'''
imdbId = {
(u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
@ -729,7 +648,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
}.get((title, director), None)
if imdbId:
return imdbId
params = {'s':'tt','q': title}
params = {'s': 'tt', 'q': title}
if director:
params['q'] = u'"%s" %s' % (title, director)
if year:
@ -756,8 +675,8 @@ def get_movie_id(title, director='', year='', timeout=-1):
if results:
return results[0]
#print (title, director), ": '',"
#print google_query
#print((title, director), ": '',")
#print(google_query)
#results = google.find(google_query, timeout=timeout)
results = duckduckgo.find(google_query, timeout=timeout)
if results:
@ -772,15 +691,12 @@ def get_movie_poster(imdbId):
'''
>>> get_movie_poster('0133093')
'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
>>> get_movie_poster('0994352')
'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
'''
info = ImdbCombined(imdbId)
if 'posterId' in info:
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url).decode('utf-8', 'ignore')
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
poster = info['posterId']
if '@._V' in poster:
poster = poster.split('@._V')[0] + '@.jpg'
return poster
elif 'series' in info:
return get_movie_poster(info['series'])
@ -793,7 +709,7 @@ def get_episodes(imdbId, season=None):
url += '?season=%d' % season
data = cache.read_url(url)
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
else:
data = cache.read_url(url)
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
@ -804,9 +720,11 @@ def get_episodes(imdbId, season=None):
def max_votes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = cache.read_url(url)
votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
data = cache.read_url(url).decode('utf-8', 'ignore')
votes = max([
int(v.replace(',', ''))
for v in re.compile('<span name="nv" data-value="(\d+)"').findall(data)
])
return votes
def guess(title, director='', timeout=-1):

View file

@ -3,26 +3,34 @@
from __future__ import print_function
import re
from ox.cache import read_url
import ox.cache
from ox.html import strip_tags
from ox.text import find_re
def read_url(url, timeout=ox.cache.cache_timeout):
data = ox.cache.read_url(url, timeout=timeout)
try:
data = data.decode('utf-8')
except UnicodeDecodeError:
data = data.decode('latin-1')
return data
def get_data(id):
'''
>>> get_data('1991/silence_of_the_lambs')['imdbId']
u'0102926'
>>> str(get_data('1991/silence_of_the_lambs')['imdbId'])
'0102926'
>>> get_data('1991/silence_of_the_lambs')['posters'][0]
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
>>> str(get_data('1991/silence_of_the_lambs')['posters'][0])
'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
>>> get_data('1991/silence_of_the_lambs')['url']
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
>>> str(get_data('1991/silence_of_the_lambs')['url'])
'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
'''
data = {
'url': get_url(id)
}
html = read_url(data['url'], unicode=True)
html = read_url(data['url'])
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
if not data['imdbId']:
data['imdbId'] = _id_map.get(id, '')
@ -37,16 +45,15 @@ def get_data(id):
for result in results:
result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
html = read_url(url)
result = find_re(html, '<a href = (\w*?_xlg.html)')
if result:
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
html = read_url(url)
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
data['posters'].append(poster)
return data
def get_id(url):
@ -60,27 +67,29 @@ def get_id(url):
id = '%s/%s' % (year, '_'.join(split))
return id
def get_ids(page=None):
ids = []
if page:
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1)
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
for result in results:
url = 'http://impawards.com/%s' % result
ids.append(get_id(url))
return set(ids)
#get all
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
# get all
html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60)
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
for page in range(pages, 0, -1):
for id in get_ids(page):
if not id in ids:
if id not in ids:
ids.append(id)
return ids
def get_url(id):
url = u"http://www.impawards.com/%s.html" % id
html = read_url(url, unicode=True)
html = read_url(url)
if find_re(html, "No Movie Posters on This Page"):
url = u"http://www.impawards.com/%s_ver1.html" % id
return url

View file

@ -28,22 +28,32 @@ def get_show_url(title):
def get_data(url):
data = read_url(url, unicode=True)
doc = document_fromstring(data)
score = filter(lambda s: s.attrib.get('property') == 'v:average',
doc.xpath('//span[@class="score_value"]'))
score = [s for s in doc.xpath('//span[@class="score_value"]')
if s.attrib.get('property') == 'v:average']
if score:
score = int(score[0].text)
else:
score = -1
authors = [a.text
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')]
sources = [d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')]
reviews = [d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')]
scores = [int(d.text.strip())
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')]
urls = [a.attrib['href']
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')]
authors = [
a.text
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')
]
sources = [
d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')
]
reviews = [
d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')
]
scores = [
int(d.text.strip())
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')
]
urls = [
a.attrib['href']
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')
]
metacritics = []
for i in range(len(authors)):
@ -54,7 +64,7 @@ def get_data(url):
'quote': strip_tags(reviews[i]).strip(),
'score': scores[i],
})
return {
'critics': metacritics,
'id': get_id(url),

View file

@ -1,121 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from six.moves.urllib.parse import quote
from ox.cache import read_url
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, int_value, normalize_newlines
from ox.normalize import normalize_imdbid
import ox
from torrent import Torrent
def _parse_results_page(data, max_results=10):
results=[]
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentDate = row[0]
torrentExtra = row[1]
torrentId = row[2]
torrentTitle = decode_html(row[3]).strip()
torrentLink = "http://www.mininova.org/tor/" + torrentId
privateTracker = 'priv.gif' in torrentExtra
if not privateTracker:
results.append((torrentTitle, torrentLink, ''))
return results
def find_movie(query=None, imdb=None, max_results=10):
'''search for torrents on mininova
'''
if imdb:
url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
else:
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
data = read_url(url, unicode=True)
return _parse_results_page(data, max_results)
def get_id(mininovaId):
mininovaId = unicode(mininovaId)
d = find_re(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
if len(mininovaId) == 1:
return mininovaId[0]
else:
return mininovaId[-1]
def exists(mininovaId):
mininovaId = get_id(mininovaId)
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
if not data or 'Torrent not found...' in data:
return False
if 'tracker</a> of this torrent requires registration.' in data:
return False
return True
def get_data(mininovaId):
_key_map = {
'by': u'uploader',
}
mininovaId = get_id(mininovaId)
torrent = dict()
torrent[u'id'] = mininovaId
torrent[u'domain'] = 'mininova.org'
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
if '<h1>Torrent not found...</h1>' in data:
return None
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = get_torrent_info(t)
return torrent
class Mininova(Torrent):
'''
>>> Mininova('123')
{}
>>> Mininova('1072195')['infohash']
'72dfa59d2338e4a48c78cec9de25964cddb64104'
'''
def __init__(self, mininovaId):
self.data = get_data(mininovaId)
if not self.data:
return
Torrent.__init__(self)
ratio = self.data['share ratio'].split(',')
self['seeder'] = -1
self['leecher'] = -1
if len(ratio) == 2:
val = int_value(ratio[0].replace(',','').strip())
if val:
self['seeder'] = int(val)
val = int_value(ratio[1].replace(',','').strip())
if val:
self['leecher'] = int(val)
val = int_value(self.data['downloads'].replace(',','').strip())
if val:
self['downloaded'] = int(val)
else:
self['downloaded'] = -1
published = self.data['added on']
published = published.split(' +')[0]
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")

View file

@ -2,12 +2,12 @@
# vi:si:et:sw=4:sts=4:ts=4
import re
import feedparser
from ox.cache import read_url
from ox import find_re, strip_tags
from ox.iso import langCode2To3, langTo3Code
def find_subtitles(imdb, parts = 1, language = "eng"):
import feedparser
if len(language) == 2:
language = langCode2To3(language)
elif len(language) != 3:

View file

@ -32,7 +32,7 @@ def get_data(url):
r['summary'] = get_og(data, 'description')
meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
meter = filter(lambda m: m[1].isdigit(), meter)
meter = [m for m in meter if m[1].isdigit()]
if meter:
r['tomatometer'] = meter[0][1]
r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')

View file

@ -33,7 +33,7 @@ class SiteParser(dict):
return "%s%s" % (self.baseUrl, page)
def read_url(self, url, timeout):
if not url in self._cache:
if url not in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url]

View file

@ -95,7 +95,7 @@ def format_subsection(string):
'ussports': 'US-Sports',
'wunderbar': 'wunderBAR'
}
if subsection.has_key(string):
if string in subsection:
return subsection[string].replace(u'\xc3', 'ae')
return string[:1].upper() + string[1:]
@ -219,8 +219,8 @@ def archive_news():
else:
dMax = days[m]
for d in range(dMax, 0, -1):
print('getNews(%d, %d, %d)' % (y, m, d))
news = getNews(y, m ,d)
print('get_news(%d, %d, %d)' % (y, m, d))
news = get_news(y, m, d)
for new in news:
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
if not os.path.exists(dirname):
@ -230,7 +230,7 @@ def archive_news():
else:
filename = dirname + '/' + new['url'] + '.json'
if not os.path.exists(filename) or True:
data = json.dumps(new, ensure_ascii = False)
data = json.dumps(new, ensure_ascii=False)
f = open(filename, 'w')
f.write(data)
f.close()
@ -253,7 +253,7 @@ def archive_news():
string = strings[3]
if len(strings) == 6:
string += '/' + strings[4]
if not count.has_key(string):
if string not in count:
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
else:
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
@ -269,12 +269,12 @@ if __name__ == '__main__':
# spiegel = Spiegel(2008, 8)
# print(spiegel.getContents())
# news = News(2001, 9, 10)
# output(news.getNews())
# output(news.get_news())
'''
x = []
for d in range(10, 30):
print('2/%d' % d)
news = getNews(2008, 2, d)
news = get_news(2008, 2, d)
for new in news:
strings = new['url'].split('/')
string = format_section(strings[3])

View file

@ -21,10 +21,10 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
Return max_results tuples with title, url, description
>>> find("The Matrix site:imdb.com", 1)[0][0]
u'The Matrix (1999) - IMDb'
'The Matrix (1999) - IMDb'
>>> find("The Matrix site:imdb.com", 1)[0][1]
u'http://www.imdb.com/title/tt0133093/'
'http://www.imdb.com/title/tt0133093/'
"""
results = []
url = 'https://eu1.startpage.com/do/search?nosteeraway=1&abp=1&language=english&cmd=process_search&query=%s&x=0&y=0&cat=web&engine0=v1all' % quote_plus(query)

View file

@ -9,11 +9,10 @@ from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normal
from ox.normalize import normalize_imdbid
import ox
from torrent import Torrent
cache_timeout = 24*60*60 # cache search only for 24 hours
season_episode = re.compile("S..E..", re.IGNORECASE)
baseurl = "https://thepiratebay.org/"
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
@ -25,7 +24,7 @@ def find_movies(query=None, imdb=None, max_results=10):
if imdb:
query = "tt" + normalize_imdbid(imdb)
results = []
next = ["https://thepiratebay.se/search/%s/0/3/200" % quote(query), ]
next = [baseurl + "hsearch/%s/0/3/200" % quote(query), ]
page_count = 1
while next and page_count < 4:
page_count += 1
@ -33,12 +32,12 @@ def find_movies(query=None, imdb=None, max_results=10):
if not url.startswith('http'):
if not url.startswith('/'):
url = "/" + url
url = "https://thepiratebay.se" + url
url = baseurl + url
data = read_url(url, timeout=cache_timeout, unicode=True)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
torrentLink = "https://thepiratebay.se" + row[1]
torrentLink = baseurl + row[1]
torrentTitle = decode_html(row[2])
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
if torrentType in ['201']:
@ -61,7 +60,7 @@ def get_id(piratebayId):
def exists(piratebayId):
piratebayId = get_id(piratebayId)
return ox.net.exists("https://thepiratebay.se/torrent/%s" % piratebayId)
return ox.net.exists(baseurl + "torrent/%s" % piratebayId)
def get_data(piratebayId):
_key_map = {
@ -75,7 +74,7 @@ def get_data(piratebayId):
torrent = dict()
torrent[u'id'] = piratebayId
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'https://thepiratebay.se/torrent/%s' % piratebayId
torrent[u'comment_link'] = baseurl + 'torrent/%s' % piratebayId
data = read_url(torrent['comment_link'], unicode=True)
torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
@ -84,33 +83,15 @@ def get_data(piratebayId):
torrent[u'title'] = decode_html(torrent[u'title']).strip()
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
title = quote(torrent['title'].encode('utf-8'))
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
torrent[u'magent_link']= find_re(data, '"(magnet:.*?)"')
torrent[u'infohash'] = find_re(torrent[u'magent_link'], "btih:(.*?)&")
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
if not '<' in key:
torrent[key] = value
torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = get_torrent_info(t)
return torrent
class Thepiratebay(Torrent):
'''
>>> Thepiratebay('123')
{}
>>> Thepiratebay('3951349')['infohash']
'4e84415d36ed7b54066160c05a0b0f061898d12b'
'''
def __init__(self, piratebayId):
self.data = get_data(piratebayId)
if not self.data:
return
Torrent.__init__(self)
published = self.data['uploaded']
published = published.replace(' GMT', '').split(' +')[0]
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")

View file

@ -1,37 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox import int_value
class Torrent(dict):
'''
>>> Torrent()
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
'''
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
_dict_keys = ('torrent_info', )
_list_keys = ()
data = {'torrent_info': {}}
def __init__(self):
for key in self._string_keys:
self[key] = self.data.get(key, u'')
for key in self._dict_keys:
self[key] = self.data.get(key, {})
for key in self._list_keys:
self[key] = self.data.get(key, [])
for key in self._int_keys:
value = self.data.get(key, -1)
if not isinstance(value, int):
value = int(int_value(value))
self[key] = value
self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1)
self['announce'] = self.data['torrent_info'].get('announce', '')
if 'files' in self.data['torrent_info']:
self['files'] = len(self.data['torrent_info']['files'])
else:
self['files'] = 1

View file

@ -116,7 +116,7 @@ def get_movie_data(wikipedia_url):
def get_image_url(name):
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
data = read_url(url)
data = read_url(url).decode('utf-8')
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
if not url:
url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
@ -145,7 +145,7 @@ def find(query, max_results=10):
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
data = read_url(url)
if not data:
data = read_url(url, timeout=0)
data = read_url(url, timeout=0)
result = json.loads(data.decode('utf-8'))
results = []
if result and 'query' in result:

View file

@ -7,7 +7,6 @@ import re
from xml.dom.minidom import parseString
import json
import feedparser
import ox
from ox.cache import read_url, cache_timeout
@ -27,15 +26,15 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout):
"""
fmt = None
if format == '4k':
fmt=38
fmt = 38
elif format == '1080p':
fmt=37
fmt = 37
elif format == '720p':
fmt=22
fmt = 22
elif format == 'mp4':
fmt=18
fmt = 18
elif format == 'high':
fmt=35
fmt = 35
elif format == 'webm':
streams = videos(youtubeId, 'webm')
return streams[max(streams.keys())]['url']
@ -46,14 +45,14 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout):
def get_video_info(id):
eurl = get_url(id)
data = read_url(eurl)
data = read_url(eurl).decode('utf-8')
t = re.compile('\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
if t:
t = t[0]
else:
raise IOError
url = "http://www.youtube.com/get_video_info?&video_id=%s&el=$el&ps=default&eurl=%s&hl=en_US&t=%s" % (id, quote(eurl), quote(t))
data = read_url(url)
data = read_url(url).decode('utf-8')
info = {}
for part in data.split('&'):
key, value = part.split('=')
@ -61,6 +60,7 @@ def get_video_info(id):
return info
def find(query, max_results=10, offset=1, orderBy='relevance'):
import feedparser
query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
data = read_url(url)
@ -104,14 +104,20 @@ def info(id, timeout=cache_timeout):
info['license'] = match[0].strip()
info['license'] = re.sub('<.+?>', '', info['license']).strip()
subs = subtitles(id, timeout)
if subs:
info['subtitles'] = subs
return info
def subtitles(id, timeout=cache_timeout):
url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
data = read_url(url, timeout=timeout)
xml = parseString(data)
languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
subtitles = {}
if languages:
info['subtitles'] = {}
for language in languages:
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind" % (id, language)
data = read_url(url, timeout=timeout)
xml = parseString(data)
subs = []
@ -128,8 +134,8 @@ def info(id, timeout=cache_timeout):
'out': end,
'value': ox.decode_html(text),
})
info['subtitles'][language] = subs
return info
subtitles[language] = subs
return subtitles
def videos(id, format=''):
stream_type = {
@ -154,7 +160,7 @@ def videos(id, format=''):
return streams
def playlist(url):
data = read_url(url)
data = read_url(url).decode('utf-8')
items = []
for i in list(set(re.compile('<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
items.append({