run update
This commit is contained in:
parent
11af4540c5
commit
6806bebb7c
607 changed files with 52543 additions and 31832 deletions
|
|
@ -1 +1 @@
|
|||
VERSION="2.3.b'786'"
|
||||
VERSION="2.3.895"
|
||||
|
|
@ -1,13 +1,18 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2011
|
||||
from __future__ import with_statement
|
||||
from __future__ import print_function
|
||||
from types import MethodType
|
||||
import gzip
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import time
|
||||
|
||||
from six.moves import http_cookiejar as cookielib
|
||||
import gzip
|
||||
from six import BytesIO, PY2
|
||||
from six.moves import urllib
|
||||
from types import MethodType
|
||||
from six.moves.urllib.parse import urlparse
|
||||
|
||||
from . import __version__
|
||||
from .utils import json
|
||||
|
|
@ -15,6 +20,8 @@ from .form import MultiPartForm
|
|||
|
||||
__all__ = ['getAPI', 'API']
|
||||
|
||||
CHUNK_SIZE = 1024*1024*5
|
||||
|
||||
def getAPI(url, cj=None):
|
||||
return API(url, cj)
|
||||
|
||||
|
|
@ -101,7 +108,7 @@ class API(object):
|
|||
result = result.decode('utf-8')
|
||||
result = json.loads(result)
|
||||
except:
|
||||
result = {'status':{}}
|
||||
result = {'status': {}}
|
||||
result['status']['code'] = e.code
|
||||
result['status']['text'] = str(e)
|
||||
return result
|
||||
|
|
@ -123,3 +130,112 @@ class API(object):
|
|||
form.add_field('data', json.dumps(data))
|
||||
return self._json_request(self.url, form)
|
||||
|
||||
def save_url(self, url, filename, overwrite=False):
    """Download *url* to *filename* using this object's opener.

    Nothing happens when *filename* already exists and *overwrite* is
    false.  Missing parent directories are created.  The payload is first
    streamed into ``filename + '.tmp'`` and only moved into place once the
    download finished, so an aborted transfer never leaves a truncated
    *filename* behind.

    Any exception raised while opening, reading or writing is re-raised
    after the temporary file has been removed.
    """
    chunk_size = 16 * 1024
    if os.path.exists(filename) and not overwrite:
        return
    dirname = os.path.dirname(filename)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    request = urllib.request.Request(url, method='GET')
    tmpname = filename + '.tmp'
    try:
        with open(tmpname, 'wb') as fd:
            response = self._opener.open(request)
            try:
                # read() returns b'' at end of stream, which stops iter()
                for chunk in iter(lambda: response.read(chunk_size), b''):
                    fd.write(chunk)
            finally:
                # release the connection even if reading/writing failed
                response.close()
        shutil.move(tmpname, filename)
    except BaseException:
        # do not leave a partial download lying around
        if os.path.exists(tmpname):
            os.remove(tmpname)
        raise
|
||||
|
||||
def upload_chunks(self, url, filename, data=None):
    """Upload *filename* in pieces of ``CHUNK_SIZE`` bytes.

    A first request carrying the fields of *data* is posted to *url*.
    If the response contains ``uploadUrl`` the file is sent there chunk by
    chunk; an ``offset`` in that first response smaller than the file size
    is used as the position to resume from.  A chunk is re-sent until the
    server acknowledges it with ``result == 1``; failed requests and
    non-200 status codes are retried after a 5 second pause.

    Returns the ``id`` of the last response (``True`` if it has none) on
    success, ``False`` if the first response has no ``uploadUrl``, if the
    server answers with status 403, or if the last response was not
    successful.  A ``KeyboardInterrupt`` during a request terminates the
    process with exit status 1.
    """
    form = MultiPartForm()
    if data:
        for key in data:
            form.add_field(key, data[key])
    data = self._json_request(url, form)

    def full_url(path):
        # server may hand back a host-relative path; resolve against url
        if path.startswith('/'):
            u = urlparse(url)
            path = '%s://%s%s' % (u.scheme, u.netloc, path)
        return path

    if 'uploadUrl' not in data:
        return False

    uploadUrl = full_url(data['uploadUrl'])
    fsize = os.stat(filename).st_size
    fname = os.path.basename(filename)
    if not isinstance(fname, bytes):
        fname = fname.encode('utf-8')

    # the context manager closes the file on every exit path
    with open(filename, 'rb') as f:
        done = 0
        if 'offset' in data and data['offset'] < fsize:
            # resume a partial upload where the server left off
            done = data['offset']
            f.seek(done)
        chunk = f.read(CHUNK_SIZE)
        while chunk:
            form = MultiPartForm()
            form.add_file('chunk', fname, chunk)
            if len(chunk) < CHUNK_SIZE or f.tell() == fsize:
                # last chunk: tell the server the upload is complete
                form.add_field('done', '1')
            form.add_field('offset', str(done))
            try:
                data = self._json_request(uploadUrl, form)
            except KeyboardInterrupt:
                print("\ninterrupted by user.")
                sys.exit(1)
            except Exception:
                print("uploading chunk failed, will try again in 5 seconds\r", end='')
                sys.stdout.flush()
                # result != 1 keeps the current chunk for the next round
                data = {'result': -1}
                time.sleep(5)
            if data and 'status' in data:
                if data['status']['code'] == 403:
                    print("login required")
                    return False
                if data['status']['code'] != 200:
                    print("request returned error, will try again in 5 seconds")
                    # NOTE(review): DEBUG is not defined in the visible part
                    # of this file - confirm it resolves at module level.
                    if DEBUG:
                        print(data)
                    time.sleep(5)
            if data and data.get('result') == 1:
                done += len(chunk)
                if data.get('offset') not in (None, done):
                    print('server offset out of sync, continue from', data['offset'])
                    done = data['offset']
                    f.seek(done)
                chunk = f.read(CHUNK_SIZE)
    if data and 'result' in data and data.get('result') == 1:
        return data.get('id', True)
    return False
|
||||
|
||||
def signin(url):
    """Return an ``API`` object for *url* after signing in.

    *url* is either a full API url (starting with ``http``) or a bare site
    name, in which case ``https://<site>/api/`` is used.  Credentials are
    looked up via ``auth.get(site)``; if that lookup raises, the user is
    prompted for them and they are stored with ``auth.update`` once the
    sign-in succeeded.  If the server reports errors they are printed and
    the process exits with status 1.
    """
    import sys
    from getpass import getpass
    from .web import auth

    if url.startswith('http'):
        # "scheme://host/..." -> the host is the third '/'-separated part
        site = url.split('/')[2]
    else:
        site, url = url, 'https://%s/api/' % url
    api = API(url)

    update = False
    try:
        credentials = auth.get(site)
    except:
        print('Please provide your username and password for %s:' % site)
        credentials = {
            'username': input('Username: '),
            'password': getpass('Password: '),
        }
        update = True

    r = api.signin(**credentials)
    response_data = r.get('data', {})
    if 'errors' in response_data:
        for key, message in response_data['errors'].items():
            print('%s: %s' % (key, message))
        sys.exit(1)
    if update:
        auth.update(site, credentials)
    return api
|
||||
|
||||
|
|
|
|||
|
|
@ -1,17 +1,23 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2011
|
||||
from __future__ import with_statement, print_function
|
||||
from __future__ import print_function
|
||||
|
||||
import gzip
|
||||
import zlib
|
||||
import hashlib
|
||||
import os
|
||||
from six import BytesIO
|
||||
import sqlite3
|
||||
import time
|
||||
import zlib
|
||||
|
||||
from six import BytesIO
|
||||
from six.moves import urllib
|
||||
from six import PY2
|
||||
import sqlite3
|
||||
try:
|
||||
import requests
|
||||
USE_REQUESTS = True
|
||||
except:
|
||||
USE_REQUESTS = False
|
||||
|
||||
from .utils import json
|
||||
from .file import makedirs
|
||||
|
|
@ -19,12 +25,14 @@ from .file import makedirs
|
|||
from . import net
|
||||
from .net import DEFAULT_HEADERS, detect_encoding
|
||||
|
||||
cache_timeout = 30*24*60*60 # default is 30 days
|
||||
|
||||
cache_timeout = 30*24*60*60 # default is 30 days
|
||||
|
||||
COMPRESS_TYPES = (
|
||||
'text/html',
|
||||
'text/plain',
|
||||
'text/xml',
|
||||
'text/x-wiki',
|
||||
'application/json',
|
||||
'application/xhtml+xml',
|
||||
'application/x-javascript',
|
||||
|
|
@ -33,7 +41,7 @@ COMPRESS_TYPES = (
|
|||
'application/rss+xml'
|
||||
)
|
||||
|
||||
def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
|
||||
def status(url, data=None, headers=None, timeout=cache_timeout):
|
||||
'''
|
||||
>>> status('http://google.com')
|
||||
200
|
||||
|
|
@ -43,7 +51,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
|
|||
headers = get_headers(url, data, headers)
|
||||
return int(headers['status'])
|
||||
|
||||
def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
|
||||
def exists(url, data=None, headers=None, timeout=cache_timeout):
|
||||
'''
|
||||
>>> exists('http://google.com')
|
||||
True
|
||||
|
|
@ -55,14 +63,14 @@ def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
|
|||
return True
|
||||
return False
|
||||
|
||||
def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
|
||||
def get_headers(url, data=None, headers=None, timeout=cache_timeout):
|
||||
url_headers = store.get(url, data, headers, timeout, "headers")
|
||||
if not url_headers:
|
||||
url_headers = net.get_headers(url, data, headers)
|
||||
store.set(url, data, -1, url_headers)
|
||||
return url_headers
|
||||
|
||||
def get_json(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
|
||||
def get_json(url, data=None, headers=None, timeout=cache_timeout):
|
||||
return json.loads(read_url(url, data, headers, timeout).decode('utf-8'))
|
||||
|
||||
class InvalidResult(Exception):
|
||||
|
|
@ -76,7 +84,7 @@ def _fix_unicode_url(url):
|
|||
url = url.encode('utf-8')
|
||||
return url
|
||||
|
||||
def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
|
||||
def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, unicode=False):
|
||||
'''
|
||||
url - url to load
|
||||
data - possible post data
|
||||
|
|
@ -87,24 +95,35 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
|
|||
'''
|
||||
if net.DEBUG:
|
||||
print('ox.cache.read_url', url)
|
||||
#FIXME: send last-modified / etag from cache and only update if needed
|
||||
#url = _fix_unicode_url(url)
|
||||
# FIXME: send last-modified / etag from cache and only update if needed
|
||||
# url = _fix_unicode_url(url)
|
||||
result = store.get(url, data, headers, timeout)
|
||||
url_headers = {}
|
||||
if not result:
|
||||
try:
|
||||
url_headers, result = net.read_url(url, data, headers, return_headers=True)
|
||||
except urllib.error.HTTPError as e:
|
||||
e.headers['Status'] = "%s" % e.code
|
||||
for key in e.headers:
|
||||
url_headers[key.lower()] = e.headers[key]
|
||||
result = e.read()
|
||||
if url_headers.get('content-encoding', None) == 'gzip':
|
||||
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
|
||||
if not valid or valid(result, url_headers):
|
||||
store.set(url, post_data=data, data=result, headers=url_headers)
|
||||
if USE_REQUESTS:
|
||||
r = requests.get(url, headers=headers)
|
||||
for key in r.headers:
|
||||
url_headers[key.lower()] = r.headers[key]
|
||||
result = r.content
|
||||
url_headers['Status'] = "%s" % r.status_code
|
||||
if not valid or valid(result, url_headers):
|
||||
store.set(url, post_data=data, data=result, headers=url_headers)
|
||||
else:
|
||||
raise InvalidResult(result, url_headers)
|
||||
else:
|
||||
raise InvalidResult(result, url_headers)
|
||||
try:
|
||||
url_headers, result = net.read_url(url, data, headers, return_headers=True)
|
||||
except urllib.error.HTTPError as e:
|
||||
e.headers['Status'] = "%s" % e.code
|
||||
for key in e.headers:
|
||||
url_headers[key.lower()] = e.headers[key]
|
||||
result = e.read()
|
||||
if url_headers.get('content-encoding', None) == 'gzip':
|
||||
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
|
||||
if not valid or valid(result, url_headers):
|
||||
store.set(url, post_data=data, data=result, headers=url_headers)
|
||||
else:
|
||||
raise InvalidResult(result, url_headers)
|
||||
if unicode:
|
||||
ctype = url_headers.get('content-type', '').lower()
|
||||
if 'charset' in ctype:
|
||||
|
|
@ -116,13 +135,13 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
|
|||
result = result.decode(encoding)
|
||||
return result
|
||||
|
||||
get_url=read_url
|
||||
get_url = read_url
|
||||
|
||||
def save_url(url, filename, overwrite=False):
|
||||
if not os.path.exists(filename) or overwrite:
|
||||
dirname = os.path.dirname(filename)
|
||||
if dirname and not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
makedirs(dirname)
|
||||
data = read_url(url)
|
||||
with open(filename, 'wb') as f:
|
||||
f.write(data)
|
||||
|
|
@ -134,7 +153,7 @@ class Cache:
|
|||
def __init__(self):
|
||||
pass
|
||||
|
||||
def get(self, url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
|
||||
def get(self, url, data, headers=None, timeout=-1, value="data"):
|
||||
'''
|
||||
if value == 'data' return data of url if its in the cache else None
|
||||
if value == 'headers' return headers for url
|
||||
|
|
@ -159,7 +178,7 @@ class SQLiteCache(Cache):
|
|||
def __init__(self):
|
||||
path = cache_path()
|
||||
if not os.path.exists(path):
|
||||
os.makedirs(path)
|
||||
makedirs(path)
|
||||
self.db = os.path.join(path, "cache.sqlite")
|
||||
self.create()
|
||||
|
||||
|
|
@ -192,7 +211,7 @@ class SQLiteCache(Cache):
|
|||
def set_setting(self, c, key, value):
|
||||
c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))
|
||||
|
||||
def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
|
||||
def get(self, url, data={}, headers=None, timeout=-1, value="data"):
|
||||
r = None
|
||||
if timeout == 0:
|
||||
return r
|
||||
|
|
@ -225,7 +244,7 @@ class SQLiteCache(Cache):
|
|||
conn.close()
|
||||
return r
|
||||
|
||||
def delete(self, url, data=None, headers=DEFAULT_HEADERS):
|
||||
def delete(self, url, data=None, headers=None):
|
||||
url_hash = self.get_url_hash(url, data)
|
||||
conn = self.connect()
|
||||
c = conn.cursor()
|
||||
|
|
@ -244,7 +263,8 @@ class SQLiteCache(Cache):
|
|||
c = conn.cursor()
|
||||
|
||||
# Insert a row of data
|
||||
if not post_data: post_data=""
|
||||
if not post_data:
|
||||
post_data = ""
|
||||
only_headers = 0
|
||||
if data == -1:
|
||||
only_headers = 1
|
||||
|
|
@ -280,11 +300,11 @@ class FileCache(Cache):
|
|||
|
||||
def files(self, domain, h):
|
||||
prefix = os.path.join(self.root, domain, h[:2], h[2:4], h[4:6], h[6:8])
|
||||
i = os.path.join(prefix, '%s.json'%h)
|
||||
f = os.path.join(prefix, '%s.dat'%h)
|
||||
i = os.path.join(prefix, '%s.json' % h)
|
||||
f = os.path.join(prefix, '%s.dat' % h)
|
||||
return prefix, i, f
|
||||
|
||||
def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
|
||||
def get(self, url, data={}, headers=None, timeout=-1, value="data"):
|
||||
r = None
|
||||
if timeout == 0:
|
||||
return r
|
||||
|
|
@ -308,13 +328,13 @@ class FileCache(Cache):
|
|||
if value == 'headers':
|
||||
r = info['headers']
|
||||
else:
|
||||
with open(f, 'rb') as data:
|
||||
r = data.read()
|
||||
with open(f, 'rb') as fd:
|
||||
r = fd.read()
|
||||
if info['compressed']:
|
||||
r = zlib.decompress(r)
|
||||
return r
|
||||
|
||||
def delete(self, url, data=None, headers=DEFAULT_HEADERS):
|
||||
def delete(self, url, data=None, headers=None):
|
||||
url_hash = self.get_url_hash(url, data)
|
||||
domain = self.get_domain(url)
|
||||
|
||||
|
|
@ -344,15 +364,104 @@ class FileCache(Cache):
|
|||
if not info['only_headers']:
|
||||
if info['compressed']:
|
||||
data = zlib.compress(data)
|
||||
elif not isinstance(data, str):
|
||||
elif not isinstance(data, bytes):
|
||||
data = data.encode('utf-8')
|
||||
with open(f, 'wb') as _f:
|
||||
_f.write(data)
|
||||
with open(i, 'wb') as _i:
|
||||
with open(i, 'w') as _i:
|
||||
json.dump(info, _i)
|
||||
|
||||
|
||||
class KVCache(Cache):
    """Cache storing entries in a key/value backend.

    Subclasses provide ``self.backend``, an object with ``get(key)``,
    ``set(key, value)`` and ``delete(key)``.  Every url occupies two keys:
    one for a metadata dict (``created``, ``headers``, ``compressed``,
    ``only_headers``, ``url`` and optionally ``post_data``) and one, with a
    ``:data`` suffix, for the payload.
    """

    # set to True if the backend can only store bytes; the metadata dict
    # is then serialized to UTF-8 encoded JSON
    _bytes_only = False

    def _keys(self, url, data, headers=None):
        """Return the ``(info_key, data_key)`` pair for a request."""
        # get_url_hash / get_domain are presumably inherited from Cache
        url_hash = self.get_url_hash(url, data)
        domain = self.get_domain(url)
        key = 'ox:%s:%s' % (domain, url_hash)
        return key, key + ':data'

    # NOTE: the ``data={}`` default is kept for interface compatibility;
    # it is never mutated here.
    def get(self, url, data={}, headers=None, timeout=-1, value="data"):
        """Return the cached payload or headers for *url*, else ``None``.

        ``timeout == 0`` disables the cache, a negative *timeout* accepts
        entries of any age, otherwise entries older than *timeout* seconds
        are ignored.  With ``value == 'headers'`` the stored headers are
        returned; any other *value* returns the payload, which is ``None``
        for entries that were stored with headers only.
        """
        if timeout == 0:
            return None

        r = None
        info_key, data_key = self._keys(url, data, headers)
        info = self.backend.get(info_key)
        if info:
            if self._bytes_only:
                # set() encodes as UTF-8; decode explicitly so this does not
                # depend on the interpreter's default codec
                info = json.loads(info.decode('utf-8'))
            now = time.mktime(time.localtime())
            expired = now - timeout

            if value != 'headers' and info['only_headers']:
                return None
            if timeout < 0 or info['created'] > expired:
                if value == 'headers':
                    r = info['headers']
                else:
                    r = self.backend.get(data_key)
                    if r and info['compressed']:
                        r = zlib.decompress(r)
        return r

    def delete(self, url, data=None, headers=None):
        """Remove both the metadata and the payload key of *url*."""
        for key in self._keys(url, data, headers):
            self.backend.delete(key)

    def set(self, url, post_data, data, headers):
        """Store *data* and *headers* for *url*.

        ``data == -1`` stores the headers only.  Payloads whose content
        type is listed in ``COMPRESS_TYPES`` are zlib compressed.
        """
        info_key, data_key = self._keys(url, post_data, headers)

        created = time.mktime(time.localtime())
        content_type = headers.get('content-type', '').split(';')[0].strip()

        info = {
            'compressed': content_type in COMPRESS_TYPES,
            'only_headers': data == -1,
            'created': created,
            'headers': headers,
            'url': url,
        }
        if post_data:
            info['post_data'] = post_data
        if not info['only_headers']:
            # zlib only accepts bytes, so text has to be encoded before
            # (not instead of) being compressed
            if not isinstance(data, bytes):
                data = data.encode('utf-8')
            if info['compressed']:
                data = zlib.compress(data)
            self.backend.set(data_key, data)
        if self._bytes_only:
            info = json.dumps(info, ensure_ascii=False).encode('utf-8')
        self.backend.set(info_key, info)
|
||||
|
||||
|
||||
class MemCache(KVCache):
    """KVCache backed by a memcached server accessed through pylibmc.

    The server address is whatever follows the first ``:`` in the value
    returned by ``cache_path()``.
    """

    _bytes_only = False

    def __init__(self):
        # imported lazily so pylibmc is only needed when this cache is used
        import pylibmc

        location = cache_path()
        _scheme, self.host = location.split(':', 1)
        self.backend = pylibmc.Client([self.host])
        self.backend.behaviors['connect_timeout'] = 60000
|
||||
|
||||
|
||||
class RedisCache(KVCache):
    """KVCache backed by a redis server.

    The connection url is whatever follows the first ``:`` in the value
    returned by ``cache_path()``.  Metadata is stored as UTF-8 encoded
    JSON because ``_bytes_only`` is set.
    """

    _bytes_only = True

    def __init__(self):
        # imported lazily so redis is only needed when this cache is used
        import redis

        location = cache_path()
        _scheme, self.url = location.split(':', 1)
        self.backend = redis.from_url(self.url)
|
||||
|
||||
|
||||
# Pick the cache implementation from the prefix of cache_path();
# anything without a known prefix falls back to the SQLite cache.
for _prefix, _backend in (
    ('fs:', FileCache),
    ('redis:', RedisCache),
    ('memcache:', MemCache),
):
    if cache_path().startswith(_prefix):
        store = _backend()
        break
else:
    store = SQLiteCache()
del _prefix, _backend
|
||||
|
||||
|
|
|
|||
|
|
@ -1,36 +1,37 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2008
|
||||
from __future__ import division, with_statement, print_function
|
||||
import os
|
||||
from __future__ import division, print_function
|
||||
from distutils.spawn import find_executable
|
||||
from glob import glob
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sqlite3
|
||||
import struct
|
||||
import subprocess
|
||||
import sqlite3
|
||||
from distutils.spawn import find_executable
|
||||
|
||||
from .utils import json
|
||||
|
||||
__all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs']
|
||||
__all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs', 'iexists']
|
||||
|
||||
EXTENSIONS = {
|
||||
'audio': [
|
||||
'aac', 'aif', 'aiff',
|
||||
'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma'
|
||||
'aac', 'aif', 'aiff', 'amr',
|
||||
'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus'
|
||||
],
|
||||
'image': [
|
||||
'bmp', 'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp'
|
||||
],
|
||||
'subtitle': [
|
||||
'idx', 'srt', 'sub'
|
||||
'idx', 'srt', 'sub', 'vtt'
|
||||
],
|
||||
'video': [
|
||||
'3gp',
|
||||
'avi', 'divx', 'dv', 'flv', 'm2t', 'm4v', 'mkv', 'mov', 'mp4',
|
||||
'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv',
|
||||
'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD
|
||||
'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm4v', 'mkv', 'mov', 'mp4',
|
||||
'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf',
|
||||
'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD
|
||||
'mxf', 'ts'
|
||||
],
|
||||
}
|
||||
|
|
@ -131,25 +132,25 @@ def oshash(filename, cached=True):
|
|||
if filesize < 65536:
|
||||
for x in range(int(filesize/bytesize)):
|
||||
buffer = f.read(bytesize)
|
||||
(l_value,)= struct.unpack(longlongformat, buffer)
|
||||
(l_value,) = struct.unpack(longlongformat, buffer)
|
||||
hash += l_value
|
||||
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
|
||||
hash = hash & 0xFFFFFFFFFFFFFFFF # to remain as 64bit number
|
||||
else:
|
||||
for x in range(int(65536/bytesize)):
|
||||
buffer = f.read(bytesize)
|
||||
(l_value,)= struct.unpack(longlongformat, buffer)
|
||||
(l_value,) = struct.unpack(longlongformat, buffer)
|
||||
hash += l_value
|
||||
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
|
||||
f.seek(max(0,filesize-65536),0)
|
||||
hash = hash & 0xFFFFFFFFFFFFFFFF # to remain as 64bit number
|
||||
f.seek(max(0, filesize-65536), 0)
|
||||
for x in range(int(65536/bytesize)):
|
||||
buffer = f.read(bytesize)
|
||||
(l_value,)= struct.unpack(longlongformat, buffer)
|
||||
(l_value,) = struct.unpack(longlongformat, buffer)
|
||||
hash += l_value
|
||||
hash = hash & 0xFFFFFFFFFFFFFFFF
|
||||
f.close()
|
||||
returnedhash = "%016x" % hash
|
||||
returnedhash = "%016x" % hash
|
||||
return returnedhash
|
||||
except(IOError):
|
||||
except IOError:
|
||||
return "IOError"
|
||||
|
||||
def avinfo(filename, cached=True):
|
||||
|
|
@ -160,23 +161,25 @@ def avinfo(filename, cached=True):
|
|||
return ffprobe(filename)
|
||||
ffmpeg2theora = cmd('ffmpeg2theora')
|
||||
p = subprocess.Popen([ffmpeg2theora], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
info, error = p.communicate()
|
||||
version = info.split('\n')[0].split(' - ')[0].split(' ')[-1]
|
||||
stdout, error = p.communicate()
|
||||
stdout = stdout.decode('utf-8')
|
||||
version = stdout.split('\n')[0].split(' - ')[0].split(' ')[-1]
|
||||
if version < '0.27':
|
||||
raise EnvironmentError('version of ffmpeg2theora needs to be 0.27 or later, found %s' % version)
|
||||
p = subprocess.Popen([ffmpeg2theora, '--info', filename],
|
||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
info, error = p.communicate()
|
||||
stdout, error = p.communicate()
|
||||
stdout = stdout.decode('utf-8')
|
||||
try:
|
||||
info = json.loads(info)
|
||||
info = json.loads(stdout)
|
||||
except:
|
||||
#remove metadata, can be broken
|
||||
# remove metadata, can be broken
|
||||
reg = re.compile('"metadata": {.*?},', re.DOTALL)
|
||||
info = re.sub(reg, '', info)
|
||||
info = json.loads(info)
|
||||
stdout = re.sub(reg, '', stdout)
|
||||
info = json.loads(stdout)
|
||||
if 'video' in info:
|
||||
for v in info['video']:
|
||||
if not 'display_aspect_ratio' in v and 'width' in v:
|
||||
if 'display_aspect_ratio' not in v and 'width' in v:
|
||||
v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height'])
|
||||
v['pixel_aspect_ratio'] = '1:1'
|
||||
if len(info.get('audio', [])) > 1:
|
||||
|
|
@ -189,12 +192,14 @@ def avinfo(filename, cached=True):
|
|||
ffmpeg = cmd('ffmpeg')
|
||||
p = subprocess.Popen([ffmpeg, '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
stdout, stderr = p.communicate()
|
||||
stderr = stderr.decode('utf-8')
|
||||
languages = [re.compile('\((.+?)\):').findall(l) for l in stderr.split('\n') if 'Stream' in l and 'Audio' in l]
|
||||
if len(languages) == len(info['audio']):
|
||||
for i, stream in enumerate(info['audio']):
|
||||
language = languages[i]
|
||||
if language and language[0] != 'und':
|
||||
stream['language'] = language[0]
|
||||
fix_coverart(info)
|
||||
return info
|
||||
|
||||
return {'path': filename, 'size': 0}
|
||||
|
|
@ -203,6 +208,7 @@ def ffprobe(filename):
|
|||
p = subprocess.Popen([
|
||||
cmd('ffprobe'),
|
||||
'-show_format',
|
||||
'-show_chapters',
|
||||
'-show_streams',
|
||||
'-print_format',
|
||||
'json',
|
||||
|
|
@ -210,6 +216,7 @@ def ffprobe(filename):
|
|||
|
||||
], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
info, error = p.communicate()
|
||||
info = info.decode('utf-8')
|
||||
ffinfo = json.loads(info)
|
||||
|
||||
def fix_value(key, value):
|
||||
|
|
@ -224,7 +231,7 @@ def ffprobe(filename):
|
|||
return value
|
||||
|
||||
info = {}
|
||||
if not 'format' in ffinfo:
|
||||
if 'format' not in ffinfo:
|
||||
info['error'] = 'badfile'
|
||||
else:
|
||||
for key in ('duration', 'size', 'bit_rate'):
|
||||
|
|
@ -235,8 +242,18 @@ def ffprobe(filename):
|
|||
info['audio'] = []
|
||||
info['video'] = []
|
||||
info['metadata'] = ffinfo['format'].get('tags', {})
|
||||
chapters = ffinfo.get('chapters', [])
|
||||
if chapters:
|
||||
info['chapters'] = [
|
||||
{
|
||||
'in': float(chapter['start_time']),
|
||||
'out': float(chapter['end_time']),
|
||||
'value': chapter.get('tags', {}).get('title')
|
||||
}
|
||||
for chapter in chapters if chapter.get('tags', {}).get('title')
|
||||
]
|
||||
for s in ffinfo['streams']:
|
||||
tags = s.pop('tags', {})
|
||||
tags = s.pop('tags', {})
|
||||
language = None
|
||||
for t in tags:
|
||||
if t == 'language':
|
||||
|
|
@ -278,17 +295,29 @@ def ffprobe(filename):
|
|||
info[s['codec_type']].append(stream)
|
||||
else:
|
||||
pass
|
||||
#print s
|
||||
# print s
|
||||
for v in info['video']:
|
||||
if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-180, -90, 90, 180):
|
||||
v['width'], v['height'] = v['height'], v['width']
|
||||
k = 'display_aspect_ratio'
|
||||
if not k in v and 'width' in v \
|
||||
if k not in v and 'width' in v \
|
||||
or (k in v and v[k] == '0:1'):
|
||||
v[k] = '%d:%d' % (v['width'], v['height'])
|
||||
v['pixel_aspect_ratio'] = '1:1'
|
||||
info['oshash'] = oshash(filename)
|
||||
info['path'] = filename
|
||||
if not 'size' in info:
|
||||
if 'size' not in info:
|
||||
info['size'] = os.path.getsize(filename)
|
||||
|
||||
fix_coverart(info)
|
||||
return info
|
||||
|
||||
def fix_coverart(info):
    """Move embedded cover art of audio files from ``video`` to ``cover``.

    If *info* has video streams, its ``path`` carries an audio extension
    (see ``EXTENSIONS['audio']``) and the first video stream's codec is an
    image format or ``mjpeg``, the streams are stored under
    ``info['cover']`` and ``info['video']`` becomes an empty list.

    The extension is compared case-insensitively, and a first stream
    without a ``codec`` entry is left untouched.  *info* is modified in
    place and returned.
    """
    video = info.get('video')
    if not video:
        return info
    # EXTENSIONS only lists lower-case names, so normalize before comparing
    extension = os.path.splitext(info['path'])[1][1:].lower()
    if extension in EXTENSIONS['audio'] \
            and video[0].get('codec') in EXTENSIONS['image'] + ['mjpeg']:
        info['cover'] = info.pop('video')
        info['video'] = []
    return info
|
||||
|
||||
def makedirs(path):
|
||||
|
|
@ -353,3 +382,17 @@ def write_path(file):
|
|||
path = os.path.split(file)[0]
|
||||
if path and not os.path.exists(path):
|
||||
os.makedirs(path)
|
||||
|
||||
def iexists(path):
    """Return True if *path* exists, ignoring the case of its last component.

    Only the final path component is matched case-insensitively; the
    containing folder is used exactly as given (the current directory if
    *path* has no folder part).  Like ``os.path.exists`` this returns False
    instead of raising when the folder cannot be listed, e.g. because it is
    missing or is not a directory.
    """
    folder, name = os.path.split(path)
    try:
        entries = os.listdir(folder or '.')
    except OSError:
        # covers FileNotFoundError / NotADirectoryError, also on Python 2
        return False
    wanted = name.lower()
    return any(entry.lower() == wanted for entry in entries)
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ from __future__ import print_function
|
|||
|
||||
import unicodedata
|
||||
|
||||
from six import unichr, PY2
|
||||
from six import unichr, text_type
|
||||
|
||||
__all__ = ['fix_bad_unicode']
|
||||
|
||||
|
|
@ -151,10 +151,7 @@ def text_badness(text):
|
|||
- Improbable single-byte characters, such as ƒ or ¬
|
||||
- Letters in somewhat rare scripts
|
||||
'''
|
||||
if PY2:
|
||||
assert isinstance(text, unicode)
|
||||
else:
|
||||
assert isinstance(text, str)
|
||||
assert isinstance(text, text_type)
|
||||
errors = 0
|
||||
very_weird_things = 0
|
||||
weird_things = 0
|
||||
|
|
|
|||
|
|
@ -1,11 +1,12 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2014
|
||||
from __future__ import with_statement, print_function
|
||||
from __future__ import print_function
|
||||
|
||||
import itertools
|
||||
import mimetypes
|
||||
import random
|
||||
import os
|
||||
import hashlib
|
||||
import sys
|
||||
|
||||
from six import PY2
|
||||
|
|
@ -20,8 +21,7 @@ _fmt = '%%0%dd' % _width
|
|||
|
||||
def _make_boundary():
|
||||
# Craft a random boundary.
|
||||
token = random.randrange(sys.maxsize)
|
||||
boundary = ('=' * 15) + (_fmt % token) + '=='
|
||||
boundary = ('=' * 15) + hashlib.sha1(os.urandom(32)).hexdigest() + '=='
|
||||
return boundary
|
||||
|
||||
class MultiPartForm(object):
|
||||
|
|
@ -75,7 +75,7 @@ class MultiPartForm(object):
|
|||
# line is separated by '\r\n'.
|
||||
parts = []
|
||||
part_boundary = '--' + self.boundary
|
||||
|
||||
|
||||
# Add the form fields
|
||||
parts.extend(
|
||||
[ part_boundary,
|
||||
|
|
@ -85,7 +85,7 @@ class MultiPartForm(object):
|
|||
]
|
||||
for name, value in self.form_fields
|
||||
)
|
||||
|
||||
|
||||
# Add the files to upload
|
||||
parts.extend(
|
||||
[ part_boundary,
|
||||
|
|
@ -97,7 +97,7 @@ class MultiPartForm(object):
|
|||
]
|
||||
for field_name, filename, content_type, body in self.files
|
||||
)
|
||||
|
||||
|
||||
# Flatten the list and add closing boundary marker,
|
||||
# then return CR+LF separated data
|
||||
flattened = list(itertools.chain(*parts))
|
||||
|
|
|
|||
|
|
@ -4,13 +4,14 @@ import math
|
|||
import re
|
||||
import string
|
||||
|
||||
from six import text_type
|
||||
|
||||
def toAZ(num):
|
||||
"""
|
||||
Converts an integer to bijective base 26 string using A-Z
|
||||
|
||||
>>> for i in range(1, 1000): assert fromAZ(toAZ(i)) == i
|
||||
|
||||
|
||||
>>> toAZ(1)
|
||||
'A'
|
||||
|
||||
|
|
@ -20,7 +21,8 @@ def toAZ(num):
|
|||
>>> toAZ(1234567890)
|
||||
'CYWOQVJ'
|
||||
"""
|
||||
if num < 1: raise ValueError("must supply a positive integer")
|
||||
if num < 1:
|
||||
raise ValueError("must supply a positive integer")
|
||||
digits = string.ascii_uppercase
|
||||
az = ''
|
||||
while num != 0:
|
||||
|
|
@ -30,7 +32,7 @@ def toAZ(num):
|
|||
az = digits[r] + az
|
||||
return az
|
||||
|
||||
encode_base26=toAZ
|
||||
encode_base26 = toAZ
|
||||
|
||||
def fromAZ(num):
|
||||
"""
|
||||
|
|
@ -45,7 +47,7 @@ def fromAZ(num):
|
|||
>>> fromAZ('FOO')
|
||||
4461
|
||||
"""
|
||||
num = num.replace('-','')
|
||||
num = num.replace('-', '')
|
||||
digits = string.ascii_uppercase
|
||||
r = 0
|
||||
for exp, char in enumerate(reversed(num)):
|
||||
|
|
@ -64,7 +66,8 @@ def to26(q):
|
|||
>>> to26(347485647)
|
||||
'BDGKMAP'
|
||||
"""
|
||||
if q < 0: raise ValueError("must supply a positive integer")
|
||||
if q < 0:
|
||||
raise ValueError("must supply a positive integer")
|
||||
base26 = string.ascii_uppercase
|
||||
converted = []
|
||||
while q != 0:
|
||||
|
|
@ -73,7 +76,7 @@ def to26(q):
|
|||
converted.insert(0, l)
|
||||
return "".join(converted) or 'A'
|
||||
|
||||
decode_base26=toAZ
|
||||
decode_base26 = toAZ
|
||||
|
||||
def from26(q):
|
||||
"""
|
||||
|
|
@ -82,7 +85,7 @@ def from26(q):
|
|||
0
|
||||
"""
|
||||
base26 = string.ascii_uppercase
|
||||
q = q.replace('-','')
|
||||
q = q.replace('-', '')
|
||||
r = 0
|
||||
for i in q:
|
||||
r = r * 26 + base26.index(i.upper())
|
||||
|
|
@ -123,7 +126,8 @@ def to32(q):
|
|||
ValueError: must supply a positive integer
|
||||
"""
|
||||
|
||||
if q < 0: raise ValueError("must supply a positive integer")
|
||||
if q < 0:
|
||||
raise ValueError("must supply a positive integer")
|
||||
letters = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
|
||||
converted = []
|
||||
while q != 0:
|
||||
|
|
@ -188,7 +192,7 @@ def from32(q):
|
|||
'Z': 31,
|
||||
}
|
||||
base32 = ('0123456789' + string.ascii_uppercase)[:32]
|
||||
q = q.replace('-','')
|
||||
q = q.replace('-', '')
|
||||
q = ''.join([base32[_32map[i.upper()]] for i in q])
|
||||
return int(q, 32)
|
||||
|
||||
|
|
@ -210,7 +214,8 @@ def to36(q):
|
|||
...
|
||||
ValueError: must supply a positive integer
|
||||
"""
|
||||
if q < 0: raise ValueError("must supply a positive integer")
|
||||
if q < 0:
|
||||
raise ValueError("must supply a positive integer")
|
||||
letters = "0123456789abcdefghijklmnopqrstuvwxyz"
|
||||
converted = []
|
||||
while q != 0:
|
||||
|
|
@ -233,7 +238,7 @@ def int_value(strValue, default=u''):
|
|||
u''
|
||||
"""
|
||||
try:
|
||||
val = re.compile('(\d+)').findall(unicode(strValue).strip())[0]
|
||||
val = re.compile('(\d+)').findall(text_type(strValue).strip())[0]
|
||||
except:
|
||||
val = default
|
||||
return val
|
||||
|
|
@ -250,7 +255,7 @@ def float_value(strValue, default=u''):
|
|||
u''
|
||||
"""
|
||||
try:
|
||||
val = re.compile('([\d.]+)').findall(unicode(strValue).strip())[0]
|
||||
val = re.compile('([\d.]+)').findall(text_type(strValue).strip())[0]
|
||||
except:
|
||||
val = default
|
||||
return val
|
||||
|
|
@ -286,7 +291,7 @@ def format_number(number, longName, shortName):
|
|||
n = number / math.pow(1024, i + 1)
|
||||
return '%s %s%s' % (format_thousands('%.*f' % (i, n)), prefix[i], shortName)
|
||||
|
||||
def format_thousands(number, separator = ','):
|
||||
def format_thousands(number, separator=','):
|
||||
"""
|
||||
Return the number with separators (1,000,000)
|
||||
|
||||
|
|
@ -316,18 +321,18 @@ def format_pixels(number):
|
|||
return format_number(number, 'pixel', 'px')
|
||||
|
||||
def format_currency(amount, currency="$"):
    """Format *amount* as a whole number with thousands separators.

    The amount is rendered with two decimals, commas are inserted between
    groups of three digits and the decimal part is then cut off.  The
    *currency* symbol is put in front, after the minus sign for negative
    amounts.  A false *amount* (``0``, ``None``) gives an empty string.
    """
    if not amount:
        return ""
    text = "%.2f" % amount
    grouping = re.compile(r"(\d)(\d\d\d[.,])")
    replaced = 1
    # every pass inserts one more level of separators, right to left
    while replaced:
        text, replaced = grouping.subn(r"\1,\2", text)
    sign = ""
    if text.startswith('-'):
        sign, text = "-", text[1:]
    # drop ".dd"
    return sign + currency + text[:-3]
|
||||
|
||||
def plural(amount, unit, plural='s'):
|
||||
'''
|
||||
|
|
@ -339,7 +344,8 @@ def plural(amount, unit, plural='s'):
|
|||
if abs(amount) != 1:
|
||||
if plural == 's':
|
||||
unit = unit + plural
|
||||
else: unit = plural
|
||||
else:
|
||||
unit = plural
|
||||
return "%s %s" % (format_thousands(amount), unit)
|
||||
|
||||
def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
|
||||
|
|
@ -390,14 +396,14 @@ def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
|
|||
duration += ".%03d" % ms
|
||||
else:
|
||||
if verbosity == 1:
|
||||
durations = ["%sd" % d, "%sh" % h, "%sm" % m, "%ss" % s]
|
||||
durations = ["%sd" % d, "%sh" % h, "%sm" % m, "%ss" % s]
|
||||
if years:
|
||||
durations.insert(0, "%sy" % y)
|
||||
if milliseconds:
|
||||
durations.append("%sms" % ms)
|
||||
else:
|
||||
durations = [plural(d, 'day'), plural(h,'hour'),
|
||||
plural(m, 'minute'), plural(s, 'second')]
|
||||
durations = [plural(d, 'day'), plural(h, 'hour'),
|
||||
plural(m, 'minute'), plural(s, 'second')]
|
||||
if years:
|
||||
durations.insert(0, plural(y, 'year'))
|
||||
if milliseconds:
|
||||
|
|
@ -434,7 +440,7 @@ def parse_timecode(string):
|
|||
'''
|
||||
timecode = 0
|
||||
for i, v in enumerate(list(reversed(string.split(':')))[:4]):
|
||||
timecode += float(v) * ( 86400 if i == 3 else pow(60, i))
|
||||
timecode += float(v) * (86400 if i == 3 else pow(60, i))
|
||||
return timecode
|
||||
|
||||
def ms2runtime(ms, shortenLong=False):
|
||||
|
|
@ -482,7 +488,8 @@ def time2ms(timeString):
|
|||
p = timeString.split(':')
|
||||
for i in range(len(p)):
|
||||
_p = p[i]
|
||||
if _p.endswith('.'): _p =_p[:-1]
|
||||
if _p.endswith('.'):
|
||||
_p = _p[:-1]
|
||||
ms = ms * 60 + float(_p)
|
||||
return int(ms * 1000)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2008
|
||||
from __future__ import with_statement, print_function
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
|
|||
|
||||
# Configuration for add_links() function
|
||||
|
||||
LEADING_PUNCTUATION = ['(', '<', '<']
|
||||
LEADING_PUNCTUATION = ['(', '<', '<']
|
||||
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '>', "'", '"']
|
||||
|
||||
# list of possible strings used for bullets in bulleted lists
|
||||
|
|
@ -18,16 +18,16 @@ DOTS = ['·', '*', '\xe2\x80\xa2', '•', '•', '•']
|
|||
|
||||
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
|
||||
word_split_re = re.compile(r'(\s+)')
|
||||
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
|
||||
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
|
||||
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
|
||||
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % (
|
||||
'|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
|
||||
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
|
||||
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
|
||||
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
|
||||
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
|
||||
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
|
||||
trailing_empty_content_re = re.compile(r'(?:<p>(?: |\s|<br \/>)*?</p>\s*)+\Z')
|
||||
if PY2:
|
||||
del x # Temporary variable
|
||||
del x # Temporary variable
|
||||
|
||||
def escape(html):
|
||||
'''
|
||||
|
|
@ -44,7 +44,7 @@ def linebreaks(value):
|
|||
'''
|
||||
Converts newlines into <p> and <br />
|
||||
'''
|
||||
value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
|
||||
value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
|
||||
paras = re.split('\n{2,}', value)
|
||||
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
|
||||
return '\n\n'.join(paras)
|
||||
|
|
@ -83,21 +83,23 @@ def add_links(text, trim_url_limit=None, nofollow=False):
|
|||
|
||||
If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
|
||||
"""
|
||||
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
|
||||
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >= limit and '...' or '')) or x
|
||||
words = word_split_re.split(text)
|
||||
nofollow_attr = nofollow and ' rel="nofollow"' or ''
|
||||
for i, word in enumerate(words):
|
||||
match = punctuation_re.match(word)
|
||||
if match:
|
||||
lead, middle, trail = match.groups()
|
||||
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
|
||||
len(middle) > 0 and middle[0] in letters + string.digits and \
|
||||
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
|
||||
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and
|
||||
len(middle) > 0 and middle[0] in letters + string.digits and
|
||||
(middle.endswith('.org') or
|
||||
middle.endswith('.net') or
|
||||
middle.endswith('.com'))):
|
||||
middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
|
||||
if middle.startswith('http://') or middle.startswith('https://'):
|
||||
middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
|
||||
if '@' in middle and not middle.startswith('www.') and not ':' in middle \
|
||||
and simple_email_re.match(middle):
|
||||
if '@' in middle and not middle.startswith('www.') and ':' not in middle \
|
||||
and simple_email_re.match(middle):
|
||||
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
|
||||
if lead + middle + trail != word:
|
||||
words[i] = lead + middle + trail
|
||||
|
|
@ -127,6 +129,7 @@ def clean_html(text):
|
|||
# Trim stupid HTML such as <br clear="all">.
|
||||
text = html_gunk_re.sub('', text)
|
||||
# Convert hard-coded bullets into HTML unordered lists.
|
||||
|
||||
def replace_p_tags(match):
|
||||
s = match.group().replace('</p>', '</li>')
|
||||
for d in DOTS:
|
||||
|
|
@ -153,6 +156,7 @@ def decode_html(html):
|
|||
if isinstance(html, bytes):
|
||||
html = html.decode('utf-8')
|
||||
uchr = unichr
|
||||
|
||||
def entitydecode(match, uchr=uchr):
|
||||
entity = match.group(1)
|
||||
if entity == '#x80':
|
||||
|
|
@ -282,7 +286,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
|||
{'name': 'thead'},
|
||||
{'name': 'tr'},
|
||||
# other
|
||||
{'name': '[]'},
|
||||
{'name': '[]'},
|
||||
{
|
||||
'name': 'a',
|
||||
'required': ['href'],
|
||||
|
|
@ -328,15 +332,14 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
|||
|
||||
for tag in tags:
|
||||
valid_attributes[tag['name']] = tag.get('required', []) \
|
||||
+ tag.get('optional', []) \
|
||||
+ global_attributes
|
||||
+ tag.get('optional', []) + global_attributes
|
||||
required_attributes[tag['name']] = tag.get('required', [])
|
||||
validation[tag['name']] = tag.get('validation', {})
|
||||
|
||||
if '[]' in validation:
|
||||
html = re.sub(
|
||||
re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
|
||||
'<a href="\\1">\\3</a>', html);
|
||||
'<a href="\\1">\\3</a>', html)
|
||||
|
||||
parts = split_tags(html)
|
||||
for i, part in enumerate(parts):
|
||||
|
|
@ -351,17 +354,17 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
|||
a = attr_re.findall(attributes)
|
||||
attrs = dict(a)
|
||||
|
||||
if not closing and not name in non_closing_tags:
|
||||
if not closing and name not in non_closing_tags:
|
||||
level += 1
|
||||
|
||||
if not attrs and attributes or name not in valid_tags:
|
||||
if not attrs and attributes or name not in valid_tags:
|
||||
valid = False
|
||||
else:
|
||||
valid = True
|
||||
for key in set(attrs) - set(valid_attributes[name]):
|
||||
del attrs[key]
|
||||
for key in required_attributes[tag['name']]:
|
||||
if not key in attrs:
|
||||
if key not in attrs:
|
||||
valid = False
|
||||
|
||||
if valid:
|
||||
|
|
@ -395,6 +398,7 @@ def sanitize_html(html, tags=None, global_attributes=[]):
|
|||
|
||||
def split_tags(string):
|
||||
tags = []
|
||||
|
||||
def collect(match):
|
||||
tags.append(match.group(0))
|
||||
return '\0'
|
||||
|
|
|
|||
|
|
@ -14,12 +14,13 @@ except:
|
|||
import ImageFont
|
||||
|
||||
|
||||
ZONE_INDEX = []
|
||||
for pixel_index in range(64):
|
||||
x, y = pixel_index % 8, int(pixel_index / 8)
|
||||
ZONE_INDEX.append(int(x / 2) + int(y / 4) * 4)
|
||||
del x
|
||||
del y
|
||||
ZONE_INDEX = [
|
||||
(int(x / 2) + int(y / 4) * 4)
|
||||
for x, y in [
|
||||
(pixel_index % 8, int(pixel_index / 8))
|
||||
for pixel_index in range(64)
|
||||
]
|
||||
]
|
||||
|
||||
def drawText(image, position, text, font_file, font_size, color):
|
||||
draw = ImageDraw.Draw(image)
|
||||
|
|
@ -165,8 +166,10 @@ def wrapText(text, max_width, max_lines, font_file, font_size):
|
|||
if width <= max_width and width > min_width:
|
||||
min_width = width
|
||||
return min_width
|
||||
|
||||
def get_width(string):
|
||||
return draw.textsize(string, font=font)[0]
|
||||
|
||||
image = Image.new('RGB', (1, 1))
|
||||
draw = ImageDraw.Draw(image)
|
||||
font = ImageFont.truetype(font_file, font_size, encoding='unic')
|
||||
|
|
|
|||
|
|
@ -208,7 +208,7 @@ def langTo3Code(lang):
|
|||
if lang:
|
||||
lang = langEnglishName(lang)
|
||||
if lang:
|
||||
lang=lang.lower()
|
||||
lang = lang.lower()
|
||||
for l in _iso639_languages:
|
||||
if l[0].lower() == lang:
|
||||
return l[3]
|
||||
|
|
@ -218,7 +218,7 @@ def langTo2Code(lang):
|
|||
if lang:
|
||||
lang = langEnglishName(lang)
|
||||
if lang:
|
||||
lang=lang.lower()
|
||||
lang = lang.lower()
|
||||
for l in _iso639_languages:
|
||||
if l[0].lower() == lang:
|
||||
return l[2]
|
||||
|
|
|
|||
|
|
@ -11,9 +11,9 @@ def minify(source, comment=''):
|
|||
pass
|
||||
# python2 performance with unicode string is terrible
|
||||
if PY2:
|
||||
if isinstance(source, unicode):
|
||||
if isinstance(source, unicode): # pylint: disable=undefined-variable
|
||||
source = source.encode('utf-8')
|
||||
if isinstance(comment, unicode):
|
||||
if isinstance(comment, unicode): # pylint: disable=undefined-variable
|
||||
comment = comment.encode('utf-8')
|
||||
tokens = tokenize(source)
|
||||
length = len(tokens)
|
||||
|
|
@ -30,20 +30,20 @@ def minify(source, comment=''):
|
|||
# numbers or strings or unary operators or grouping operators
|
||||
# with a single newline, otherwise remove it
|
||||
if prevToken and nextToken\
|
||||
and (prevToken['type'] in ['identifier', 'number', 'string']\
|
||||
or prevToken['value'] in ['++', '--', ')', ']', '}'])\
|
||||
and (nextToken['type'] in ['identifier', 'number', 'string']\
|
||||
or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']):
|
||||
and (prevToken['type'] in ['identifier', 'number', 'string']
|
||||
or prevToken['value'] in ['++', '--', ')', ']', '}']) \
|
||||
and (nextToken['type'] in ['identifier', 'number', 'string']
|
||||
or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']):
|
||||
minified += '\n'
|
||||
elif token['type'] == 'whitespace':
|
||||
# replace whitespace between two tokens that are identifiers or
|
||||
# numbers, or between a token that ends with "+" or "-" and one that
|
||||
# begins with "+" or "-", with a single space, otherwise remove it
|
||||
if prevToken and nextToken\
|
||||
and ((prevToken['type'] in ['identifier', 'number']\
|
||||
and nextToken['type'] in ['identifier', 'number'])
|
||||
or (prevToken['value'] in ['+', '-', '++', '--']
|
||||
and nextToken['value'] in ['+', '-', '++', '--'])):
|
||||
if prevToken and nextToken \
|
||||
and ((prevToken['type'] in ['identifier', 'number'] and
|
||||
nextToken['type'] in ['identifier', 'number']) or
|
||||
(prevToken['value'] in ['+', '-', '++', '--'] and
|
||||
nextToken['value'] in ['+', '-', '++', '--'])):
|
||||
minified += ' '
|
||||
elif token['type'] != 'comment':
|
||||
# remove comments and leave all other tokens untouched
|
||||
|
|
@ -178,7 +178,7 @@ def tokenize(source):
|
|||
'value': value
|
||||
})
|
||||
if type == 'comment':
|
||||
lines = value.split('\n');
|
||||
lines = value.split('\n')
|
||||
column = len(lines[-1])
|
||||
line += len(lines) - 1
|
||||
elif type == 'linebreak':
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import with_statement, print_function
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
|
||||
|
|
@ -23,11 +23,11 @@ def loads(source):
|
|||
try:
|
||||
m = re.search(r'line (\d+) column (\d+)', msg)
|
||||
if m:
|
||||
(lineno, colno) = map(int, m.groups())
|
||||
(lineno, colno) = [int(n) for n in m.groups()]
|
||||
except:
|
||||
pass
|
||||
if lineno and colno:
|
||||
s = minified.split('\n')
|
||||
context = s[lineno-1][max(0, colno-30):colno+30]
|
||||
msg += ' at:\n\n %s\n %s\033[1m^\033[0m' %(context, ' ' * (colno - max(0, colno-30) - 2))
|
||||
msg += ' at:\n\n %s\n %s\033[1m^\033[0m' % (context, ' ' * (colno - max(0, colno-30) - 2))
|
||||
raise ValueError(msg)
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ def format_path(data, directory_key='director'):
|
|||
director = data['directorSort'] or ['Unknown Director']
|
||||
title = data['seriesTitle' if data['isEpisode'] else 'title'] or 'Untitled'
|
||||
year = data['seriesYear' if data['isEpisode'] else 'year'] or None
|
||||
parts = list(map(format_underscores, filter(lambda x: x != None, [
|
||||
parts = list(map(format_underscores, filter(lambda x: x is not None, [
|
||||
u'; '.join(director[:10]),
|
||||
u'%s%s' % (title, u' (%s)' % year if year else ''),
|
||||
u'%s%s%s%s%s%s' % (
|
||||
|
|
@ -60,7 +60,7 @@ def parse_item_files(files):
|
|||
def get_version_key(file, extension=True):
|
||||
return '%s/%s-part/%s' % (
|
||||
file['version'] or '',
|
||||
'single' if file['part'] == None else 'multi',
|
||||
'single' if file['part'] is None else 'multi',
|
||||
file['extension'] if extension else ''
|
||||
)
|
||||
# filter out duplicate files (keep shortest path, sorted alphabetically)
|
||||
|
|
@ -70,7 +70,7 @@ def parse_item_files(files):
|
|||
duplicate_files = []
|
||||
for key in [get_file_key(file) for file in files]:
|
||||
key_files = sorted(
|
||||
sorted([file for file in files if get_file_key(file) == key]),
|
||||
[file for file in files if get_file_key(file) == key],
|
||||
key=lambda x: len(x['path'])
|
||||
)
|
||||
unique_files.append(key_files[0])
|
||||
|
|
@ -114,10 +114,8 @@ def parse_item_files(files):
|
|||
# determine preferred subtitle language
|
||||
language[version_key] = None
|
||||
subtitle_files = [file for file in version_files[version_key] if file['extension'] == 'srt']
|
||||
for subtitle_language in sorted(
|
||||
list(set([file['language'] for file in subtitle_files])),
|
||||
key=lambda x: LANGUAGES.index(x) if x in LANGUAGES else x
|
||||
):
|
||||
subtitle_languages = list(set([file['language'] for file in subtitle_files]))
|
||||
for subtitle_language in sorted(subtitle_languages, key=subtitle_sort):
|
||||
language_files = [file for file in subtitle_files if file['language'] == subtitle_language]
|
||||
if len(subtitle_files) == len(parts):
|
||||
language[version_key] = subtitle_language
|
||||
|
|
@ -188,25 +186,30 @@ def parse_path(path, directory_key='director'):
|
|||
|
||||
# TODO: '.com.avi'
|
||||
'''
|
||||
|
||||
def parse_type(string):
|
||||
for type in EXTENSIONS:
|
||||
if string in EXTENSIONS[type]:
|
||||
return type
|
||||
return None
|
||||
|
||||
def parse_underscores(string):
|
||||
string = unicodedata.normalize('NFC', string)
|
||||
# '^_' or '_$' is '.'
|
||||
string = re.sub('^_', '.', string)
|
||||
string = re.sub('_$', '.', string)
|
||||
# '_.foo$' or '_ (' is '?'
|
||||
string = re.sub('_(?=(\.\w+$| \())', '?', string)
|
||||
string = re.sub(re.compile('_(?=(\.\w+$| \())', re.U), '?', string)
|
||||
# ' _..._ ' is '<...>'
|
||||
string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string)
|
||||
# 'foo_bar' or 'foo _ bar' is '/'
|
||||
string = re.sub('(?<=\w)_(?=\w)', '/', string)
|
||||
string = re.sub(re.compile('(?<=\w)_(?=\w)', re.U), '/', string)
|
||||
string = re.sub(' _ ', ' / ', string)
|
||||
# 'foo_ ' is ':'
|
||||
string = re.sub('(?<=\w)_ ', ': ', string)
|
||||
string = re.sub(re.compile('(?<=\w)_ ', re.U), ': ', string)
|
||||
string = unicodedata.normalize('NFD', string)
|
||||
return string
|
||||
|
||||
data = {}
|
||||
parts = list(map(lambda x: parse_underscores(x.strip()), unicodedata.normalize('NFD', path).split('/')))
|
||||
# subdirectory
|
||||
|
|
@ -269,12 +272,12 @@ def parse_path(path, directory_key='director'):
|
|||
# isEpisode, seriesTitle, seriesYear
|
||||
data['isEpisode'] = False
|
||||
data['seriesTitle'] = data['seriesYear'] = None
|
||||
if data['season'] != None or data['episode'] != None or data['episodes']:
|
||||
if data['season'] is not None or data['episode'] is not None or data['episodes']:
|
||||
data['isEpisode'] = True
|
||||
data['seriesTitle'] = data['title']
|
||||
season = 'S%02d' % data['season'] if data['season'] != None else ''
|
||||
season = 'S%02d' % data['season'] if data['season'] is not None else ''
|
||||
episode = ''
|
||||
if data['episode'] != None:
|
||||
if data['episode'] is not None:
|
||||
episode = 'E%02d' % data['episode']
|
||||
elif data['episodes']:
|
||||
episode = 'E%02d%s%02d' % (
|
||||
|
|
@ -356,7 +359,7 @@ def parse_movie_path(path):
|
|||
director = "%s." % director[:-1]
|
||||
director = director.split('; ')
|
||||
director = [normalize_name(d).strip() for d in director]
|
||||
director = filter(lambda d: d not in ('Unknown Director', 'Various Directors'), director)
|
||||
director = list(filter(lambda d: d not in ('Unknown Director', 'Various Directors'), director))
|
||||
else:
|
||||
director = []
|
||||
|
||||
|
|
@ -376,9 +379,9 @@ def parse_movie_path(path):
|
|||
season = match.group(3)
|
||||
episode = match.group(5)
|
||||
episodeTitle = (match.group(6) or '').strip()
|
||||
if episode != None:
|
||||
if episode is not None:
|
||||
episode = int(episode)
|
||||
if season != None:
|
||||
if season is not None:
|
||||
season = int(season)
|
||||
if episode and not season:
|
||||
season = 1
|
||||
|
|
@ -396,7 +399,7 @@ def parse_movie_path(path):
|
|||
else:
|
||||
episode = None
|
||||
|
||||
if episode and 'Episode %d'%episode in fileparts:
|
||||
if episode and 'Episode %d' % episode in fileparts:
|
||||
episodeTitle = fileparts.index('Episode %d' % episode) + 1
|
||||
episodeTitle = fileparts[episodeTitle]
|
||||
if episodeTitle == extension or episodeTitle.startswith('Part'):
|
||||
|
|
@ -482,3 +485,11 @@ def get_oxid(title, director=[], year='',
|
|||
oxid = get_hash('\n'.join([director, title, str(year), str(season)]))[:8] + \
|
||||
get_hash('\n'.join([str(episode), episode_director, episode_title, str(episode_year)]))[:8]
|
||||
return u'0x' + oxid
|
||||
|
||||
def subtitle_sort(language):
|
||||
if language in LANGUAGES:
|
||||
return str(LANGUAGES.index(language))
|
||||
elif language is None:
|
||||
return str(len(LANGUAGES))
|
||||
else:
|
||||
return language
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2008
|
||||
from __future__ import with_statement, print_function
|
||||
from __future__ import print_function
|
||||
import gzip
|
||||
import json
|
||||
import os
|
||||
|
|
@ -16,14 +16,14 @@ from chardet.universaldetector import UniversalDetector
|
|||
DEBUG = False
|
||||
# Default headers for HTTP requests.
|
||||
DEFAULT_HEADERS = {
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0',
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',
|
||||
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-us,en;q=0.5',
|
||||
'Accept-Encoding': 'gzip'
|
||||
'Accept-Language': 'en-US,en;q=0.8,fr;q=0.6,de;q=0.4',
|
||||
'Accept-Encoding': 'gzip',
|
||||
}
|
||||
|
||||
def status(url, data=None, headers=DEFAULT_HEADERS):
|
||||
def status(url, data=None, headers=None):
|
||||
try:
|
||||
f = open_url(url, data, headers)
|
||||
s = f.code
|
||||
|
|
@ -31,13 +31,13 @@ def status(url, data=None, headers=DEFAULT_HEADERS):
|
|||
s = e.code
|
||||
return s
|
||||
|
||||
def exists(url, data=None, headers=DEFAULT_HEADERS):
|
||||
def exists(url, data=None, headers=None):
|
||||
s = status(url, data, headers)
|
||||
if s >= 200 and s < 400:
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_headers(url, data=None, headers=DEFAULT_HEADERS):
|
||||
def get_headers(url, data=None, headers=None):
|
||||
try:
|
||||
f = open_url(url, data, headers)
|
||||
f.headers['Status'] = "%s" % f.code
|
||||
|
|
@ -48,10 +48,12 @@ def get_headers(url, data=None, headers=DEFAULT_HEADERS):
|
|||
headers = e.headers
|
||||
return dict(headers)
|
||||
|
||||
def get_json(url, data=None, headers=DEFAULT_HEADERS):
|
||||
return json.loads(read_url(url, data, headers).decode('utf-8'))
|
||||
def get_json(url, data=None, headers=None):
|
||||
return json.loads(read_url(url, data, headers).decode('utf-8')) # pylint: disable=no-member
|
||||
|
||||
def open_url(url, data=None, headers=DEFAULT_HEADERS):
|
||||
def open_url(url, data=None, headers=None):
|
||||
if headers is None:
|
||||
headers = DEFAULT_HEADERS.copy()
|
||||
if PY2:
|
||||
if not isinstance(url, bytes):
|
||||
url = url.encode('utf-8')
|
||||
|
|
@ -64,7 +66,7 @@ def open_url(url, data=None, headers=DEFAULT_HEADERS):
|
|||
req = urllib.request.Request(url, data, headers)
|
||||
return urllib.request.urlopen(req)
|
||||
|
||||
def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
|
||||
def read_url(url, data=None, headers=None, return_headers=False, unicode=False):
|
||||
if DEBUG:
|
||||
print('ox.net.read_url', url)
|
||||
f = open_url(url, data, headers)
|
||||
|
|
@ -108,7 +110,7 @@ def detect_encoding(data):
|
|||
detector.close()
|
||||
return detector.result['encoding']
|
||||
|
||||
get_url=read_url
|
||||
get_url = read_url
|
||||
|
||||
def save_url(url, filename, overwrite=False):
|
||||
if not os.path.exists(filename) or overwrite:
|
||||
|
|
@ -119,51 +121,50 @@ def save_url(url, filename, overwrite=False):
|
|||
with open(filename, 'wb') as f:
|
||||
f.write(data)
|
||||
|
||||
def _get_size(url):
|
||||
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
|
||||
req.get_method = lambda: 'HEAD'
|
||||
u = urllib.request.urlopen(req)
|
||||
if u.code != 200 or 'Content-Length' not in u.headers:
|
||||
raise IOError
|
||||
return int(u.headers['Content-Length'])
|
||||
|
||||
def _get_range(url, start, end):
|
||||
headers = DEFAULT_HEADERS.copy()
|
||||
headers['Range'] = 'bytes=%s-%s' % (start, end)
|
||||
req = urllib.request.Request(url, headers=headers)
|
||||
u = urllib.request.urlopen(req)
|
||||
return u.read()
|
||||
|
||||
def oshash(url):
|
||||
def get_size(url):
|
||||
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
|
||||
req.get_method = lambda : 'HEAD'
|
||||
u = urllib.request.urlopen(req)
|
||||
if u.code != 200 or not 'Content-Length' in u.headers:
|
||||
raise IOError
|
||||
return int(u.headers['Content-Length'])
|
||||
|
||||
def get_range(url, start, end):
|
||||
headers = DEFAULT_HEADERS.copy()
|
||||
headers['Range'] = 'bytes=%s-%s' % (start, end)
|
||||
req = urllib.request.Request(url, headers=headers)
|
||||
u = urllib.request.urlopen(req)
|
||||
return u.read()
|
||||
|
||||
try:
|
||||
longlongformat = 'q' # long long
|
||||
bytesize = struct.calcsize(longlongformat)
|
||||
|
||||
filesize = get_size(url)
|
||||
hash = filesize
|
||||
head = get_range(url, 0, min(filesize, 65536))
|
||||
filesize = _get_size(url)
|
||||
hash_ = filesize
|
||||
head = _get_range(url, 0, min(filesize, 65536))
|
||||
if filesize > 65536:
|
||||
tail = get_range(url, filesize-65536, filesize)
|
||||
tail = _get_range(url, filesize-65536, filesize)
|
||||
if filesize < 65536:
|
||||
f = BytesIO(head)
|
||||
for x in range(int(filesize/bytesize)):
|
||||
for _ in range(int(filesize/bytesize)):
|
||||
buffer = f.read(bytesize)
|
||||
(l_value,)= struct.unpack(longlongformat, buffer)
|
||||
hash += l_value
|
||||
hash = hash & 0xFFFFFFFFFFFFFFFF #cut off 64bit overflow
|
||||
(l_value,) = struct.unpack(longlongformat, buffer)
|
||||
hash_ += l_value
|
||||
hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut off 64bit overflow
|
||||
else:
|
||||
for offset in range(0, 65536, bytesize):
|
||||
buffer = head[offset:offset+bytesize]
|
||||
(l_value,)= struct.unpack(longlongformat, buffer)
|
||||
hash += l_value
|
||||
hash = hash & 0xFFFFFFFFFFFFFFFF #cut of 64bit overflow
|
||||
(l_value,) = struct.unpack(longlongformat, buffer)
|
||||
hash_ += l_value
|
||||
hash_ = hash_ & 0xFFFFFFFFFFFFFFFF # cut of 64bit overflow
|
||||
for offset in range(0, 65536, bytesize):
|
||||
buffer = tail[offset:offset+bytesize]
|
||||
(l_value,)= struct.unpack(longlongformat, buffer)
|
||||
hash += l_value
|
||||
hash = hash & 0xFFFFFFFFFFFFFFFF
|
||||
returnedhash = "%016x" % hash
|
||||
(l_value,) = struct.unpack(longlongformat, buffer)
|
||||
hash_ += l_value
|
||||
hash_ = hash_ & 0xFFFFFFFFFFFFFFFF
|
||||
returnedhash = "%016x" % hash_
|
||||
return returnedhash
|
||||
except(IOError):
|
||||
except IOError:
|
||||
return "IOError"
|
||||
|
||||
|
|
|
|||
|
|
@ -18,7 +18,8 @@ _articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
|
|||
_articlesDict = dict([(x, x) for x in _articles])
|
||||
_spArticles = []
|
||||
for article in _articles:
|
||||
if article[-1] not in ("'", '-'): article += ' '
|
||||
if article[-1] not in ("'", '-'):
|
||||
article += ' '
|
||||
_spArticles.append(article)
|
||||
|
||||
_noarticles = (
|
||||
|
|
@ -50,8 +51,10 @@ def canonical_title(title):
|
|||
'Los Angeles Plays Itself'
|
||||
"""
|
||||
try:
|
||||
if _articlesDict.has_key(title.split(', ')[-1].lower()): return title
|
||||
except IndexError: pass
|
||||
if title.split(', ')[-1].lower() in _articlesDict:
|
||||
return title
|
||||
except IndexError:
|
||||
pass
|
||||
ltitle = title.lower()
|
||||
for start in _noarticles:
|
||||
if ltitle.startswith(start):
|
||||
|
|
@ -60,7 +63,8 @@ def canonical_title(title):
|
|||
if ltitle.startswith(article):
|
||||
lart = len(article)
|
||||
title = '%s, %s' % (title[lart:], title[:lart])
|
||||
if article[-1] == ' ': title = title[:-1]
|
||||
if article[-1] == ' ':
|
||||
title = title[:-1]
|
||||
break
|
||||
## XXX: an attempt using a dictionary lookup.
|
||||
##for artSeparator in (' ', "'", '-'):
|
||||
|
|
@ -82,9 +86,10 @@ def normalize_title(title):
|
|||
'The Movie Title'
|
||||
"""
|
||||
stitle = title.split(', ')
|
||||
if len(stitle) > 1 and _articlesDict.has_key(stitle[-1].lower()):
|
||||
if len(stitle) > 1 and stitle[-1].lower() in _articlesDict:
|
||||
sep = ' '
|
||||
if stitle[-1][-1] in ("'", '-'): sep = ''
|
||||
if stitle[-1][-1] in ("'", '-'):
|
||||
sep = ''
|
||||
title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
|
||||
return title
|
||||
|
||||
|
|
@ -139,7 +144,8 @@ def canonical_name(name):
|
|||
# Don't convert names already in the canonical format.
|
||||
if name in ('Unknown Director', ):
|
||||
return name
|
||||
if name.find(', ') != -1: return name
|
||||
if name.find(', ') != -1:
|
||||
return name
|
||||
sname = name.split(' ')
|
||||
snl = len(sname)
|
||||
if snl == 2:
|
||||
|
|
@ -147,11 +153,14 @@ def canonical_name(name):
|
|||
name = '%s, %s' % (sname[1], sname[0])
|
||||
elif snl > 2:
|
||||
lsname = [x.lower() for x in sname]
|
||||
if snl == 3: _indexes = (0, snl-2)
|
||||
else: _indexes = (0, snl-2, snl-3)
|
||||
if snl == 3:
|
||||
_indexes = (0, snl-2)
|
||||
else:
|
||||
_indexes = (0, snl-2, snl-3)
|
||||
# Check for common surname prefixes at the beginning and near the end.
|
||||
for index in _indexes:
|
||||
if lsname[index] not in _sname_suffixes: continue
|
||||
if lsname[index] not in _sname_suffixes:
|
||||
continue
|
||||
try:
|
||||
# Build the surname.
|
||||
surn = '%s %s' % (sname[index], sname[index+1])
|
||||
|
|
@ -194,11 +203,12 @@ def normalize_name(name):
|
|||
|
||||
def normalize_path(path):
|
||||
path = path.replace(':', '_').replace('/', '_')
|
||||
if path.endswith('.'): path = path[:-1] + '_'
|
||||
if path.endswith('.'):
|
||||
path = path[:-1] + '_'
|
||||
return path
|
||||
|
||||
def strip_accents(s):
|
||||
if isinstance(s, str):
|
||||
s = unicode(s)
|
||||
s = s.decode('utf-8')
|
||||
return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
|
||||
|
||||
|
|
|
|||
|
|
@ -6,13 +6,16 @@ from . import cache
|
|||
from .text import find_re
|
||||
from .utils import json, ET
|
||||
|
||||
|
||||
def get_embed_code(url, maxwidth=None, maxheight=None):
|
||||
embed = {}
|
||||
header = cache.get_headers(url)
|
||||
if header.get('content-type', '').startswith('text/html'):
|
||||
html = cache.read_url(url)
|
||||
json_oembed = filter(lambda l: 'json+oembed' in l, re.compile('<link.*?>').findall(html))
|
||||
xml_oembed = filter(lambda l: 'xml+oembed' in l, re.compile('<link.*?>').findall(html))
|
||||
links = re.compile('<link.*?>').findall(html)
|
||||
json_oembed = [l for l in links if 'json+oembed' in l]
|
||||
xml_oembed = [l for l in links if 'xml+oembed' in l]
|
||||
|
||||
if json_oembed:
|
||||
oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
|
||||
if maxwidth:
|
||||
|
|
@ -21,7 +24,7 @@ def get_embed_code(url, maxwidth=None, maxheight=None):
|
|||
oembed_url += '&maxheight=%d' % maxheight
|
||||
embed = json.loads(cache.read_url(oembed_url))
|
||||
elif xml_oembed:
|
||||
oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
|
||||
oembed_url = find_re(xml_oembed[0], 'href="(.*?)"')
|
||||
if maxwidth:
|
||||
oembed_url += '&maxwidth=%d' % maxwidth
|
||||
if maxheight:
|
||||
|
|
|
|||
|
|
@ -1,10 +1,11 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import with_statement, division, print_function
|
||||
import chardet
|
||||
import re
|
||||
from __future__ import division, print_function
|
||||
import codecs
|
||||
import re
|
||||
|
||||
import chardet
|
||||
from six import PY2
|
||||
import ox
|
||||
|
||||
|
||||
|
|
@ -12,18 +13,21 @@ __all__ = []
|
|||
|
||||
|
||||
def _detect_encoding(fp):
|
||||
bomDict={ # bytepattern : name
|
||||
(0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
|
||||
(0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
|
||||
(0xFE, 0xFF, None, None): "utf_16_be",
|
||||
(0xFF, 0xFE, None, None): "utf_16_le",
|
||||
(0xEF, 0xBB, 0xBF, None): "utf_8",
|
||||
}
|
||||
bomDict = { # bytepattern : name
|
||||
(0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
|
||||
(0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
|
||||
(0xFE, 0xFF, None, None): "utf_16_be",
|
||||
(0xFF, 0xFE, None, None): "utf_16_le",
|
||||
(0xEF, 0xBB, 0xBF, None): "utf_8",
|
||||
}
|
||||
|
||||
# go to beginning of file and get the first 4 bytes
|
||||
oldFP = fp.tell()
|
||||
fp.seek(0)
|
||||
(byte1, byte2, byte3, byte4) = tuple(map(ord, fp.read(4)))
|
||||
if PY2:
|
||||
(byte1, byte2, byte3, byte4) = [ord(b) for b in fp.read(4)]
|
||||
else:
|
||||
(byte1, byte2, byte3, byte4) = fp.read(4)
|
||||
|
||||
# try bom detection using 4 bytes, 3 bytes, or 2 bytes
|
||||
bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
|
||||
|
|
@ -31,18 +35,18 @@ def _detect_encoding(fp):
|
|||
bomDetection = bomDict.get((byte1, byte2, byte3, None))
|
||||
if not bomDetection:
|
||||
bomDetection = bomDict.get((byte1, byte2, None, None))
|
||||
## if BOM detected, we're done :-)
|
||||
# if BOM detected, we're done :-)
|
||||
fp.seek(oldFP)
|
||||
if bomDetection:
|
||||
return bomDetection
|
||||
encoding = 'latin-1'
|
||||
#more character detecting magick using http://chardet.feedparser.org/
|
||||
# more character detecting magick using http://chardet.feedparser.org/
|
||||
fp.seek(0)
|
||||
rawdata = fp.read()
|
||||
#if data can be decoded as utf-8 use that, try chardet otherwise
|
||||
#chardet detects utf-8 as ISO-8859-2 most of the time
|
||||
# if data can be decoded as utf-8 use that, try chardet otherwise
|
||||
# chardet detects utf-8 as ISO-8859-2 most of the time
|
||||
try:
|
||||
data = unicode(rawdata, 'utf-8')
|
||||
rawdata.decode('utf-8')
|
||||
encoding = 'utf-8'
|
||||
except:
|
||||
encoding = chardet.detect(rawdata)['encoding']
|
||||
|
|
@ -63,26 +67,30 @@ def load(filename, offset=0):
|
|||
def parse_time(t):
|
||||
return offset + ox.time2ms(t.replace(',', '.')) / 1000
|
||||
|
||||
with open(filename) as f:
|
||||
with open(filename, 'rb') as f:
|
||||
encoding = _detect_encoding(f)
|
||||
data = f.read()
|
||||
try:
|
||||
data = unicode(data, encoding)
|
||||
data = data.decode(encoding)
|
||||
except:
|
||||
try:
|
||||
data = unicode(data, 'latin-1')
|
||||
data = data.decode('latin-1')
|
||||
except:
|
||||
print("failed to detect encoding, giving up")
|
||||
return srt
|
||||
|
||||
data = data.replace('\r\n', '\n')
|
||||
srts = re.compile('(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n', re.DOTALL)
|
||||
if not data.endswith('\n\n'):
|
||||
data += '\n\n'
|
||||
regexp = r'(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n'
|
||||
srts = re.compile(regexp, re.DOTALL)
|
||||
i = 0
|
||||
for s in srts.findall(data):
|
||||
_s = {'id': str(i),
|
||||
'in': parse_time(s[0]),
|
||||
'out': parse_time(s[1]),
|
||||
'value': s[2].strip()
|
||||
_s = {
|
||||
'id': str(i),
|
||||
'in': parse_time(s[0]),
|
||||
'out': parse_time(s[1]),
|
||||
'value': s[2].strip()
|
||||
}
|
||||
srt.append(_s)
|
||||
i += 1
|
||||
|
|
|
|||
|
|
@ -5,20 +5,67 @@ import math
|
|||
import re
|
||||
import unicodedata
|
||||
|
||||
from six.moves import reduce
|
||||
|
||||
# Lower-case articles grouped by language and de-duplicated through set().
# Entries prefixed with "_" are deliberately disabled because they collide
# with ordinary words.
ARTICLES = list(set([
    # def sg, def pl, indef sg, indef pl (each m/f/n)
    'der', 'die', 'das', 'ein', 'eine',  # de
    'the', 'a', 'an',  # en
    'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas',  # es
    'le', "l'", 'la', 'les', 'un', 'une', 'des',  # fr
    # NOTE: the comma after "l'" was missing, which silently concatenated
    # the two literals into the bogus entry "l'la".
    'il', 'lo', "l'", 'la', '_i', 'gli', 'le',  # it
    'de', 'het', 'een',  # nl
    'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas'  # pt
    # some _disabled because of collisions
]))
|
||||
# every given name in 0xDB that matches Xxxx-yyyy Lastname
|
||||
ASIAN_FIRST_NAMES = [
|
||||
'a', 'ae', 'aeng', 'ah', 'ai', 'an', 'back', 'bae', 'ban', 'bang', 'bao',
|
||||
'beom', 'bi', 'bin', 'bo', 'bok', 'bon', 'bong', 'bu', 'bum', 'byeong',
|
||||
'byoung', 'byung', 'cai', 'chae', 'chan', 'chang', 'chao', 'cheal', 'chen',
|
||||
'cheng', 'cheol', 'cheon', 'cheong', 'cheul', 'chi', 'chia', 'chiao',
|
||||
'chieh', 'chien', 'chih', 'chin', 'ching', 'cho', 'choi', 'chong', 'choo',
|
||||
'chu', 'chuan', 'chuen', 'chul', 'chun', 'chung', 'chuo', 'chyi', 'da',
|
||||
'dae', 'dah', 'dal', 'dan', 'deok', 'do', 'dong', 'doo', 'duek', 'duk',
|
||||
'e', 'el', 'en', 'eui', 'eul', 'eun', 'eung', 'fai', 'fan', 'fang', 'fei',
|
||||
'fen', 'feng', 'fo', 'foo', 'fu', 'ga', 'gae', 'gam', 'gang', 'ge', 'gen',
|
||||
'geon', 'geun', 'gi', 'gil', 'gin', 'gnad', 'gok', 'goo', 'gook', 'gu',
|
||||
'gun', 'gwan', 'gye', 'gyeong', 'gyu', 'gyun', 'ha', 'hae', 'hak', 'han',
|
||||
'hang', 'hao', 'he', 'hee', 'heng', 'heon', 'hie', 'ho', 'hoi', 'hong',
|
||||
'hoo', 'hoon', 'hou', 'hsi', 'hsiang', 'hsiao', 'hsieh', 'hsien', 'hsin',
|
||||
'hsing', 'hsiung', 'hu', 'hua', 'huai', 'huang', 'hue', 'hui', 'hun',
|
||||
'hung', 'hwa', 'hwan', 'hwang', 'hye', 'hyeok', 'hyeon', 'hyeong', 'hyo',
|
||||
'hyuk', 'hyun', 'hyung', 'i', 'ik', 'il', 'in', 'ja', 'jae', 'jan', 'jang',
|
||||
'je', 'jee', 'jen', 'jeok', 'jeong', 'jeung', 'ji', 'jia', 'jian', 'jik',
|
||||
'jin', 'jing', 'jo', 'jong', 'joo', 'joon', 'ju', 'juan', 'jun', 'jung',
|
||||
'ka', 'kai', 'kam', 'kan', 'kang', 'kap', 'kar', 'ke', 'kee', 'kei',
|
||||
'keng', 'keum', 'keung', 'ki', 'kil', 'kin', 'kit', 'kot', 'ku', 'kua',
|
||||
'kuan', 'kuang', 'kuen', 'kun', 'kuo', 'kwang', 'kwok', 'kwon', 'kwong',
|
||||
'kyeong', 'kyo', 'kyoon', 'kyou', 'kyoung', 'kyu', 'kyun', 'kyung', 'lai',
|
||||
'lau', 'lee', 'lei', 'leng', 'leung', 'li', 'liang', 'lien', 'lin', 'ling',
|
||||
'lock', 'long', 'lun', 'lung', 'maeng', 'man', 'mei', 'mi', 'miao', 'min',
|
||||
'ming', 'mo', 'mok', 'moo', 'mook', 'moon', 'mu', 'mun', 'myeong',
|
||||
'myoeng', 'myong', 'myung', 'na', 'nae', 'nai', 'nam', 'nan', 'neung',
|
||||
'ngaru', 'ni', 'no', 'nyeo', 'oh', 'ok', 'ou', 'pai', 'pei', 'pen', 'peng',
|
||||
'pi', 'pil', 'pin', 'ping', 'po', 'pui', 'pyo', 'pyung', 'qing', 'qun',
|
||||
'ra', 'rak', 'ram', 'ran', 'reum', 'ri', 'rim', 'rin', 'roe', 'rok', 'ru',
|
||||
'rui', 'ryeon', 'ryol', 'ryong', 'sa', 'sae', 'san', 'sang', 'se', 'seo',
|
||||
'seob', 'seok', 'seol', 'seon', 'seong', 'seung', 'shan', 'shen', 'sheng',
|
||||
'shi', 'shia', 'shiang', 'shih', 'shik', 'shim', 'shin', 'shing', 'shou',
|
||||
'shu', 'shun', 'si', 'sik', 'sin', 'siu', 'so', 'song', 'soo', 'sook',
|
||||
'soon', 'su', 'suk', 'sun', 'sung', 'sup', 'szu', "t'ien", 'ta', 'tae',
|
||||
'taek', 'tai', 'tak', 'te', 'ti', 'tian', 'ting', 'to', 'toa', 'tsai',
|
||||
'tsan', 'tse', 'tso', 'tsui', 'tung', 'tzu', 'ua', 'ui', 'un', 'wah',
|
||||
'wai', 'wan', 'wei', 'wen', 'weon', 'wing', 'wit', 'wol', 'won', 'woo',
|
||||
'wook', 'woon', 'woong', 'wuk', 'xiao', 'ya', 'yan', 'yang', 'yao', 'ye',
|
||||
'yea', 'yee', 'yeh', 'yen', 'yeo', 'yeol', 'yeon', 'yeong', 'yeop', 'yi',
|
||||
'yin', 'ying', 'yiu', 'yoeng', 'yong', 'yoo', 'yoon', 'you', 'young', 'yu',
|
||||
'yuan', 'yue', 'yuen', 'yuk', 'yull', 'yun', 'yune', 'yung', 'zhi',
|
||||
'zhong', 'zhu'
|
||||
]
|
||||
# see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames
|
||||
# and http://en.wikipedia.org/wiki/List_of_Korean_family_names
|
||||
ASIAN_NAMES = [
|
||||
ASIAN_LAST_NAMES = [
|
||||
'chan', 'chang', 'chao',
|
||||
'chen', 'cheong', 'cheung',
|
||||
'chong', 'choo',
|
||||
|
|
@ -88,8 +135,8 @@ UA_REGEXPS = {
|
|||
'(Chimera)\/(\d+)',
|
||||
'(chromeframe)\/(\d+)',
|
||||
'(Edge)\/(\d+)',
|
||||
'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
|
||||
'(Chromium)\/(\d+)', # before Chrome
|
||||
'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
|
||||
'(Chromium)\/(\d+)', # before Chrome
|
||||
'(Chrome)\/(\d+)',
|
||||
'(FBForIPhone)',
|
||||
'(Firefox)\/(\d+)',
|
||||
|
|
@ -107,7 +154,7 @@ UA_REGEXPS = {
|
|||
'(OviBrowser)\/(\d+)',
|
||||
'Version\/(\d+).+(Safari)',
|
||||
'(WebKit)\/(\d+)',
|
||||
'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
|
||||
'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
|
||||
'(Trident)\/.*?rv:(\d+)',
|
||||
'(Gecko)',
|
||||
'(Mozilla)\/(3|4)'
|
||||
|
|
@ -117,7 +164,9 @@ UA_REGEXPS = {
|
|||
'(Google Web Preview).+Chrome\/(\d+)',
|
||||
'(Googlebot)\/(\d+)',
|
||||
'(WebCrawler)\/(\d+)',
|
||||
'(Yahoo! Slurp)\/(\d+)'
|
||||
'(Yahoo! Slurp)\/(\d+)',
|
||||
'(YandexBot)\/([\d\.]+)',
|
||||
'(YandexMobileBot)\/([\d\.]+)',
|
||||
],
|
||||
'system': [
|
||||
'(Android) (\d+)',
|
||||
|
|
@ -130,7 +179,7 @@ UA_REGEXPS = {
|
|||
'(BSD) (FreeBSD|NetBSD|OpenBSD)',
|
||||
'(CPU OS) (\d+)',
|
||||
'(iPhone OS) (\d+)',
|
||||
'(iPhone)', # Opera
|
||||
'(iPhone)', # Opera
|
||||
'(J2ME\/MIDP)',
|
||||
'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
|
||||
'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
|
||||
|
|
@ -155,12 +204,12 @@ UA_REGEXPS = {
|
|||
'(Windows) (NT \d\.\d)',
|
||||
'(Windows Phone) (\d+)',
|
||||
'(Windows Phone OS) (\d+)',
|
||||
'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
|
||||
'(Win) (9x 4\.90)', # Firefox
|
||||
'(Win)(16)', # Firefox
|
||||
'(Win)(9\d)', # Firefox
|
||||
'(Win)(NT)', # Firefox
|
||||
'(Win)(NT4\.0)', # Firefox
|
||||
'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
|
||||
'(Win) (9x 4\.90)', # Firefox
|
||||
'(Win)(16)', # Firefox
|
||||
'(Win)(9\d)', # Firefox
|
||||
'(Win)(NT)', # Firefox
|
||||
'(Win)(NT4\.0)', # Firefox
|
||||
'(X11)'
|
||||
]
|
||||
}
|
||||
|
|
@ -244,15 +293,41 @@ def get_sort_name(name):
|
|||
>>> get_sort_name('Scorsese, Martin')
|
||||
'Scorsese, Martin'
|
||||
"""
|
||||
if not ' ' in name or ', ' in name:
|
||||
if ' ' not in name or ', ' in name:
|
||||
return name
|
||||
if name.lower().startswith('the '):
|
||||
return get_sort_title(name)
|
||||
|
||||
def add_name():
|
||||
if len(first_names):
|
||||
last_names.insert(0, first_names.pop())
|
||||
|
||||
def find_name(names):
|
||||
return len(first_names) and first_names[-1].lower() in names
|
||||
|
||||
if is_asian_name(name):
|
||||
names = name.replace('-', ' ').split(' ')
|
||||
if len(names) == 2:
|
||||
if names[0].lower() in ASIAN_LAST_NAMES:
|
||||
lastname, firstname = names
|
||||
else:
|
||||
firstname, lastname = names
|
||||
else:
|
||||
names_ = name.split(' ')
|
||||
if '-' in names_[0]:
|
||||
lastname, firstname = [names[2], names[0] + '-' + names[1].lower()]
|
||||
elif '-' in names_[1]:
|
||||
lastname, firstname = [names[0], names[1] + '-' + names[2].lower()]
|
||||
elif names[0].lower() in ASIAN_FIRST_NAMES and names[2].lower() not in ASIAN_FIRST_NAMES:
|
||||
lastname, firstname = [names[2], names[0] + ' ' + names[1]]
|
||||
elif names[0].lower() not in ASIAN_FIRST_NAMES and names[2].lower() in ASIAN_FIRST_NAMES:
|
||||
lastname, firstname = [names[0], names[1] + ' ' + names[2]]
|
||||
elif names[0].lower() in ASIAN_LAST_NAMES:
|
||||
lastname, firstname = [names[0], names[1] + ' ' + names[2]]
|
||||
else:
|
||||
lastname, firstname = [names[2], names[0] + ' ' + names[1]]
|
||||
return lastname + ' ' + firstname
|
||||
|
||||
first_names = name.split(' ')
|
||||
last_names = []
|
||||
if re.search('^[0-9]+$', first_names[-1]):
|
||||
|
|
@ -269,7 +344,7 @@ def get_sort_name(name):
|
|||
add_name()
|
||||
name = ' '.join(last_names)
|
||||
if len(first_names):
|
||||
separator = ' ' if last_names[0].lower() in ASIAN_NAMES else ', '
|
||||
separator = ' ' if last_names[0].lower() in ASIAN_LAST_NAMES else ', '
|
||||
name += separator + ' '.join(first_names)
|
||||
return name
|
||||
|
||||
|
|
@ -299,8 +374,8 @@ def find_re(string, regexp):
|
|||
return result[0].strip()
|
||||
return ''
|
||||
|
||||
def find_string(string, string0='', string1 = ''):
|
||||
"""Return the string between string0 and string1.
|
||||
def find_string(string, string0='', string1=''):
|
||||
"""Return the string between string0 and string1.
|
||||
|
||||
If string0 or string1 is left out, begining or end of string is used.
|
||||
|
||||
|
|
@ -324,12 +399,23 @@ def find_string(string, string0='', string1 = ''):
|
|||
string1 = '$'
|
||||
return find_re(string, string0 + '(.*?)' + string1)
|
||||
|
||||
def is_asian_name(name):
    """Return True if name matches one of the known Asian name patterns.

    The name is lower-cased and split on spaces, with hyphens treated as
    spaces. Two shapes are accepted:

    * two parts and no hyphen in the original name, where one part is in
      ASIAN_FIRST_NAMES and the other is in ASIAN_LAST_NAMES (either order)
    * three parts, where the middle part and at least one of the outer
      parts are in ASIAN_FIRST_NAMES
    """
    parts = name.replace('-', ' ').lower().split(' ')
    count = len(parts)

    if count == 2 and '-' not in name:
        head, tail = parts
        if head in ASIAN_FIRST_NAMES and tail in ASIAN_LAST_NAMES:
            return True
        if head in ASIAN_LAST_NAMES and tail in ASIAN_FIRST_NAMES:
            return True

    if count == 3 and parts[1] in ASIAN_FIRST_NAMES:
        return parts[0] in ASIAN_FIRST_NAMES or parts[2] in ASIAN_FIRST_NAMES

    return False
|
||||
|
||||
def parse_useragent(useragent):
|
||||
data = {}
|
||||
for key in UA_REGEXPS:
|
||||
for alias, regexp in UA_ALIASES[key].items():
|
||||
alias = alias if key == 'browser' else alias + ' \\1'
|
||||
useragent = re.sub(regexp, alias, useragent)
|
||||
useragent = re.sub(regexp, alias, useragent)
|
||||
for regexp in UA_REGEXPS[key]:
|
||||
data[key] = {'name': '', 'version': '', 'string': ''}
|
||||
match = re.compile(regexp).search(useragent)
|
||||
|
|
@ -352,7 +438,7 @@ def parse_useragent(useragent):
|
|||
'version': version,
|
||||
'string': string
|
||||
}
|
||||
break;
|
||||
break
|
||||
return data
|
||||
|
||||
def remove_special_characters(text):
|
||||
|
|
@ -373,14 +459,17 @@ def wrap(text, width):
|
|||
the text. Expects that existing line breaks are posix newlines (\n).
|
||||
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
|
||||
"""
|
||||
return reduce(lambda line, word, width=width: '%s%s%s' %
|
||||
(line,
|
||||
' \n'[(len(line[line.rfind('\n')+1:])
|
||||
+ len(word.split('\n',1)[0]
|
||||
) >= width)],
|
||||
word),
|
||||
text.split(' ')
|
||||
)
|
||||
|
||||
def reduce_line(line, word):
|
||||
return '%s%s%s' % (
|
||||
line,
|
||||
' \n'[
|
||||
(len(line[line.rfind('\n')+1:]) + len(word.split('\n', 1)[0]) >= width)
|
||||
],
|
||||
word
|
||||
)
|
||||
|
||||
return reduce(reduce_line, text.split(' '))
|
||||
|
||||
def wrap_string(string, length=80, separator='\n', balance=False):
|
||||
'''
|
||||
|
|
@ -404,7 +493,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
|
|||
for word in words:
|
||||
if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
|
||||
# word fits in current line
|
||||
lines[len(lines) - 1] += word + u' ';
|
||||
lines[len(lines) - 1] += word + u' '
|
||||
else:
|
||||
if len(word) <= length:
|
||||
# word fits in next line
|
||||
|
|
@ -414,7 +503,7 @@ def wrap_string(string, length=80, separator='\n', balance=False):
|
|||
position = length - len(lines[len(lines) - 1])
|
||||
lines[len(lines) - 1] += word[0:position]
|
||||
for i in range(position, len(word), length):
|
||||
lines.append(word[i:i+length]);
|
||||
lines.append(word[i:i+length])
|
||||
lines[len(lines) - 1] += u' '
|
||||
return separator.join(lines).strip()
|
||||
|
||||
|
|
@ -425,7 +514,7 @@ def truncate_string(string, length, padding='...', position='right'):
|
|||
# 'anticon...lement'
|
||||
# >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
|
||||
# 'anticonstitut...'
|
||||
stringLength = len(string);
|
||||
stringLength = len(string)
|
||||
paddingLength = len(padding)
|
||||
if stringLength > length:
|
||||
if position == 'left':
|
||||
|
|
@ -436,7 +525,7 @@ def truncate_string(string, length, padding='...', position='right'):
|
|||
string = '%s%s%s' % (string[:left], padding, string[right:])
|
||||
elif position == 'right':
|
||||
string = '%s%s' % (string[:length - paddingLength], padding)
|
||||
return string;
|
||||
return string
|
||||
|
||||
def truncate_words(s, num):
|
||||
"""Truncates a string after a certain number of chacters, but ends with a word
|
||||
|
|
@ -473,7 +562,7 @@ def trim_string(string, num):
|
|||
def get_valid_filename(s):
|
||||
"""
|
||||
Returns the given string converted to a string that can be used for a clean
|
||||
filename. Specifically, leading and trailing spaces are removed;
|
||||
filename. Specifically, leading and trailing spaces are removed;
|
||||
all non-filename-safe characters are removed.
|
||||
|
||||
>>> get_valid_filename("john's portrait in 2004.jpg")
|
||||
|
|
@ -498,9 +587,11 @@ def get_text_list(list_, last_word='or'):
|
|||
>>> get_text_list([])
|
||||
''
|
||||
"""
|
||||
if len(list_) == 0: return ''
|
||||
if len(list_) == 1: return list_[0]
|
||||
return u'%s %s %s' % (u', '.join([unicode(i) for i in list_][:-1]), last_word, list_[-1])
|
||||
if len(list_) == 0:
|
||||
return ''
|
||||
if len(list_) == 1:
|
||||
return list_[0]
|
||||
return u'%s %s %s' % (u', '.join([i for i in list_][:-1]), last_word, list_[-1])
|
||||
|
||||
def get_list_text(text, last_word='or'):
|
||||
"""
|
||||
|
|
@ -519,7 +610,7 @@ def get_list_text(text, last_word='or'):
|
|||
if text:
|
||||
list_ = text.split(u', ')
|
||||
if list_:
|
||||
i=len(list_)-1
|
||||
i = len(list_)-1
|
||||
last = list_[i].split(last_word)
|
||||
if len(last) == 2:
|
||||
list_[i] = last[0].strip()
|
||||
|
|
@ -531,11 +622,11 @@ def normalize_newlines(text):
|
|||
|
||||
def recapitalize(text):
    "Recapitalizes text, placing caps after end-of-sentence punctuation."

    def upper_first(match):
        # group(1) is the single lower-case letter that starts a sentence
        return match.group(1).upper()

    # A sentence start is the very beginning of the text, or a position
    # right after ".", "?" or "!" followed by one space.
    sentence_start = r'(?:^|(?<=[\.\?\!] ))([a-z])'
    return re.sub(sentence_start, upper_first, text.lower())
|
||||
|
|
@ -543,22 +634,28 @@ def recapitalize(text):
|
|||
def phone2numeric(phone):
    "Converts a phone number with letters into its numeric equivalent."
    # Keypad groups; "q" and "z" are intentionally absent, matching the
    # letter ranges of the regular expression below.
    keypad = (
        ('2', 'abc'), ('3', 'def'), ('4', 'ghi'), ('5', 'jkl'),
        ('6', 'mno'), ('7', 'prs'), ('8', 'tuv'), ('9', 'wxy'),
    )
    digit_for = dict(
        (letter, digit) for digit, group in keypad for letter in group
    )

    def to_digit(match):
        return digit_for.get(match.group(0).lower())

    return re.sub(r'[A-PR-Y]', to_digit, phone, flags=re.I)
|
||||
|
||||
def compress_string(s):
    """Return s (bytes) gzip-compressed at compression level 6.

    The compressed stream is built in memory and returned as bytes.
    """
    import gzip
    from io import BytesIO

    zbuf = BytesIO()
    # The context manager closes the gzip stream (which writes the trailer)
    # even if write() raises.
    with gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) as zfile:
        zfile.write(s)
    return zbuf.getvalue()
|
||||
|
||||
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
|
||||
|
||||
def smart_split(text):
|
||||
"""
|
||||
Generator that splits a string by spaces, leaving quoted phrases together.
|
||||
|
|
@ -582,17 +679,17 @@ def words(text):
|
|||
returns words in text, removing punctuation
|
||||
"""
|
||||
text = text.split()
|
||||
return map(lambda x: re.sub("(([.!?:-_]|'s)$)", '', x), text)
|
||||
return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text]
|
||||
|
||||
def sort_string(string):
    """Return a normalized form of string for use as a sort key.

    Replaces the ligatures/letters Æ, Ø and Þ with ASCII equivalents,
    removes thousands separators between digits ("1,000" -> "1000"),
    zero-pads every run of digits to 10 places and finally applies
    NFKD unicode normalization.
    """
    string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th')

    # pad numbered titles
    # (raw strings: "\d" in a plain literal is an invalid escape sequence)
    string = re.sub(r'(\d),(\d{3})', r'\1\2', string)
    string = re.sub(r'(\d+)', lambda x: '%010d' % int(x.group(0)), string)
    return unicodedata.normalize('NFKD', string)
|
||||
|
||||
def sorted_strings(strings, key=None):
    """Return strings as a new sorted list.

    key: optional sort-key function; when it is missing (or falsy),
         sort_string is used.
    """
    return sorted(strings, key=key or sort_string)
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@ else:
|
|||
|
||||
__all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size']
|
||||
|
||||
def create_torrent(file, url, params=None, flag=None,
                   progress=lambda x: None, progress_percent=1):
    """Creates a torrent for a given file, using url as tracker url

    params and flag default to a fresh dict and a fresh Event per call;
    the previous `{}` / `Event()` defaults were created once at import
    time and shared between all calls. All arguments are passed through
    to make_meta_file unchanged.
    """
    from .makemetafile import make_meta_file
    if params is None:
        params = {}
    if flag is None:
        flag = Event()
    return make_meta_file(file, url, params, flag, progress, progress_percent)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman
|
||||
# see LICENSE.txt for license information
|
||||
from __future__ import print_function
|
||||
|
||||
from types import IntType, LongType, StringType, ListType, TupleType, DictType
|
||||
try:
|
||||
|
|
@ -53,8 +54,8 @@ def decode_dict(x, f):
|
|||
lastkey = None
|
||||
while x[f] != 'e':
|
||||
k, f = decode_string(x, f)
|
||||
#why is this needed
|
||||
#if lastkey >= k:
|
||||
# why is this needed
|
||||
# if lastkey >= k:
|
||||
# raise ValueError
|
||||
lastkey = k
|
||||
r[k], f = decode_func[x[f]](x, f)
|
||||
|
|
@ -81,9 +82,9 @@ def bdecode(x, sloppy = 1):
|
|||
r, l = decode_func[x[0]](x, 0)
|
||||
# except (IndexError, KeyError):
|
||||
except (IndexError, KeyError, ValueError):
|
||||
raise ValueError, "bad bencoded data"
|
||||
raise ValueError("bad bencoded data")
|
||||
if not sloppy and l != len(x):
|
||||
raise ValueError, "bad bencoded data"
|
||||
raise ValueError("bad bencoded data")
|
||||
return r
|
||||
|
||||
def test_bdecode():
|
||||
|
|
@ -102,10 +103,10 @@ def test_bdecode():
|
|||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
assert bdecode('i4e') == 4L
|
||||
assert bdecode('i0e') == 0L
|
||||
assert bdecode('i123456789e') == 123456789L
|
||||
assert bdecode('i-10e') == -10L
|
||||
assert bdecode('i4e') == 4
|
||||
assert bdecode('i0e') == 0
|
||||
assert bdecode('i123456789e') == 123456789
|
||||
assert bdecode('i-10e') == -10
|
||||
try:
|
||||
bdecode('i-0e')
|
||||
assert 0
|
||||
|
|
@ -287,7 +288,7 @@ def bencode(x):
|
|||
try:
|
||||
encode_func[type(x)](x, r)
|
||||
except:
|
||||
print "*** error *** could not encode type %s (value: %s)" % (type(x), x)
|
||||
print("*** error *** could not encode type %s (value: %s)" % (type(x), x))
|
||||
assert 0
|
||||
return ''.join(r)
|
||||
|
||||
|
|
@ -295,7 +296,7 @@ def test_bencode():
|
|||
assert bencode(4) == 'i4e'
|
||||
assert bencode(0) == 'i0e'
|
||||
assert bencode(-10) == 'i-10e'
|
||||
assert bencode(12345678901234567890L) == 'i12345678901234567890e'
|
||||
assert bencode(12345678901234567890) == 'i12345678901234567890e'
|
||||
assert bencode('') == '0:'
|
||||
assert bencode('abc') == '3:abc'
|
||||
assert bencode('1234567890') == '10:1234567890'
|
||||
|
|
|
|||
|
|
@ -4,139 +4,151 @@
|
|||
#
|
||||
##
|
||||
|
||||
def _decode_int(data):
|
||||
"""
|
||||
decode integer from bytearray
|
||||
return int, remaining data
|
||||
"""
|
||||
data = data[1:]
|
||||
end = data.index(b'e')
|
||||
return int(data[:end],10), data[end+1:]
|
||||
class Decoder(object):
    """Index-based bencode decoder.

    decode() stores the input in self.data and walks it with the cursor
    self.idx; each _decode_* helper reads one value starting at the cursor
    and leaves the cursor on the first byte after that value.
    The element comparisons against ord(...) expect indexing to yield ints
    (bytes/bytearray on Python 3).
    """

    def _decode_int(self):
        """
        decode integer from bytearray
        return int
        """
        # skip the leading b'i'
        self.idx += 1
        start = self.idx
        end = self.data.index(b'e', self.idx)
        self.idx = end + 1
        return int(self.data[start:end])

    def _decode_str(self):
        """
        decode string from bytearray
        return string (or the raw bytes if they are not valid utf-8)
        """
        start = self.data.index(b':', self.idx)
        length = int(self.data[self.idx:start].decode(), 10)
        if length < 0:
            raise Exception('invalid string size: %d' % length)
        start += 1
        ret = self.data[start:start+length]
        try:
            ret = ret.decode('utf-8')
        except UnicodeDecodeError:
            # binary payload: keep the undecoded bytes
            pass
        self.idx = start + length
        return ret

    def _decode_list(self):
        """
        decode list from bytearray
        return list
        """
        ls = []
        # skip the leading b'l'
        self.idx += 1
        while self.data[self.idx] != ord(b'e'):
            ls.append(self._decode())
        # skip the closing b'e'
        self.idx += 1
        return ls

    def _decode_dict(self):
        """
        decode dict from bytearray
        return dict
        """
        d = {}
        # skip the leading b'd'
        self.idx += 1
        while self.data[self.idx] != ord(b'e'):
            k = self._decode_str()
            v = self._decode()
            d[k] = v
        # skip the closing b'e'
        self.idx += 1
        return d

    def _decode(self):
        """
        decode the value starting at the cursor, dispatching on its
        first byte
        return decoded object
        """
        ch = chr(self.data[self.idx])
        if ch == 'l':
            return self._decode_list()
        elif ch == 'i':
            return self._decode_int()
        elif ch == 'd':
            return self._decode_dict()
        elif ch.isdigit():
            return self._decode_str()
        else:
            # was `% data`, an undefined name that raised NameError here
            raise Exception('could not decode data: %s' % self.data)

    def decode(self, data):
        """
        decode a complete bencoded value
        return decoded object, raise Exception if trailing data is left
        """
        self.idx = 0
        self.data = data
        obj = self._decode()
        if len(data) != self.idx:
            raise Exception('failed to decode, extra data: %s' % data)
        return obj
|
||||
|
||||
def bdecode(data):
    """
    decode a bytearray
    return decoded object
    """
    decoder = Decoder()
    return decoder.decode(data)
|
||||
|
||||
def _encode_str(s, buff):
    """
    encode string to a buffer

    s must be bytes-like; the length prefix and the payload are appended
    to buff as two separate chunks
    """
    s = bytearray(s)
    buff.append(bytearray(str(len(s)) + ':', 'utf-8'))
    buff.append(s)


def _encode_int(i, buff):
    """
    encode integer to a buffer
    """
    buff.append(b'i')
    buff.append(bytearray(str(i), 'ascii'))
    buff.append(b'e')


def _encode_list(l, buff):
    """
    encode list of elements to a buffer
    """
    buff.append(b'l')
    for i in l:
        _encode(i, buff)
    buff.append(b'e')


def _encode_dict(d, buff):
    """
    encode dict, keys in sorted order

    keys that are neither bytes nor str are written as str(key)
    """
    buff.append(b'd')
    for k in sorted(d):
        # Keep the original key for the lookup: rebinding k to str(k)
        # made d[k] raise KeyError for non-string keys.
        name = k if isinstance(k, (bytes, str)) else str(k)
        _encode(name, buff)
        _encode(d[k], buff)
    buff.append(b'e')


def _encode(obj, buff):
    """
    encode element obj to a buffer buff

    raises Exception for objects that can not be serialized
    """
    if isinstance(obj, str):
        _encode_str(bytearray(obj, 'utf-8'), buff)
    elif isinstance(obj, bytes):
        _encode_str(bytearray(obj), buff)
    elif isinstance(obj, bytearray):
        _encode_str(obj, buff)
    elif isinstance(obj, bool):
        # bool is a subclass of int: it has to be handled before the int
        # branch, otherwise True would be written as b'iTruee'
        _encode_int(1 if obj else 0, buff)
    elif isinstance(obj, int):
        _encode_int(obj, buff)
    elif str(obj).isdigit():
        # int-like objects whose string form is all digits
        _encode_int(obj, buff)
    elif isinstance(obj, list):
        _encode_list(obj, buff)
    elif hasattr(obj, 'keys') and hasattr(obj, 'values'):
        _encode_dict(obj, buff)
    elif str(obj) in ['True', 'False']:
        # bool-like objects that are not instances of bool
        _encode_int(int(obj and '1' or '0'), buff)
    else:
        raise Exception('non serializable object: %s [%s]' % (obj, type(obj)))


def bencode(obj):
    """
    bencode element, return bytes
    """
    buff = []
    _encode(obj, buff)
    # buff holds a mix of bytes and bytearray chunks
    return bytes(bytearray().join(buff))
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
import codecs
|
||||
|
||||
import ox
|
||||
|
||||
from . import srt
|
||||
|
||||
def _webvtt_timecode(t):
    """Format t as a timecode string via ox.format_duration.

    t is multiplied by 1000 before formatting (presumably seconds to
    milliseconds -- confirm against ox.format_duration); the years
    component is disabled.
    """
    scaled = t * 1000
    return ox.format_duration(scaled, years=False)
|
||||
|
|
@ -30,3 +30,13 @@ def encode(data, webvtt=False):
|
|||
)
|
||||
|
||||
return codecs.BOM_UTF8 + srt.encode('utf-8')
|
||||
|
||||
def load(filename, offset=0):
    '''Parses a vtt file by delegating to the srt loader

    filename: path to a vtt file
    offset (float, seconds): shift all in/out points by offset

    Returns list with dicts that have in, out, value and id
    '''
    cues = srt.load(filename, offset)
    return cues
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ from __future__ import print_function
|
|||
import json
|
||||
import re
|
||||
|
||||
from six import text_type
|
||||
from ox.cache import read_url
|
||||
|
||||
HEADERS = {
|
||||
|
|
@ -16,9 +17,9 @@ USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
|
|||
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
|
||||
|
||||
def get_movie_data(title, director):
|
||||
if isinstance(title, unicode):
|
||||
if isinstance(title, text_type):
|
||||
title = title.encode('utf-8')
|
||||
if isinstance(director, unicode):
|
||||
if isinstance(director, text_type):
|
||||
director = director.encode('utf-8')
|
||||
data = {}
|
||||
# itunes section (preferred source for link)
|
||||
|
|
@ -45,7 +46,7 @@ def get_movie_data(title, director):
|
|||
results = js['results']
|
||||
if results:
|
||||
url = host + results[0]['location']
|
||||
if not 'link' in data:
|
||||
if 'link' not in data:
|
||||
data['link'] = url
|
||||
headers = {
|
||||
'User-Agent': USER_AGENT
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ def get(key):
|
|||
if key in auth:
|
||||
return auth[key]
|
||||
print("please add key %s to json file '%s'" % (key, user_auth))
|
||||
raise Exception,"no key %s found" % key
|
||||
raise Exception("no key %s found" % key)
|
||||
|
||||
def update(key, value):
|
||||
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
|
||||
|
|
@ -31,4 +31,3 @@ def update(key, value):
|
|||
f = open(user_auth, "w")
|
||||
f.write(json.dumps(auth, indent=2))
|
||||
f.close()
|
||||
|
||||
|
|
|
|||
|
|
@ -8,13 +8,13 @@ from ox.cache import read_url
|
|||
from ox.html import strip_tags, decode_html
|
||||
from ox.text import find_re
|
||||
|
||||
import imdb
|
||||
from . import imdb
|
||||
|
||||
def get_id(url):
    """Return the part of url after its last "/" (all of url if it has none)."""
    return url.rsplit("/", 1)[-1]
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.criterion.com/films/%s" % id
|
||||
return "https://www.criterion.com/films/%s" % id
|
||||
|
||||
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
||||
'''
|
||||
|
|
@ -28,23 +28,34 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
|||
u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
|
||||
'''
|
||||
data = {
|
||||
"id": id,
|
||||
"url": get_url(id)
|
||||
}
|
||||
try:
|
||||
html = read_url(data["url"], timeout=timeout, unicode=True)
|
||||
except:
|
||||
html = ox.cache.read_url(data["url"], timeout=timeout)
|
||||
data["number"] = find_re(html, "<li>Spine #(\d+)")
|
||||
html = read_url(data["url"], timeout=timeout).decode('utf-8', 'ignore')
|
||||
|
||||
data["title"] = decode_html(find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>"))
|
||||
data["number"] = find_re(html, "<b>Spine #(\d+)")
|
||||
|
||||
data["title"] = decode_html(find_re(html, "<h1 class=\"header__primarytitle\".*?>(.*?)</h1>"))
|
||||
data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip()
|
||||
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
|
||||
results = find_re(html, '<div class="left_column">(.*?)</div>')
|
||||
results = re.compile("<li>(.*?)</li>").findall(results)
|
||||
data["country"] = results[0]
|
||||
data["year"] = results[1]
|
||||
results = find_re(html, '<ul class="film-meta-list">(.*?)</ul>')
|
||||
info = re.compile('<li itemprop="(.*?)".*?>(.*?)</li>', re.DOTALL).findall(results)
|
||||
info = {k: strip_tags(v).strip() for k, v in info}
|
||||
if 'director' in info:
|
||||
data['director'] = info['director']
|
||||
if 'countryOfOrigin' in info:
|
||||
data['country'] = [c.strip() for c in decode_html(info['countryOfOrigin']).split(', ')]
|
||||
if 'inLanguage' in info:
|
||||
data['language'] = [l.strip() for l in decode_html(info['inLanguage']).split(', ')]
|
||||
for v in re.compile('<li>(.*?)</li>', re.DOTALL).findall(results):
|
||||
if 'datePublished' in v:
|
||||
data['year'] = strip_tags(v).strip()
|
||||
elif 'duration' in v:
|
||||
data['duration'] = strip_tags(v).strip()
|
||||
data["synopsis"] = decode_html(strip_tags(find_re(html,
|
||||
"<div class=\"content_block last\">.*?<p>(.*?)</p>")))
|
||||
"<div class=\"product-summary\".*?>.*?<p>(.*?)</p>")))
|
||||
|
||||
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
|
||||
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
|
||||
|
|
@ -56,47 +67,46 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
|||
data["posters"] = [result]
|
||||
else:
|
||||
html_ = read_url(result, unicode=True)
|
||||
result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
|
||||
result = find_re(html_, '//www.criterion.com/films/%s.*?">(.*?)</a>' % id)
|
||||
result = find_re(result, "src=\"(.*?)\"")
|
||||
if result:
|
||||
data["posters"] = [result.replace("_w100", "")]
|
||||
else:
|
||||
data["posters"] = []
|
||||
data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
|
||||
data['posters'] = [p for p in data['posters'] if p]
|
||||
|
||||
posters = find_re(html, '<div class="product-box-art".*?>(.*?)</div>')
|
||||
for poster in re.compile('<img src="(.*?)"').findall(posters):
|
||||
data['posters'].append(poster)
|
||||
|
||||
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
|
||||
if result:
|
||||
data["stills"] = [result]
|
||||
data["trailers"] = []
|
||||
else:
|
||||
data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
|
||||
data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
|
||||
data["stills"] = list(filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")]))
|
||||
data["trailers"] = list(filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")]))
|
||||
|
||||
if timeout == ox.cache.cache_timeout:
|
||||
timeout = -1
|
||||
if get_imdb:
|
||||
if get_imdb and 'title' in data and 'director' in data:
|
||||
# removed year, as "title (year)" may fail to match
|
||||
data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
|
||||
return data
|
||||
|
||||
def get_ids(page=None):
|
||||
ids = []
|
||||
if page:
|
||||
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
|
||||
html = read_url(url)
|
||||
results = re.compile("films/(\d+)").findall(html)
|
||||
html = read_url("https://www.criterion.com/shop/browse/list?sort=spine_number", unicode=True)
|
||||
results = re.compile("films/(\d+)-").findall(html)
|
||||
ids += results
|
||||
results = re.compile("boxsets/(.*?)\"").findall(html)
|
||||
for result in results:
|
||||
html = read_url("https://www.criterion.com/boxsets/" + result, unicode=True)
|
||||
results = re.compile("films/(\d+)-").findall(html)
|
||||
ids += results
|
||||
results = re.compile("boxsets/(.*?)\"").findall(html)
|
||||
for result in results:
|
||||
html = read_url("http://www.criterion.com/boxsets/" + result)
|
||||
results = re.compile("films/(\d+)").findall(html)
|
||||
ids += results
|
||||
return set(ids)
|
||||
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
|
||||
results = re.compile("\&p=(\d+)\&").findall(html)
|
||||
pages = max(map(int, results))
|
||||
for page in range(1, pages):
|
||||
ids += get_ids(page)
|
||||
return sorted(set(ids), key=int)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(get_ids())
|
||||
|
|
|
|||
|
|
@ -1,21 +1,21 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from six.moves.urllib.parse import unquote
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def get_video_url(url):
    '''
    Return the first video url found in the page at url, or '' if there is none.

    The page is searched for 'video", "<value>"' entries; the first value is
    unquoted and cut at the first '@@'.

    >>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
    'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'

    >>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
    'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
    '''
    # The pattern below is a text pattern; ask for decoded text so it is not
    # matched against bytes (a TypeError on Python 3).
    data = read_url(url, unicode=True)
    video = re.compile('''video", "(.*?)"''').findall(data)
    if video:
        return unquote(video[0]).split('@@')[0]
    return ''
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from six.moves.urllib.parse import unquote
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def get_video_url(url):
    '''
    Return the first video url found in the page at url, or '' if there is none.

    The page is searched for 'video", "<value>"' entries; the first value is
    unquoted and cut at the first '@@'.

    >>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
    'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'

    >>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
    'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
    '''
    # The pattern below is a text pattern; ask for decoded text so it is not
    # matched against bytes (a TypeError on Python 3).
    data = read_url(url, unicode=True)
    video = re.compile('''video", "(.*?)"''').findall(data)
    if video:
        return unquote(video[0]).split('@@')[0]
    return ''
|
||||
|
|
|
|||
|
|
@ -6,17 +6,25 @@ from six.moves import urllib
|
|||
import ox
|
||||
from ox import strip_tags, decode_html
|
||||
from ox.cache import read_url
|
||||
import lxml.html
|
||||
|
||||
|
||||
def find(query, timeout=ox.cache.cache_timeout):
|
||||
"""
|
||||
Returns tuples with title, url, description
|
||||
"""
|
||||
if not isinstance(query, bytes):
|
||||
query = query.encode('utf-8')
|
||||
params = urllib.parse.urlencode({'q': query})
|
||||
url = 'http://duckduckgo.com/html/?' + params
|
||||
data = read_url(url, timeout=timeout).decode('utf-8')
|
||||
doc = lxml.html.document_fromstring(data)
|
||||
results = []
|
||||
regex = '<a .*?class="large" href="(.+?)">(.*?)</a>.*?<div class="snippet">(.*?)</div>'
|
||||
for r in re.compile(regex, re.DOTALL).findall(data):
|
||||
results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
|
||||
for e in doc.xpath("//a[contains(@class, 'result__a')]"):
|
||||
url = e.attrib['href']
|
||||
if 'uddg=' in url:
|
||||
url = urllib.parse.unquote(url.split('&uddg=')[-1])
|
||||
title = e.text_content()
|
||||
description = ''
|
||||
results.append((title, url, description))
|
||||
return results
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import time
|
|||
from ox import strip_tags, find_re
|
||||
from ox.cache import read_url
|
||||
|
||||
import google
|
||||
from . import google
|
||||
|
||||
|
||||
def get_show_url(title):
|
||||
|
|
|
|||
|
|
@ -21,11 +21,11 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
|
|||
"""
|
||||
Return max_results tuples with title, url, description
|
||||
|
||||
>>> find("The Matrix site:imdb.com", 1)[0][0]
|
||||
u'The Matrix (1999) - IMDb'
|
||||
>>> str(find("The Matrix site:imdb.com", 1)[0][0])
|
||||
'The Matrix (1999) - IMDb'
|
||||
|
||||
>>> find("The Matrix site:imdb.com", 1)[0][1]
|
||||
u'http://www.imdb.com/title/tt0133093/'
|
||||
>>> str(find("The Matrix site:imdb.com", 1)[0][1])
|
||||
'http://www.imdb.com/title/tt0133093/'
|
||||
"""
|
||||
results = []
|
||||
offset = 0
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import time
|
|||
import unicodedata
|
||||
|
||||
from six.moves.urllib.parse import urlencode
|
||||
from six import string_types
|
||||
from six import text_type, string_types
|
||||
|
||||
from .. import find_re, strip_tags, decode_html
|
||||
from .. import cache
|
||||
|
|
@ -18,22 +18,95 @@ from . import duckduckgo
|
|||
from ..utils import datetime
|
||||
from ..geo import normalize_country_name
|
||||
|
||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||
|
||||
def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
    '''
    Return the tuple (url, data, headers, timeout, unicode) for a request.

    headers is a copy of the given headers with a fixed 'X-Forwarded-For'
    entry set; the other values are passed through unchanged. valid is
    accepted but not used here.
    '''
    request_headers = headers.copy()
    # fixed forwarded address, background on IMDb's automatic geo-location:
    # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
    request_headers['X-Forwarded-For'] = '72.21.206.80'
    prepared = (url, data, request_headers, timeout, unicode)
    return prepared
|
||||
|
||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
    '''
    Read url via cache.read_url, using the arguments returned by prepare_url.
    '''
    prepared = prepare_url(url, data, headers, timeout, valid, unicode)
    target, payload, request_headers, wait, as_unicode = prepared
    return cache.read_url(target, payload, request_headers, wait, unicode=as_unicode)
|
||||
|
||||
def delete_url(url, data=None, headers=cache.DEFAULT_HEADERS):
    '''
    Call cache.store.delete with the url, data and headers returned by prepare_url.
    '''
    prepared = prepare_url(url, data, headers)
    # timeout and unicode flag are part of the prepared tuple but not needed here
    target, payload, request_headers, _timeout, _unicode = prepared
    cache.store.delete(target, payload, request_headers)
|
||||
|
||||
def get_url(id):
    '''
    Return the www.imdb.com title url with id inserted after "tt".
    '''
    title_url = "http://www.imdb.com/title/tt%s/"
    return title_url % id
|
||||
|
||||
|
||||
def reference_section(id):
    '''
    Return 'list' conditions for a named section of the 'reference' page.

    The first pattern matches the <h4> whose name and id attributes are both
    id and captures the contents of the next <table>; the second captures
    the text of links whose href starts with /name/.
    '''
    section_pattern = '<h4 name="{id}" id="{id}".*?<table(.*?)</table>'.format(id=id)
    name_pattern = '<a href="/name/.*?>(.*?)</a>'
    return {
        'page': 'reference',
        're': [section_pattern, name_pattern],
        'type': 'list'
    }
|
||||
|
||||
|
||||
def zebra_list(label, more=None):
    '''
    Return 'list' conditions for a labelled <ul> on the 'reference' page.

    The first pattern finds the '_label' cell containing label and captures
    the contents of the following <ul>; the second captures each <li>.
    Entries of more, if given, are appended to the pattern list.
    '''
    patterns = [
        '_label">' + label + '</td>.*?<ul(.*?)</ul>',
        '<li.*?>(.*?)</li>',
    ]
    if more:
        patterns += more
    return {'page': 'reference', 're': patterns, 'type': 'list'}
|
||||
|
||||
def zebra_table(label, more=None, type='string'):
    '''
    Return conditions for a labelled table cell on the 'reference' page.

    The single base pattern finds the '_label' cell containing label and
    captures the contents of the following <td>. Entries of more, if given,
    are appended to the pattern list; type is stored as the result type.
    '''
    patterns = ['_label">' + label + '</td>.*?<td>(.*?)</td>']
    if more:
        patterns += more
    return {'page': 'reference', 're': patterns, 'type': type}
|
||||
|
||||
def parse_aspectratio(value):
    '''
    Normalize an aspect ratio string to a decimal string.

    A "W:H" value becomes str(W / H); only the text up to the first space of
    H is used, and a zero H yields str(float(W)). If W or H is not a number,
    a message is printed and value is returned unchanged.
    A value without ":" is stripped and reduced to its first two
    dot-separated parts ("1.33.1" -> "1.33").
    '''
    if ':' in value:
        parts = value.split(':')
        numerator = parts[0]
        denominator = parts[1].strip().split(' ')[0]
        try:
            if float(denominator):
                value = str(float(numerator) / float(denominator))
            else:
                # avoid dividing by zero, keep the width on its own
                value = str(float(numerator))
        except ValueError:
            # only number parsing is expected to fail; anything else propagates
            print('failed to parse aspect: %s' % value)
    else:
        value = '.'.join(value.strip().split('.')[:2])
    return value
|
||||
|
||||
'''
|
||||
'posterIds': {
|
||||
'page': 'posters',
|
||||
're': '/unknown-thumbnail/media/rm(.*?)/tt',
|
||||
'type': 'list'
|
||||
},
|
||||
'''
|
||||
|
||||
class Imdb(SiteParser):
|
||||
'''
|
||||
>>> Imdb('0068646')['title']
|
||||
u'The Godfather'
|
||||
>>> Imdb('0068646')['title'] == text_type(u'The Godfather')
|
||||
True
|
||||
|
||||
>>> Imdb('0133093')['title']
|
||||
u'The Matrix'
|
||||
>>> Imdb('0133093')['title'] == text_type(u'The Matrix')
|
||||
True
|
||||
'''
|
||||
regex = {
|
||||
regex = {
|
||||
'alternativeTitles': {
|
||||
'page': 'releaseinfo',
|
||||
're': [
|
||||
|
|
@ -41,98 +114,49 @@ class Imdb(SiteParser):
|
|||
"td>(.*?)</td>.*?<td>(.*?)</td>"
|
||||
],
|
||||
'type': 'list'
|
||||
|
||||
},
|
||||
'aspectratio': {
|
||||
'page': 'combined',
|
||||
're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'Aspect Ratio</td>.*?ipl-inline-list__item">\s+([\d\.\:\ ]+)',
|
||||
parse_aspectratio,
|
||||
],
|
||||
'type': 'float',
|
||||
},
|
||||
'budget': {
|
||||
'page': 'business',
|
||||
're': [
|
||||
'<h5>Budget</h5>\s*?\$(.*?)<br',
|
||||
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'budget': zebra_table('Budget', more=[
|
||||
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
||||
], type='int'),
|
||||
'cast': {
|
||||
'page': 'combined',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
|
||||
lambda ll: [strip_tags(l) for l in ll]
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'cinematographer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Cinematography by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
' <table class="cast_list">(.*?)</table>',
|
||||
'<td.*?itemprop="actor".*?>.*?>(.*?)</a>.*?<td class="character">(.*?)</td>',
|
||||
lambda ll: [strip_tags(l) for l in ll] if isinstance(ll, list) else strip_tags(ll)
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'cinematographer': reference_section('cinematographers'),
|
||||
'connections': {
|
||||
'page': 'movieconnections',
|
||||
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
|
||||
'type': 'list'
|
||||
},
|
||||
'country': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<div class="info"><h5>Country:</h5>.*?<div class="info">',
|
||||
#'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
|
||||
'<a.*?>(.*?)</a>',
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
|
||||
'creator': {
|
||||
'page': 'combined',
|
||||
'page': '',
|
||||
're': [
|
||||
'<h5>Creator.?:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'director': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('<b>Series Crew</b>')[0],
|
||||
'Directed by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'_director': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'editor': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Film Editing by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'composer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Original Music by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
'<div class="credit_summary_item">.*?<h4.*?>Creator.?:</h4>(.*?)</div>',
|
||||
'<a href="/name/.*?>(.*?)</a>',
|
||||
lambda ll: strip_tags(ll)
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'director': reference_section('directors'),
|
||||
'editor': reference_section('editors'),
|
||||
'composer': reference_section('composers'),
|
||||
'episodeTitle': {
|
||||
'page': 'combined',
|
||||
're': '<div id="tn15title">.*?<em>(.*?)</em>',
|
||||
'page': 'reference',
|
||||
're': '<h3 itemprop="name">(.*?)<',
|
||||
'type': 'string'
|
||||
},
|
||||
'filmingLocations': {
|
||||
|
|
@ -143,71 +167,44 @@ class Imdb(SiteParser):
|
|||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'genre': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Genre:</h5>(.*?)<hr',
|
||||
'<a href="/Sections/Genres/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'gross': {
|
||||
'page': 'business',
|
||||
're': [
|
||||
'<h5>Gross</h5>\s*?\$(.*?)<br',
|
||||
lambda data: find_re(data.replace(',', ''), '\d+')
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
|
||||
'gross': zebra_table('Cumulative Worldwide Gross', more=[
|
||||
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
||||
], type='int'),
|
||||
'keyword': {
|
||||
'page': 'keywords',
|
||||
're': '<a href="/keyword/.*?>(.*?)</a>',
|
||||
'type': 'list'
|
||||
},
|
||||
'language': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<div class="info"><h5>Language:</h5>.*?<div class="info">',
|
||||
#'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
|
||||
'<a.*?>(.*?)</a>',
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'summary': {
|
||||
'page': 'plotsummary',
|
||||
're': '<p class="plotSummary">(.*?)<\/p>',
|
||||
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
|
||||
'originalTitle': {
|
||||
'page': 'releaseinfo',
|
||||
're': '<td>\(original title\)</td>\s*<td>(.*?)</td>',
|
||||
'type': 'string'
|
||||
},
|
||||
'summary': zebra_table('Plot Summary', more=[
|
||||
'<p>(.*?)<em'
|
||||
]),
|
||||
'posterId': {
|
||||
'page': 'combined',
|
||||
're': '/primary-photo/media/rm(.*?)/tt',
|
||||
'page': 'reference',
|
||||
're': '<img.*?class="titlereference-primary-image".*?src="(.*?)".*?>',
|
||||
'type': 'string'
|
||||
},
|
||||
'posterIds': {
|
||||
'page': 'posters',
|
||||
're': '/unknown-thumbnail/media/rm(.*?)/tt',
|
||||
'type': 'list'
|
||||
},
|
||||
'producer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Produced by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'producer': reference_section('producers'),
|
||||
'productionCompany': {
|
||||
'page': 'combined',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'Production Companies</b><ul>(.*?)</ul>',
|
||||
'Production Companies.*?<ul(.*?)</ul>',
|
||||
'<a href="/company/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'rating': {
|
||||
'page': 'combined',
|
||||
're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'<div class="ipl-rating-star ">(.*?)</div>',
|
||||
'ipl-rating-star__rating">([\d,.]+?)</span>',
|
||||
],
|
||||
'type': 'float'
|
||||
},
|
||||
'releasedate': {
|
||||
|
|
@ -218,64 +215,55 @@ class Imdb(SiteParser):
|
|||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'reviews': {
|
||||
'page': 'externalreviews',
|
||||
're': [
|
||||
'<ol>(.*?)</ol>',
|
||||
'<li><a href="(http.*?)".*?>(.*?)</a></li>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'runtime': {
|
||||
'page': 'combined',
|
||||
're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
|
||||
'type': 'string'
|
||||
},
|
||||
'color': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Color:</h5><div class="info-content">(.*?)</div>',
|
||||
'<a.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'sound': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
|
||||
'<a.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
#FIXME using some /offsite/ redirect now
|
||||
#'reviews': {
|
||||
# 'page': 'externalreviews',
|
||||
# 're': [
|
||||
# '<ul class="simpleList">(.*?)</ul>',
|
||||
# '<li>.*?<a href="(http.*?)".*?>(.*?)</a>.*?</li>'
|
||||
# ],
|
||||
# 'type': 'list'
|
||||
#},
|
||||
'runtime': zebra_list('Runtime'),
|
||||
'color': zebra_list('Color', more=[
|
||||
'<a.*?>([^(<]+)',
|
||||
lambda r: r[0] if isinstance(r, list) else r,
|
||||
strip_tags
|
||||
]),
|
||||
'sound': zebra_list('Sound Mix', more=[
|
||||
'<a.*?>([^(<]+)',
|
||||
lambda r: r[0] if isinstance(r, list) else r,
|
||||
strip_tags
|
||||
]),
|
||||
'season': {
|
||||
'page': 'combined',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'\(Season (\d+), Episode \d+\)',
|
||||
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
||||
'Season (\d+)',
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'episode': {
|
||||
'page': 'combined',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'\(Season \d+, Episode (\d+)\)',
|
||||
'<ul class="ipl-inline-list titlereference-overview-season-episode-numbers">(.*?)</ul>',
|
||||
'Episode (\d+)',
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'series': {
|
||||
'page': 'combined',
|
||||
're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
|
||||
'page': 'reference',
|
||||
're': '<h4 itemprop="name">.*?<a href="/title/tt(\d{7})',
|
||||
'type': 'string'
|
||||
},
|
||||
'isSeries': {
|
||||
'page': 'combined',
|
||||
're': '<span class="tv-extra">(TV series|TV mini-series) ',
|
||||
'page': 'reference',
|
||||
're': 'property=\'og:title\'.*?content=".*?(TV series|TV mini-series).*?"',
|
||||
'type': 'string'
|
||||
},
|
||||
'title': {
|
||||
'page': 'combined',
|
||||
're': '<h1>(.*?) <span>',
|
||||
'page': 'releaseinfo',
|
||||
're': 'h3 itemprop="name">.*?>(.*?)</a>',
|
||||
'type': 'string'
|
||||
},
|
||||
'trivia': {
|
||||
|
|
@ -287,38 +275,45 @@ class Imdb(SiteParser):
|
|||
'type': 'list',
|
||||
},
|
||||
'votes': {
|
||||
'page': 'combined',
|
||||
're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
|
||||
'page': 'reference',
|
||||
're': [
|
||||
'class="ipl-rating-star__total-votes">\((.*?)\)',
|
||||
lambda r: r.replace(',', '')
|
||||
],
|
||||
'type': 'string'
|
||||
},
|
||||
'writer': {
|
||||
'page': 'combined',
|
||||
'writer': reference_section('writers'),
|
||||
'year': {
|
||||
'page': 'reference',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Writing credits</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
'<span class="titlereference-title-year">(.*?)</span>',
|
||||
'<a.*?>(\d+)',
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'credits': {
|
||||
'page': 'fullcredits',
|
||||
're': [
|
||||
lambda data: data.split('<h4'),
|
||||
'>(.*?)</h4>.*?(<table.*?</table>)',
|
||||
lambda data: [d for d in data if d]
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'year': {
|
||||
'page': 'combined',
|
||||
're': '="og:title" content="[^"]*?\((\d{4}).*?"',
|
||||
'type': 'int'
|
||||
}
|
||||
}
|
||||
|
||||
def read_url(self, url, timeout):
|
||||
if not url in self._cache:
|
||||
if url not in self._cache:
|
||||
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
|
||||
return self._cache[url]
|
||||
|
||||
def __init__(self, id, timeout=-1):
|
||||
#use akas.imdb.com to always get original title:
|
||||
#http://www.imdb.com/help/show_leaf?titlelanguagedisplay
|
||||
self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
|
||||
# use akas.imdb.com to always get original title:
|
||||
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay
|
||||
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
|
||||
super(Imdb, self).__init__(timeout)
|
||||
|
||||
url = self.baseUrl + 'combined'
|
||||
|
||||
url = self.baseUrl + 'reference'
|
||||
page = self.read_url(url, timeout=-1)
|
||||
if '<title>IMDb: Page not found</title>' in page \
|
||||
or 'The requested URL was not found on our server.' in page:
|
||||
|
|
@ -332,119 +327,15 @@ class Imdb(SiteParser):
|
|||
isinstance(self['alternativeTitles'][0], string_types):
|
||||
self['alternativeTitles'] = [self['alternativeTitles']]
|
||||
|
||||
for key in ('country', 'genre', 'language', 'sound', 'color'):
|
||||
if key in self:
|
||||
self[key] = [x[0] if len(x) == 1 and isinstance(x, list) else x for x in self[key]]
|
||||
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
|
||||
|
||||
#normalize country names
|
||||
if 'country' in self:
|
||||
self['country'] = [normalize_country_name(c) or c for c in self['country']]
|
||||
|
||||
if 'sound' in self:
|
||||
self['sound'] = list(set(self['sound']))
|
||||
|
||||
types = {}
|
||||
stop_words = [
|
||||
'alternative spelling',
|
||||
'alternative title',
|
||||
'alternative transliteration',
|
||||
'closing credits title',
|
||||
'complete title',
|
||||
'IMAX version',
|
||||
'informal short title',
|
||||
'International (Spanish title)',
|
||||
'Japan (imdb display title)',
|
||||
'longer version',
|
||||
'new title',
|
||||
'original subtitled version',
|
||||
'pre-release title',
|
||||
'promotional abbreviation',
|
||||
'recut version',
|
||||
'reissue title',
|
||||
'restored version',
|
||||
'script title',
|
||||
'short title',
|
||||
'(subtitle)',
|
||||
'TV title',
|
||||
'working title',
|
||||
'World-wide (Spanish title)',
|
||||
]
|
||||
#ignore english japanese titles
|
||||
#for movies that are not only from japan
|
||||
if ['Japan'] != self.get('country', []):
|
||||
stop_words += [
|
||||
'Japan (English title)'
|
||||
]
|
||||
for t in self.get('alternativeTitles', []):
|
||||
for type in t[0].split('/'):
|
||||
type = type.strip()
|
||||
stop_word = False
|
||||
for key in stop_words:
|
||||
if key in type:
|
||||
stop_word = True
|
||||
break
|
||||
if not stop_word:
|
||||
if not type in types:
|
||||
types[type] = []
|
||||
types[type].append(t[1])
|
||||
titles = {}
|
||||
for type in types:
|
||||
for title in types[type]:
|
||||
if not title in titles:
|
||||
titles[title] = []
|
||||
titles[title].append(type)
|
||||
def select_title(type):
|
||||
title = types[type][0]
|
||||
count = 0
|
||||
if len(types[type]) > 1:
|
||||
for t in types[type]:
|
||||
if len(titles[t]) > count:
|
||||
count = len(titles[t])
|
||||
title = t
|
||||
return title
|
||||
|
||||
#FIXME: does work in python2.6, possible to import from __future__?
|
||||
#types = {type: select_title(type) for type in types}
|
||||
_types = {}
|
||||
for type in types:
|
||||
_types[type] = select_title(type)
|
||||
types = _types
|
||||
|
||||
regexps = [
|
||||
"^.+ \(imdb display title\) \(English title\)$",
|
||||
"^USA \(imdb display title\)$",
|
||||
"^International \(English title\)$",
|
||||
"^International \(English title\)$",
|
||||
"^UK \(imdb display title\)$",
|
||||
"^International \(.+\) \(English title\)$",
|
||||
"^World-wide \(English title\)$",
|
||||
]
|
||||
if 'Hong Kong' in self.get('country', []):
|
||||
regexps += [
|
||||
"Hong Kong \(English title\)"
|
||||
]
|
||||
english_countries = (
|
||||
'USA', 'UK', 'United States', 'United Kingdom',
|
||||
'Australia', 'New Zealand'
|
||||
)
|
||||
if not filter(lambda c: c in english_countries, self.get('country', [])):
|
||||
regexps += [
|
||||
"^[^(]+ \(English title\)$",
|
||||
"^.+ \(.+\) \(English title\)$",
|
||||
"^USA$",
|
||||
"^UK$",
|
||||
"^USA \(.+\)$",
|
||||
"^UK \(.+\)$",
|
||||
"^Australia \(.+\)$",
|
||||
"World-wide \(English title\)",
|
||||
"\(literal English title\)",
|
||||
"^International \(.+ title\)$",
|
||||
"^International \(.+\) \(.+ title\)$",
|
||||
]
|
||||
for regexp in regexps:
|
||||
for type in types:
|
||||
if re.compile(regexp).findall(type):
|
||||
#print types[type], type
|
||||
self['internationalTitle'] = types[type]
|
||||
break
|
||||
if 'internationalTitle' in self:
|
||||
break
|
||||
|
||||
def cleanup_title(title):
|
||||
if title.startswith('"') and title.endswith('"'):
|
||||
|
|
@ -454,44 +345,43 @@ class Imdb(SiteParser):
|
|||
title = re.sub('\(\#[.\d]+\)', '', title)
|
||||
return title.strip()
|
||||
|
||||
for t in ('title', 'internationalTitle'):
|
||||
for t in ('title', 'originalTitle'):
|
||||
if t in self:
|
||||
self[t] = cleanup_title(self[t])
|
||||
|
||||
if 'internationalTitle' in self and \
|
||||
self.get('title', '').lower() == self['internationalTitle'].lower():
|
||||
del self['internationalTitle']
|
||||
|
||||
if 'alternativeTitles' in self:
|
||||
alt = {}
|
||||
for t in self['alternativeTitles']:
|
||||
title = cleanup_title(t[1])
|
||||
if title not in (self.get('title'), self.get('internationalTitle')):
|
||||
if title.lower() not in (self.get('title', '').lower(), self.get('originalTitle', '').lower()):
|
||||
if title not in alt:
|
||||
alt[title] = []
|
||||
for c in t[0].split('/'):
|
||||
if not '(working title)' in c:
|
||||
c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
|
||||
if c:
|
||||
alt[title].append(c)
|
||||
for cleanup in ('International', '(working title)', 'World-wide'):
|
||||
c = c.replace(cleanup, '')
|
||||
c = c.split('(')[0].strip()
|
||||
if c:
|
||||
alt[title].append(c)
|
||||
self['alternativeTitles'] = []
|
||||
for t in sorted(alt, key=lambda a: sorted(alt[a])):
|
||||
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
|
||||
countries = sorted(set([normalize_country_name(c) or c for c in alt[t]]))
|
||||
self['alternativeTitles'].append((t, countries))
|
||||
if not self['alternativeTitles']:
|
||||
del self['alternativeTitles']
|
||||
|
||||
if 'internationalTitle' in self:
|
||||
self['originalTitle'] = self['title']
|
||||
self['title'] = self.pop('internationalTitle')
|
||||
|
||||
if 'runtime' in self and self['runtime']:
|
||||
if 'min' in self['runtime']: base=60
|
||||
else: base=1
|
||||
if isinstance(self['runtime'], list):
|
||||
self['runtime'] = self['runtime'][0]
|
||||
if 'min' in self['runtime']:
|
||||
base = 60
|
||||
else:
|
||||
base = 1
|
||||
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
|
||||
if 'runtime' in self and not self['runtime']:
|
||||
del self['runtime']
|
||||
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
|
||||
|
||||
if 'sound' in self:
|
||||
self['sound'] = list(sorted(set(self['sound'])))
|
||||
|
||||
if 'cast' in self:
|
||||
if isinstance(self['cast'][0], string_types):
|
||||
|
|
@ -499,6 +389,7 @@ class Imdb(SiteParser):
|
|||
self['actor'] = [c[0] for c in self['cast']]
|
||||
def cleanup_character(c):
|
||||
c = c.replace('(uncredited)', '').strip()
|
||||
c = re.sub('\s+', ' ', c)
|
||||
return c
|
||||
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
|
||||
for x in self['cast']]
|
||||
|
|
@ -522,18 +413,8 @@ class Imdb(SiteParser):
|
|||
return r
|
||||
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
|
||||
|
||||
|
||||
self['connections'] = cc
|
||||
|
||||
for key in ('country', 'genre'):
|
||||
if key in self:
|
||||
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
|
||||
#0092999
|
||||
if '_director' in self:
|
||||
if 'series' in self or 'isSeries' in self:
|
||||
self['creator'] = self.pop('_director')
|
||||
else:
|
||||
del self['_director']
|
||||
if 'isSeries' in self:
|
||||
del self['isSeries']
|
||||
self['isSeries'] = True
|
||||
|
|
@ -555,7 +436,7 @@ class Imdb(SiteParser):
|
|||
if 'director' in self:
|
||||
self['episodeDirector'] = self['director']
|
||||
|
||||
if not 'creator' in series and 'director' in series:
|
||||
if 'creator' not in series and 'director' in series:
|
||||
series['creator'] = series['director']
|
||||
if len(series['creator']) > 10:
|
||||
series['creator'] = series['director'][:1]
|
||||
|
|
@ -566,7 +447,7 @@ class Imdb(SiteParser):
|
|||
|
||||
if 'year' in series:
|
||||
self['seriesYear'] = series['year']
|
||||
if not 'year' in self:
|
||||
if 'year' not in self:
|
||||
self['year'] = series['year']
|
||||
|
||||
if 'year' in self:
|
||||
|
|
@ -620,11 +501,48 @@ class Imdb(SiteParser):
|
|||
self['summary'] = self['summary'][0]
|
||||
self['summary'] = self['summary'].split('</p')[0].strip()
|
||||
|
||||
if 'credits' in self:
|
||||
credits = [
|
||||
[
|
||||
strip_tags(d[0].replace(' by', '')).strip(),
|
||||
[
|
||||
[
|
||||
strip_tags(x[0]).strip(),
|
||||
[t.strip().split(' (')[0].strip() for t in x[2].split(' / ')]
|
||||
]
|
||||
for x in
|
||||
re.compile('<td class="name">(.*?)</td>.*?<td>(.*?)</td>.*?<td class="credit">(.*?)</td>', re.DOTALL).findall(d[1])
|
||||
]
|
||||
] for d in self['credits'] if d
|
||||
]
|
||||
credits = [c for c in credits if c[1]]
|
||||
|
||||
self['credits'] = []
|
||||
self['lyricist'] = []
|
||||
self['singer'] = []
|
||||
for department, crew in credits:
|
||||
department = department.replace('(in alphabetical order)', '').strip()
|
||||
for c in crew:
|
||||
name = c[0]
|
||||
roles = c[1]
|
||||
self['credits'].append({
|
||||
'name': name,
|
||||
'roles': roles,
|
||||
'deparment': department
|
||||
})
|
||||
if department == 'Music Department':
|
||||
if 'lyricist' in roles:
|
||||
self['lyricist'].append(name)
|
||||
if 'playback singer' in roles:
|
||||
self['singer'].append(name)
|
||||
if not self['credits']:
|
||||
del self['credits']
|
||||
|
||||
class ImdbCombined(Imdb):
    """Imdb parser limited to the 'releaseinfo' and 'reference' pages.

    Only the entries of the inherited ``regex`` table whose ``'page'`` value
    is one of those two pages are kept before the base class initialiser
    runs.
    """

    def __init__(self, id, timeout=-1):
        # Assigning to self.regex creates an instance attribute; the table
        # defined on the class itself is left untouched.
        self.regex = {
            key: spec
            for key, spec in self.regex.items()
            if spec['page'] in ('releaseinfo', 'reference')
        }
        super(ImdbCombined, self).__init__(id, timeout)
|
||||
|
|
@ -640,25 +558,25 @@ def get_movie_by_title(title, timeout=-1):
|
|||
If there is more than one film with that title for the year
|
||||
Title (Year/I)
|
||||
|
||||
>>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')
|
||||
u'1602860'
|
||||
>>> str(get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}'))
|
||||
'1602860'
|
||||
|
||||
>>> get_movie_by_title(u'The Matrix (1999)')
|
||||
u'0133093'
|
||||
>>> str(get_movie_by_title(u'The Matrix (1999)'))
|
||||
'0133093'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt (1951)')
|
||||
u'0043748'
|
||||
>>> str(get_movie_by_title(u'Little Egypt (1951)'))
|
||||
'0043748'
|
||||
|
||||
>>> str(get_movie_by_title(u'Little Egypt (1897/I)'))
|
||||
'0214882'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt (1897/I)')
|
||||
u'0214882'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt')
|
||||
None
|
||||
|
||||
>>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
|
||||
u'0866567'
|
||||
>>> str(get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}'))
|
||||
'0866567'
|
||||
'''
|
||||
params = {'s':'tt','q': title}
|
||||
params = {'s': 'tt', 'q': title}
|
||||
if not isinstance(title, bytes):
|
||||
try:
|
||||
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
|
||||
|
|
@ -676,20 +594,21 @@ def get_movie_by_title(title, timeout=-1):
|
|||
|
||||
def get_movie_id(title, director='', year='', timeout=-1):
|
||||
'''
|
||||
>>> get_movie_id('The Matrix')
|
||||
u'0133093'
|
||||
>>> str(get_movie_id('The Matrix'))
|
||||
'0133093'
|
||||
|
||||
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
|
||||
u'0060304'
|
||||
>>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard'))
|
||||
'0060304'
|
||||
|
||||
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
|
||||
u'0060304'
|
||||
>>> str(get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967'))
|
||||
'0060304'
|
||||
|
||||
>>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
|
||||
u'0179214'
|
||||
>>> str(get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", u'Jean-Luc Godard'))
|
||||
'0179214'
|
||||
|
||||
>>> str(get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", u'Jean-Luc Godard'))
|
||||
'0179214'
|
||||
|
||||
>>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
|
||||
u'0179214'
|
||||
'''
|
||||
imdbId = {
|
||||
(u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
|
||||
|
|
@ -729,7 +648,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
|
|||
}.get((title, director), None)
|
||||
if imdbId:
|
||||
return imdbId
|
||||
params = {'s':'tt','q': title}
|
||||
params = {'s': 'tt', 'q': title}
|
||||
if director:
|
||||
params['q'] = u'"%s" %s' % (title, director)
|
||||
if year:
|
||||
|
|
@ -756,8 +675,8 @@ def get_movie_id(title, director='', year='', timeout=-1):
|
|||
if results:
|
||||
return results[0]
|
||||
|
||||
#print (title, director), ": '',"
|
||||
#print google_query
|
||||
#print((title, director), ": '',")
|
||||
#print(google_query)
|
||||
#results = google.find(google_query, timeout=timeout)
|
||||
results = duckduckgo.find(google_query, timeout=timeout)
|
||||
if results:
|
||||
|
|
@ -772,15 +691,12 @@ def get_movie_poster(imdbId):
|
|||
'''
|
||||
>>> get_movie_poster('0133093')
|
||||
'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
|
||||
|
||||
>>> get_movie_poster('0994352')
|
||||
'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
|
||||
'''
|
||||
info = ImdbCombined(imdbId)
|
||||
if 'posterId' in info:
|
||||
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
|
||||
data = read_url(url).decode('utf-8', 'ignore')
|
||||
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
|
||||
poster = info['posterId']
|
||||
if '@._V' in poster:
|
||||
poster = poster.split('@._V')[0] + '@.jpg'
|
||||
return poster
|
||||
elif 'series' in info:
|
||||
return get_movie_poster(info['series'])
|
||||
|
|
@ -793,7 +709,7 @@ def get_episodes(imdbId, season=None):
|
|||
url += '?season=%d' % season
|
||||
data = cache.read_url(url)
|
||||
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
|
||||
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
|
||||
episodes['S%02dE%02d' % (int(e[1]), int(e[2]))] = e[0]
|
||||
else:
|
||||
data = cache.read_url(url)
|
||||
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
|
||||
|
|
@ -804,9 +720,11 @@ def get_episodes(imdbId, season=None):
|
|||
|
||||
def max_votes():
    """Return the highest vote count on IMDb's most-voted title listing.

    The listing (titles with at least 500000 votes, sorted by vote count) is
    fetched through the cache, decoded as UTF-8 with undecodable bytes
    ignored, and scanned for ``<span name="nv" data-value="...">`` elements.

    Raises:
        ValueError: if the page contains no vote counts (``max`` of an
            empty sequence).
    """
    url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
    data = cache.read_url(url).decode('utf-8', 'ignore')
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    counts = re.findall(r'<span name="nv" data-value="(\d+)"', data)
    # The pattern captures digits only, so no separator stripping is needed.
    return max([int(count) for count in counts])
|
||||
|
||||
def guess(title, director='', timeout=-1):
|
||||
|
|
|
|||
|
|
@ -3,26 +3,34 @@
|
|||
from __future__ import print_function
|
||||
import re
|
||||
|
||||
from ox.cache import read_url
|
||||
import ox.cache
|
||||
from ox.html import strip_tags
|
||||
from ox.text import find_re
|
||||
|
||||
|
||||
def read_url(url, timeout=ox.cache.cache_timeout):
    """Fetch ``url`` through ``ox.cache`` and return its content as text.

    The payload is decoded as UTF-8; if that fails it is decoded as
    latin-1 instead.
    """
    raw = ox.cache.read_url(url, timeout=timeout)
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        return raw.decode('latin-1')
|
||||
|
||||
def get_data(id):
|
||||
'''
|
||||
>>> get_data('1991/silence_of_the_lambs')['imdbId']
|
||||
u'0102926'
|
||||
>>> str(get_data('1991/silence_of_the_lambs')['imdbId'])
|
||||
'0102926'
|
||||
|
||||
>>> get_data('1991/silence_of_the_lambs')['posters'][0]
|
||||
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
|
||||
>>> str(get_data('1991/silence_of_the_lambs')['posters'][0])
|
||||
'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
|
||||
|
||||
>>> get_data('1991/silence_of_the_lambs')['url']
|
||||
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
|
||||
>>> str(get_data('1991/silence_of_the_lambs')['url'])
|
||||
'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
|
||||
'''
|
||||
data = {
|
||||
'url': get_url(id)
|
||||
}
|
||||
html = read_url(data['url'], unicode=True)
|
||||
html = read_url(data['url'])
|
||||
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
|
||||
if not data['imdbId']:
|
||||
data['imdbId'] = _id_map.get(id, '')
|
||||
|
|
@ -37,16 +45,15 @@ def get_data(id):
|
|||
for result in results:
|
||||
result = result.replace('_xlg.html', '.html')
|
||||
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
|
||||
html = read_url(url, unicode=True)
|
||||
html = read_url(url)
|
||||
result = find_re(html, '<a href = (\w*?_xlg.html)')
|
||||
if result:
|
||||
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
|
||||
html = read_url(url, unicode=True)
|
||||
html = read_url(url)
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
|
||||
else:
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
|
||||
data['posters'].append(poster)
|
||||
|
||||
return data
|
||||
|
||||
def get_id(url):
|
||||
|
|
@ -60,27 +67,29 @@ def get_id(url):
|
|||
id = '%s/%s' % (year, '_'.join(split))
|
||||
return id
|
||||
|
||||
|
||||
def get_ids(page=None):
    """Collect impawards.com poster ids.

    With ``page`` given, that archive page is read (``timeout=-1``) and the
    ids linked from it are returned as a ``set``.

    Without ``page``, the latest archive page is read to learn the page
    count, every page is visited from the highest number down to 1, and the
    unique ids are returned as a ``list`` in first-seen order.
    """
    if page:
        html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout=-1)
        # Raw string: '\.' inside a plain literal is an invalid escape sequence.
        links = re.compile(r'<a href = \.\./(.*?)>', re.DOTALL).findall(html)
        return set(get_id('http://impawards.com/%s' % link) for link in links)
    # get all
    html = read_url('http://www.impawards.com/archives/latest.html', timeout=60*60)
    pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
    ids = []
    seen = set()  # constant-time duplicate check; ``ids`` keeps the order
    for number in range(pages, 0, -1):
        for poster_id in get_ids(number):
            if poster_id not in seen:
                seen.add(poster_id)
                ids.append(poster_id)
    return ids
|
||||
|
||||
|
||||
def get_url(id):
    """Return the impawards.com page URL for the poster ``id``.

    The plain ``<id>.html`` page is fetched first; when it contains the text
    "No Movie Posters on This Page", the ``<id>_ver1.html`` URL is returned
    instead.
    """
    url = u"http://www.impawards.com/%s.html" % id
    # This module's read_url wrapper takes only (url, timeout) and already
    # returns decoded text, so no ``unicode`` flag is passed.
    html = read_url(url)
    if find_re(html, "No Movie Posters on This Page"):
        return u"http://www.impawards.com/%s_ver1.html" % id
    return url
|
||||
|
|
|
|||
|
|
@ -28,22 +28,32 @@ def get_show_url(title):
|
|||
def get_data(url):
|
||||
data = read_url(url, unicode=True)
|
||||
doc = document_fromstring(data)
|
||||
score = filter(lambda s: s.attrib.get('property') == 'v:average',
|
||||
doc.xpath('//span[@class="score_value"]'))
|
||||
score = [s for s in doc.xpath('//span[@class="score_value"]')
|
||||
if s.attrib.get('property') == 'v:average']
|
||||
if score:
|
||||
score = int(score[0].text)
|
||||
else:
|
||||
score = -1
|
||||
authors = [a.text
|
||||
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')]
|
||||
sources = [d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')]
|
||||
reviews = [d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')]
|
||||
scores = [int(d.text.strip())
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')]
|
||||
urls = [a.attrib['href']
|
||||
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')]
|
||||
authors = [
|
||||
a.text
|
||||
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')
|
||||
]
|
||||
sources = [
|
||||
d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')
|
||||
]
|
||||
reviews = [
|
||||
d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')
|
||||
]
|
||||
scores = [
|
||||
int(d.text.strip())
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')
|
||||
]
|
||||
urls = [
|
||||
a.attrib['href']
|
||||
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')
|
||||
]
|
||||
|
||||
metacritics = []
|
||||
for i in range(len(authors)):
|
||||
|
|
@ -54,7 +64,7 @@ def get_data(url):
|
|||
'quote': strip_tags(reviews[i]).strip(),
|
||||
'score': scores[i],
|
||||
})
|
||||
|
||||
|
||||
return {
|
||||
'critics': metacritics,
|
||||
'id': get_id(url),
|
||||
|
|
|
|||
|
|
@ -1,121 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from datetime import datetime
|
||||
import re
|
||||
import socket
|
||||
from six.moves.urllib.parse import quote
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, int_value, normalize_newlines
|
||||
from ox.normalize import normalize_imdbid
|
||||
import ox
|
||||
|
||||
from torrent import Torrent
|
||||
|
||||
|
||||
def _parse_results_page(data, max_results=10):
|
||||
results=[]
|
||||
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
|
||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||
torrentDate = row[0]
|
||||
torrentExtra = row[1]
|
||||
torrentId = row[2]
|
||||
torrentTitle = decode_html(row[3]).strip()
|
||||
torrentLink = "http://www.mininova.org/tor/" + torrentId
|
||||
privateTracker = 'priv.gif' in torrentExtra
|
||||
if not privateTracker:
|
||||
results.append((torrentTitle, torrentLink, ''))
|
||||
return results
|
||||
|
||||
def find_movie(query=None, imdb=None, max_results=10):
    '''search for torrents on mininova

    An ``imdb`` id, when given, takes precedence over the free-text
    ``query``. The result page is handed to _parse_results_page.
    '''
    url = (
        "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
        if imdb else
        "http://www.mininova.org/search/%s/seeds" % quote(query)
    )
    page = read_url(url, unicode=True)
    return _parse_results_page(page, max_results)
|
||||
|
||||
def get_id(mininovaId):
    """Normalise a mininova id or URL to the bare id string.

    The first ``/<digits>`` group found in the value is returned when there
    is one; otherwise the text after the last ``/`` (or the whole value when
    it contains no slash).
    """
    # u'%s' formatting coerces to text on Python 2 and 3 alike; the bare
    # ``unicode`` builtin does not exist on Python 3.
    mininovaId = u'%s' % (mininovaId,)
    d = find_re(mininovaId, r"/(\d+)")
    if d:
        return d
    # Without a slash, split() yields one element, so [-1] covers both cases.
    return mininovaId.split('/')[-1]
|
||||
|
||||
def exists(mininovaId):
    """Return True when the torrent page exists and needs no registration.

    False is returned for an empty response, a "Torrent not found..." page,
    or a page saying the tracker requires registration.
    """
    mininovaId = get_id(mininovaId)
    data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
    if not data:
        return False
    # NOTE(review): ox.net.read_url appears to return raw bytes; confirm.
    # Decoding first keeps the substring tests below from raising TypeError.
    if isinstance(data, bytes):
        data = data.decode('utf-8', 'ignore')
    if 'Torrent not found...' in data:
        return False
    if 'tracker</a> of this torrent requires registration.' in data:
        return False
    return True
|
||||
|
||||
def get_data(mininovaId):
    """Scrape the mininova pages of a torrent into a dict.

    The comment and details pages are fetched and concatenated. Every
    ``<p> <strong>label:</strong>value</p>`` pair becomes an entry keyed by
    the lower-cased label ('by' is stored as 'uploader'). The title, imdbId,
    description and the parsed .torrent file ('torrent_info') are added.

    Returns:
        The torrent dict, or ``None`` when the site reports
        "Torrent not found...".
    """
    _key_map = {
        'by': u'uploader',
    }
    mininovaId = get_id(mininovaId)
    torrent = dict()
    torrent[u'id'] = mininovaId
    torrent[u'domain'] = 'mininova.org'
    torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
    torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
    torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId

    data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
    if '<h1>Torrent not found...</h1>' in data:
        return None

    for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
        key = d[0].lower().strip()
        key = _key_map.get(key, key)
        value = decode_html(strip_tags(d[1].strip()))
        torrent[key] = value

    torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    torrent[u'imdbId'] = find_re(data, r'title/tt(\d{7})')
    torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
    if torrent['description']:
        torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
    # The .torrent file is fetched without unicode=True and handed to
    # get_torrent_info as-is.
    t = read_url(torrent[u'torrent_link'])
    torrent[u'torrent_info'] = get_torrent_info(t)
    return torrent
|
||||
|
||||
class Mininova(Torrent):
    '''
    Torrent populated from the mininova pages of the given id.

    When get_data finds nothing the instance stays an empty dict.

    >>> Mininova('123')
    {}
    >>> Mininova('1072195')['infohash']
    '72dfa59d2338e4a48c78cec9de25964cddb64104'
    '''
    def __init__(self, mininovaId):
        self.data = get_data(mininovaId)
        if not self.data:
            return
        Torrent.__init__(self)
        # 'share ratio' is expected to hold "<seeders>, <leechers>"
        ratio = self.data['share ratio'].split(',')
        self['seeder'] = self['leecher'] = -1
        if len(ratio) == 2:
            for key, part in zip(('seeder', 'leecher'), ratio):
                count = int_value(part.replace(',', '').strip())
                if count:
                    self[key] = int(count)
        downloads = int_value(self.data['downloads'].replace(',', '').strip())
        self['downloaded'] = int(downloads) if downloads else -1
        # Drop the " +0000"-style offset suffix before parsing the date
        added = self.data['added on'].split(' +')[0]
        self['published'] = datetime.strptime(added, "%a, %d %b %Y %H:%M:%S")
|
||||
|
||||
|
|
@ -2,12 +2,12 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
import feedparser
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
from ox.iso import langCode2To3, langTo3Code
|
||||
|
||||
def find_subtitles(imdb, parts = 1, language = "eng"):
|
||||
import feedparser
|
||||
if len(language) == 2:
|
||||
language = langCode2To3(language)
|
||||
elif len(language) != 3:
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ def get_data(url):
|
|||
r['summary'] = get_og(data, 'description')
|
||||
|
||||
meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
|
||||
meter = filter(lambda m: m[1].isdigit(), meter)
|
||||
meter = [m for m in meter if m[1].isdigit()]
|
||||
if meter:
|
||||
r['tomatometer'] = meter[0][1]
|
||||
r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ class SiteParser(dict):
|
|||
return "%s%s" % (self.baseUrl, page)
|
||||
|
||||
def read_url(self, url, timeout):
    """Return the text of ``url``, fetching it only on a cache miss.

    Pages are stored in ``self._cache`` keyed by URL. On a miss the
    module-level ``read_url`` is called with ``unicode=True`` and the given
    ``timeout``.
    """
    if url not in self._cache:
        self._cache[url] = read_url(url, timeout=timeout, unicode=True)
    return self._cache[url]
|
||||
|
||||
|
|
|
|||
|
|
@ -95,7 +95,7 @@ def format_subsection(string):
|
|||
'ussports': 'US-Sports',
|
||||
'wunderbar': 'wunderBAR'
|
||||
}
|
||||
if subsection.has_key(string):
|
||||
if string in subsection:
|
||||
return subsection[string].replace(u'\xc3', 'ae')
|
||||
return string[:1].upper() + string[1:]
|
||||
|
||||
|
|
@ -219,8 +219,8 @@ def archive_news():
|
|||
else:
|
||||
dMax = days[m]
|
||||
for d in range(dMax, 0, -1):
|
||||
print('getNews(%d, %d, %d)' % (y, m, d))
|
||||
news = getNews(y, m ,d)
|
||||
print('get_news(%d, %d, %d)' % (y, m, d))
|
||||
news = get_news(y, m, d)
|
||||
for new in news:
|
||||
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
|
||||
if not os.path.exists(dirname):
|
||||
|
|
@ -230,7 +230,7 @@ def archive_news():
|
|||
else:
|
||||
filename = dirname + '/' + new['url'] + '.json'
|
||||
if not os.path.exists(filename) or True:
|
||||
data = json.dumps(new, ensure_ascii = False)
|
||||
data = json.dumps(new, ensure_ascii=False)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
|
|
@ -253,7 +253,7 @@ def archive_news():
|
|||
string = strings[3]
|
||||
if len(strings) == 6:
|
||||
string += '/' + strings[4]
|
||||
if not count.has_key(string):
|
||||
if string not in count:
|
||||
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
|
||||
else:
|
||||
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
|
||||
|
|
@ -269,12 +269,12 @@ if __name__ == '__main__':
|
|||
# spiegel = Spiegel(2008, 8)
|
||||
# print(spiegel.getContents())
|
||||
# news = News(2001, 9, 10)
|
||||
# output(news.getNews())
|
||||
# output(news.get_news())
|
||||
'''
|
||||
x = []
|
||||
for d in range(10, 30):
|
||||
print('2/%d' % d)
|
||||
news = getNews(2008, 2, d)
|
||||
news = get_news(2008, 2, d)
|
||||
for new in news:
|
||||
strings = new['url'].split('/')
|
||||
string = format_section(strings[3])
|
||||
|
|
|
|||
|
|
@ -21,10 +21,10 @@ def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
|
|||
Return max_results tuples with title, url, description
|
||||
|
||||
>>> find("The Matrix site:imdb.com", 1)[0][0]
|
||||
u'The Matrix (1999) - IMDb'
|
||||
'The Matrix (1999) - IMDb'
|
||||
|
||||
>>> find("The Matrix site:imdb.com", 1)[0][1]
|
||||
u'http://www.imdb.com/title/tt0133093/'
|
||||
'http://www.imdb.com/title/tt0133093/'
|
||||
"""
|
||||
results = []
|
||||
url = 'https://eu1.startpage.com/do/search?nosteeraway=1&abp=1&language=english&cmd=process_search&query=%s&x=0&y=0&cat=web&engine0=v1all' % quote_plus(query)
|
||||
|
|
|
|||
|
|
@ -9,11 +9,10 @@ from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normal
|
|||
from ox.normalize import normalize_imdbid
|
||||
import ox
|
||||
|
||||
from torrent import Torrent
|
||||
|
||||
cache_timeout = 24*60*60 # cache search only for 24 hours
|
||||
|
||||
season_episode = re.compile("S..E..", re.IGNORECASE)
|
||||
baseurl = "https://thepiratebay.org/"
|
||||
|
||||
|
||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||
|
|
@ -25,7 +24,7 @@ def find_movies(query=None, imdb=None, max_results=10):
|
|||
if imdb:
|
||||
query = "tt" + normalize_imdbid(imdb)
|
||||
results = []
|
||||
next = ["https://thepiratebay.se/search/%s/0/3/200" % quote(query), ]
|
||||
next = [baseurl + "hsearch/%s/0/3/200" % quote(query), ]
|
||||
page_count = 1
|
||||
while next and page_count < 4:
|
||||
page_count += 1
|
||||
|
|
@ -33,12 +32,12 @@ def find_movies(query=None, imdb=None, max_results=10):
|
|||
if not url.startswith('http'):
|
||||
if not url.startswith('/'):
|
||||
url = "/" + url
|
||||
url = "https://thepiratebay.se" + url
|
||||
url = baseurl + url
|
||||
data = read_url(url, timeout=cache_timeout, unicode=True)
|
||||
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
|
||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||
torrentType = row[0]
|
||||
torrentLink = "https://thepiratebay.se" + row[1]
|
||||
torrentLink = baseurl + row[1]
|
||||
torrentTitle = decode_html(row[2])
|
||||
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
|
||||
if torrentType in ['201']:
|
||||
|
|
@ -61,7 +60,7 @@ def get_id(piratebayId):
|
|||
|
||||
def exists(piratebayId):
    """Return whether the torrent page for ``piratebayId`` exists.

    The id is normalised with get_id and checked with ``ox.net.exists``
    against the module's ``baseurl`` rather than a hard-coded host.
    """
    piratebayId = get_id(piratebayId)
    return ox.net.exists(baseurl + "torrent/%s" % piratebayId)
|
||||
|
||||
def get_data(piratebayId):
|
||||
_key_map = {
|
||||
|
|
@ -75,7 +74,7 @@ def get_data(piratebayId):
|
|||
torrent = dict()
|
||||
torrent[u'id'] = piratebayId
|
||||
torrent[u'domain'] = 'thepiratebay.org'
|
||||
torrent[u'comment_link'] = 'https://thepiratebay.se/torrent/%s' % piratebayId
|
||||
torrent[u'comment_link'] = baseurl + 'torrent/%s' % piratebayId
|
||||
|
||||
data = read_url(torrent['comment_link'], unicode=True)
|
||||
torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
|
||||
|
|
@ -84,33 +83,15 @@ def get_data(piratebayId):
|
|||
torrent[u'title'] = decode_html(torrent[u'title']).strip()
|
||||
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
|
||||
title = quote(torrent['title'].encode('utf-8'))
|
||||
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
|
||||
torrent[u'magent_link']= find_re(data, '"(magnet:.*?)"')
|
||||
torrent[u'infohash'] = find_re(torrent[u'magent_link'], "btih:(.*?)&")
|
||||
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
|
||||
key = d[0].lower().strip()
|
||||
key = _key_map.get(key, key)
|
||||
value = decode_html(strip_tags(d[1].strip()))
|
||||
torrent[key] = value
|
||||
if not '<' in key:
|
||||
torrent[key] = value
|
||||
torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
|
||||
if torrent[u'description']:
|
||||
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
|
||||
t = read_url(torrent[u'torrent_link'])
|
||||
torrent[u'torrent_info'] = get_torrent_info(t)
|
||||
return torrent
|
||||
|
||||
class Thepiratebay(Torrent):
    '''
    Torrent populated from the thepiratebay page of the given id.

    When get_data returns nothing the instance stays an empty dict.

    >>> Thepiratebay('123')
    {}

    >>> Thepiratebay('3951349')['infohash']
    '4e84415d36ed7b54066160c05a0b0f061898d12b'
    '''
    def __init__(self, piratebayId):
        self.data = get_data(piratebayId)
        if not self.data:
            return
        Torrent.__init__(self)
        # Strip the " GMT" marker and any " +offset" suffix before parsing
        uploaded = self.data['uploaded'].replace(' GMT', '')
        uploaded = uploaded.split(' +')[0]
        self['published'] = datetime.strptime(uploaded, "%Y-%m-%d %H:%M:%S")
|
||||
|
||||
|
|
|
|||
|
|
@ -1,37 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from ox import int_value
|
||||
|
||||
|
||||
class Torrent(dict):
    '''
    Dictionary of normalised torrent metadata.

    Subclasses assign ``self.data`` (a dict of scraped values) before calling
    ``Torrent.__init__``, which copies the known keys into the dictionary
    with typed defaults and derives 'infohash', 'size', 'announce' and
    'files' from ``data['torrent_info']``.

    >>> Torrent()
    {'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
    '''
    # Keys copied from ``data``, grouped by the default used when missing.
    _string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
                    'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
    _int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
    _dict_keys = ('torrent_info', )
    _list_keys = ()
    # Class-level fallback used when a subclass never sets ``self.data``.
    data = {'torrent_info': {}}

    def __init__(self):
        for key in self._string_keys:
            self[key] = self.data.get(key, u'')
        for key in self._dict_keys:
            self[key] = self.data.get(key, {})
        for key in self._list_keys:
            self[key] = self.data.get(key, [])
        for key in self._int_keys:
            value = self.data.get(key, -1)
            if not isinstance(value, int):
                value = int(int_value(value))
            self[key] = value
        # ``data`` supplied by a subclass may have no 'torrent_info' entry;
        # fall back to an empty mapping instead of raising KeyError.
        info = self.data.get('torrent_info') or {}
        self['infohash'] = info.get('hash', '')
        self['size'] = info.get('size', -1)
        self['announce'] = info.get('announce', '')
        if 'files' in info:
            self['files'] = len(info['files'])
        else:
            self['files'] = 1
|
||||
|
||||
|
|
@ -116,7 +116,7 @@ def get_movie_data(wikipedia_url):
|
|||
|
||||
def get_image_url(name):
|
||||
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
|
||||
data = read_url(url)
|
||||
data = read_url(url).decode('utf-8')
|
||||
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
|
||||
if not url:
|
||||
url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
|
||||
|
|
@ -145,7 +145,7 @@ def find(query, max_results=10):
|
|||
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
|
||||
data = read_url(url)
|
||||
if not data:
|
||||
data = read_url(url, timeout=0)
|
||||
data = read_url(url, timeout=0)
|
||||
result = json.loads(data.decode('utf-8'))
|
||||
results = []
|
||||
if result and 'query' in result:
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ import re
|
|||
from xml.dom.minidom import parseString
|
||||
import json
|
||||
|
||||
import feedparser
|
||||
import ox
|
||||
from ox.cache import read_url, cache_timeout
|
||||
|
||||
|
|
@ -27,15 +26,15 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout):
|
|||
"""
|
||||
fmt = None
|
||||
if format == '4k':
|
||||
fmt=38
|
||||
fmt = 38
|
||||
elif format == '1080p':
|
||||
fmt=37
|
||||
fmt = 37
|
||||
elif format == '720p':
|
||||
fmt=22
|
||||
fmt = 22
|
||||
elif format == 'mp4':
|
||||
fmt=18
|
||||
fmt = 18
|
||||
elif format == 'high':
|
||||
fmt=35
|
||||
fmt = 35
|
||||
elif format == 'webm':
|
||||
streams = videos(youtubeId, 'webm')
|
||||
return streams[max(streams.keys())]['url']
|
||||
|
|
@ -46,14 +45,14 @@ def video_url(youtubeId, format='mp4', timeout=cache_timeout):
|
|||
|
||||
def get_video_info(id):
|
||||
eurl = get_url(id)
|
||||
data = read_url(eurl)
|
||||
data = read_url(eurl).decode('utf-8')
|
||||
t = re.compile('\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
|
||||
if t:
|
||||
t = t[0]
|
||||
else:
|
||||
raise IOError
|
||||
url = "http://www.youtube.com/get_video_info?&video_id=%s&el=$el&ps=default&eurl=%s&hl=en_US&t=%s" % (id, quote(eurl), quote(t))
|
||||
data = read_url(url)
|
||||
data = read_url(url).decode('utf-8')
|
||||
info = {}
|
||||
for part in data.split('&'):
|
||||
key, value = part.split('=')
|
||||
|
|
@ -61,6 +60,7 @@ def get_video_info(id):
|
|||
return info
|
||||
|
||||
def find(query, max_results=10, offset=1, orderBy='relevance'):
|
||||
import feedparser
|
||||
query = quote(query)
|
||||
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
|
||||
data = read_url(url)
|
||||
|
|
@ -104,14 +104,20 @@ def info(id, timeout=cache_timeout):
|
|||
info['license'] = match[0].strip()
|
||||
info['license'] = re.sub('<.+?>', '', info['license']).strip()
|
||||
|
||||
subs = subtitles(id, timeout)
|
||||
if subs:
|
||||
info['subtitles'] = subs
|
||||
return info
|
||||
|
||||
def subtitles(id, timeout=cache_timeout):
|
||||
url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
|
||||
data = read_url(url, timeout=timeout)
|
||||
xml = parseString(data)
|
||||
languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
|
||||
subtitles = {}
|
||||
if languages:
|
||||
info['subtitles'] = {}
|
||||
for language in languages:
|
||||
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
|
||||
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind" % (id, language)
|
||||
data = read_url(url, timeout=timeout)
|
||||
xml = parseString(data)
|
||||
subs = []
|
||||
|
|
@ -128,8 +134,8 @@ def info(id, timeout=cache_timeout):
|
|||
'out': end,
|
||||
'value': ox.decode_html(text),
|
||||
})
|
||||
info['subtitles'][language] = subs
|
||||
return info
|
||||
subtitles[language] = subs
|
||||
return subtitles
|
||||
|
||||
def videos(id, format=''):
|
||||
stream_type = {
|
||||
|
|
@ -154,7 +160,7 @@ def videos(id, format=''):
|
|||
return streams
|
||||
|
||||
def playlist(url):
|
||||
data = read_url(url)
|
||||
data = read_url(url).decode('utf-8')
|
||||
items = []
|
||||
for i in list(set(re.compile('<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
|
||||
items.append({
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue