Switch to python3
parent 531041e89a
commit 9ba4b6a91a
5286 changed files with 677347 additions and 576888 deletions
34 Shared/lib/python3.4/site-packages/ox/__init__.py Normal file
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2011
__version__ = '2.1.1'

from . import cache
from . import js
from . import jsonc
from . import net
from . import srt
from . import utils

from .api import *
from .file import *
from .form import *
from .format import *
from .geo import *
from .html import *
#image depends on PIL, not easy enough to install on osx
try:
    from .image import *
except:
    pass
from .location import *
from .movie import *
from .normalize import *
from .oembed import *
from .text import *
#currently broken in python3
try:
    from .torrent import *
except:
    pass
from .fixunicode import *
112 Shared/lib/python3.4/site-packages/ox/api.py Normal file
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2011
from __future__ import with_statement

from six.moves import http_cookiejar as cookielib
import gzip
from six import StringIO
from six.moves import urllib
from types import MethodType

from . import __version__
from .utils import json
from .form import MultiPartForm

__all__ = ['getAPI', 'API']

def getAPI(url, cj=None):
    return API(url, cj)

class API(object):
    __version__ = __version__
    __name__ = 'ox'
    DEBUG = False
    debuglevel = 0

    def __init__(self, url, cj=None):
        if cj:
            self._cj = cj
        else:
            self._cj = cookielib.CookieJar()
        self._opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self._cj),
                                                   urllib.request.HTTPHandler(debuglevel=self.debuglevel))
        self._opener.addheaders = [
            ('User-Agent', '%s/%s' % (self.__name__, self.__version__))
        ]

        self.url = url
        r = self._request('api', {'docs': True})
        self._properties = r['data']['actions']
        self._actions = r['data']['actions'].keys()
        for a in r['data']['actions']:
            self._add_action(a)

    def _add_method(self, method, name):
        if name is None:
            name = method.func_name
        setattr(self, name, MethodType(method, self, type(self)))

    def _add_action(self, action):
        def method(self, *args, **kw):
            if not kw:
                if args:
                    kw = args[0]
                else:
                    kw = None
            return self._request(action, kw)
        if 'doc' in self._properties[action]:
            method.__doc__ = self._properties[action]['doc']
        method.func_name = str(action)
        self._add_method(method, action)

    def _json_request(self, url, form):
        result = {}
        try:
            body = str(form)
            request = urllib.request.Request(str(url))
            request.add_header('Content-type', form.get_content_type())
            request.add_header('Content-Length', str(len(body)))
            request.add_header('Accept-Encoding', 'gzip, deflate')
            request.add_data(body)
            f = self._opener.open(request)
            result = f.read()
            if f.headers.get('content-encoding', None) == 'gzip':
                result = gzip.GzipFile(fileobj=StringIO(result)).read()
            result = result.decode('utf-8')
            return json.loads(result)
        except urllib.error.HTTPError as e:
            if self.DEBUG:
                import webbrowser
                if e.code >= 500:
                    with open('/tmp/error.html', 'w') as f:
                        f.write(e.read())
                    webbrowser.open_new_tab('/tmp/error.html')

            result = e.read()
            try:
                result = result.decode('utf-8')
                result = json.loads(result)
            except:
                result = {'status': {}}
            result['status']['code'] = e.code
            result['status']['text'] = str(e)
            return result
        except:
            if self.DEBUG:
                import webbrowser
                import traceback
                traceback.print_exc()
                if result:
                    with open('/tmp/error.html', 'w') as f:
                        f.write(str(result))
                    webbrowser.open_new_tab('/tmp/error.html')
            raise

    def _request(self, action, data=None):
        form = MultiPartForm()
        form.add_field('action', action)
        if data:
            form.add_field('data', json.dumps(data))
        return self._json_request(self.url, form)
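For orientation, a minimal usage sketch of the client defined above (not part of the commit); the endpoint URL and the 'find' action are hypothetical examples:

    # assumes a pan.do/ra-style endpoint that answers the 'api' introspection action
    import ox.api

    api = ox.api.getAPI('https://example.org/api/')   # hypothetical URL
    print(sorted(api._actions))                       # actions discovered on startup
    # every discovered action becomes a bound method taking one dict argument,
    # e.g. api.find({'query': {}, 'range': [0, 10]})  # only if the server defines 'find'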
333 Shared/lib/python3.4/site-packages/ox/cache.py Normal file
@@ -0,0 +1,333 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2011
from __future__ import with_statement, print_function

import gzip
import zlib
import hashlib
import os
from six import BytesIO
import time
from six.moves import urllib
import sqlite3

from .utils import json
from .file import makedirs

from . import net
from .net import DEFAULT_HEADERS, detect_encoding

cache_timeout = 30*24*60*60  # default is 30 days

COMPRESS_TYPES = (
    'text/html',
    'text/plain',
    'text/xml',
    'application/xhtml+xml',
    'application/x-javascript',
    'application/javascript',
    'application/ecmascript',
    'application/rss+xml'
)

def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    '''
    >>> status('http://google.com')
    200
    >>> status('http://google.com/mysearch')
    404
    '''
    headers = get_headers(url, data, headers)
    return int(headers['status'])

def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    '''
    >>> exists('http://google.com')
    True
    >>> exists('http://google.com/mysearch')
    False
    '''
    s = status(url, data, headers, timeout)
    if s >= 200 and s < 400:
        return True
    return False

def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
    url_headers = store.get(url, data, headers, timeout, "headers")
    if not url_headers:
        url_headers = net.get_headers(url, data, headers)
        store.set(url, data, -1, url_headers)
    return url_headers

class InvalidResult(Exception):
    """Base class for exceptions in this module."""
    def __init__(self, result, headers):
        self.result = result
        self.headers = headers

def _fix_unicode_url(url):
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    return url

def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
    '''
    url     - url to load
    data    - possible post data
    headers - headers to send with request
    timeout - get from cache if cache not older than given seconds, -1 to get from cache
    valid   - function to check if result is ok, its passed result and headers
              if this function fails, InvalidResult will be raised deal with it in your code
    '''
    if net.DEBUG:
        print('ox.cache.read_url', url)
    #FIXME: send last-modified / etag from cache and only update if needed
    #url = _fix_unicode_url(url)
    result = store.get(url, data, headers, timeout)
    url_headers = {}
    if not result:
        try:
            url_headers, result = net.read_url(url, data, headers, return_headers=True)
        except urllib.error.HTTPError as e:
            e.headers['Status'] = "%s" % e.code
            for key in e.headers:
                url_headers[key.lower()] = e.headers[key]
            result = e.read()
            if url_headers.get('content-encoding', None) == 'gzip':
                result = gzip.GzipFile(fileobj=BytesIO(result)).read()
        if not valid or valid(result, url_headers):
            store.set(url, post_data=data, data=result, headers=url_headers)
        else:
            raise InvalidResult(result, url_headers)
    if unicode:
        ctype = url_headers.get('content-type', '').lower()
        if 'charset' in ctype:
            encoding = ctype.split('charset=')[-1]
        else:
            encoding = detect_encoding(result)
        if not encoding:
            encoding = 'latin-1'
        result = result.decode(encoding)
    return result

def save_url(url, filename, overwrite=False):
    if not os.path.exists(filename) or overwrite:
        dirname = os.path.dirname(filename)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        data = read_url(url)
        f = open(filename, 'w')
        f.write(data)
        f.close()

def cache_path():
    return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache'))

class Cache:
    def __init__(self):
        pass

    def get(self, url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
        '''
        if value == 'data' return data of url if its in the cache else None
        if value == 'headers' return headers for url
        '''
        pass

    def set(self, url, post_data, data, headers):
        pass

class SQLiteCache(Cache):
    def __init__(self):
        path = cache_path()
        if not os.path.exists(path):
            os.makedirs(path)
        self.db = os.path.join(path, "cache.sqlite")
        self.create()

    def connect(self):
        self.conn = sqlite3.connect(self.db, timeout=10)
        return self.conn

    def create(self):
        conn = self.connect()
        c = conn.cursor()
        # Create table and indexes
        c.execute('''CREATE TABLE IF NOT EXISTS cache (url_hash varchar(42) unique, domain text, url text,
                        post_data text, headers text, created int, data blob, only_headers int)''')
        c.execute('''CREATE INDEX IF NOT EXISTS cache_domain ON cache (domain)''')
        c.execute('''CREATE INDEX IF NOT EXISTS cache_url ON cache (url)''')
        c.execute('''CREATE INDEX IF NOT EXISTS cache_url_hash ON cache (url_hash)''')

        c.execute('''CREATE TABLE IF NOT EXISTS setting (key varchar(1024) unique, value text)''')
        if int(self.get_setting(c, 'version', 0)) < 1:
            self.set_setting(c, 'version', 1)
            c.execute('''ALTER TABLE cache ADD compressed INT DEFAULT 0''')
        conn.commit()

    def get_setting(self, c, key, default=None):
        c.execute('SELECT value FROM setting WHERE key = ?', (key, ))
        for row in c:
            return row[0]
        return default

    def set_setting(self, c, key, value):
        c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))

    def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
        r = None
        if timeout == 0:
            return r
        if data:
            url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()
        else:
            url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()

        conn = self.connect()
        c = conn.cursor()
        sql = 'SELECT %s, compressed FROM cache WHERE url_hash=?' % value
        if timeout > 0:
            now = time.mktime(time.localtime())
            t = (url_hash, now-timeout)
            sql += ' AND created > ?'
        else:
            t = (url_hash, )
        if value != "headers":
            sql += ' AND only_headers != 1 '
        c.execute(sql, t)
        for row in c:
            r = row[0]
            if value == 'headers':
                r = json.loads(r)
            elif value == 'data':
                if row[1] == 1:
                    r = zlib.decompress(r)
                else:
                    r = str(r)
            break

        c.close()
        conn.close()
        return r

    def set(self, url, post_data, data, headers):
        if post_data:
            url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()
        else:
            url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()

        domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])

        conn = self.connect()
        c = conn.cursor()

        # Insert a row of data
        if not post_data: post_data=""
        only_headers = 0
        if data == -1:
            only_headers = 1
            data = ""
        created = time.mktime(time.localtime())
        content_type = headers.get('content-type', '').split(';')[0].strip()
        if content_type in COMPRESS_TYPES:
            compressed = 1
            data = zlib.compress(data)
        else:
            compressed = 0
        data = sqlite3.Binary(data)

        #fixme: this looks wrong
        try:
            _headers = json.dumps(headers)
        except:
            for h in headers:
                headers[h] = headers[h].decode(detect_encoding(headers[h]))
            _headers = json.dumps(headers)
        t = (url_hash, domain, url, post_data, _headers, created,
             data, only_headers, compressed)
        c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?, ?)""", t)

        # Save (commit) the changes and clean up
        conn.commit()
        c.close()
        conn.close()

class FileCache(Cache):
    def __init__(self):
        f, self.root = cache_path().split(':')

    def files(self, domain, h):
        prefix = os.path.join(self.root, domain, h[:2], h[2:4], h[4:6], h[6:8])
        i = os.path.join(prefix, '%s.json'%h)
        f = os.path.join(prefix, '%s.dat'%h)
        return prefix, i, f

    def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
        r = None
        if timeout == 0:
            return r

        if data:
            url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()
        else:
            url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()

        domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
        prefix, i, f = self.files(domain, url_hash)
        if os.path.exists(i):
            with open(i) as _i:
                try:
                    info = json.load(_i)
                except:
                    return r
            now = time.mktime(time.localtime())
            expired = now-timeout

            if value != 'headers' and info['only_headers']:
                return None
            if timeout < 0 or info['created'] > expired:
                if value == 'headers':
                    r = info['headers']
                else:
                    with open(f) as data:
                        r = data.read()
                    if info['compressed']:
                        r = zlib.decompress(r)
        return r

    def set(self, url, post_data, data, headers):
        if post_data:
            url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()
        else:
            url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()

        domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
        prefix, i, f = self.files(domain, url_hash)
        makedirs(prefix)

        created = time.mktime(time.localtime())
        content_type = headers.get('content-type', '').split(';')[0].strip()

        info = {
            'compressed': content_type in COMPRESS_TYPES,
            'only_headers': data == -1,
            'created': created,
            'headers': headers,
            'url': url,
        }
        if post_data:
            info['post_data'] = post_data
        if not info['only_headers']:
            if info['compressed']:
                data = zlib.compress(data)
            with open(f, 'w') as _f:
                _f.write(data)
        with open(i, 'w') as _i:
            json.dump(info, _i)

if cache_path().startswith('fs:'):
    store = FileCache()
else:
    store = SQLiteCache()
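An illustrative sketch of the caching reader above (not part of the commit); the URL is a placeholder:

    from ox import cache

    page = cache.read_url('http://example.com/', unicode=True)            # served from cache if younger than 30 days
    fresh = cache.read_url('http://example.com/', timeout=0, unicode=True) # timeout=0 skips the cache lookup and refetches
    stale_ok = cache.read_url('http://example.com/', timeout=-1)           # timeout=-1 returns whatever is cached, any age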
0 Shared/lib/python3.4/site-packages/ox/django/__init__.py Normal file
@@ -0,0 +1 @@
from actions import actions
143 Shared/lib/python3.4/site-packages/ox/django/api/actions.py Normal file
@@ -0,0 +1,143 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division, with_statement
import inspect
import sys

from django.conf import settings

from ..shortcuts import render_to_json_response, json_response
from ...utils import json

def autodiscover():
    #register api actions from all installed apps
    from django.utils.importlib import import_module
    from django.utils.module_loading import module_has_submodule
    for app in settings.INSTALLED_APPS:
        if app != 'api':
            mod = import_module(app)
            try:
                import_module('%s.views'%app)
            except:
                if module_has_submodule(mod, 'views'):
                    raise

def trim(docstring):
    if not docstring:
        return ''
    # Convert tabs to spaces (following the normal Python rules)
    # and split into a list of lines:
    lines = docstring.expandtabs().splitlines()
    # Determine minimum indentation (first line doesn't count):
    indent = sys.maxint
    for line in lines[1:]:
        stripped = line.lstrip()
        if stripped:
            indent = min(indent, len(line) - len(stripped))
    # Remove indentation (first line is special):
    trimmed = [lines[0].strip()]
    if indent < sys.maxint:
        for line in lines[1:]:
            trimmed.append(line[indent:].rstrip())
    # Strip off trailing and leading blank lines:
    while trimmed and not trimmed[-1]:
        trimmed.pop()
    while trimmed and not trimmed[0]:
        trimmed.pop(0)
    # Return a single string:
    return '\n'.join(trimmed)


class ApiActions(dict):
    properties = {}
    versions = {}
    def __init__(self):

        def api(request):
            '''
            returns list of all known api actions
            param data {
                docs: bool
            }
            if docs is true, action properties contain docstrings
            return {
                status: {'code': int, 'text': string},
                data: {
                    actions: {
                        'api': {
                            cache: true,
                            doc: 'recursion'
                        },
                        'hello': {
                            cache: true,
                            ..
                        }
                        ...
                    }
                }
            }
            '''
            data = json.loads(request.POST.get('data', '{}'))
            docs = data.get('docs', False)
            code = data.get('code', False)
            version = getattr(request, 'version', None)
            if version:
                _actions = self.versions.get(version, {}).keys()
                _actions = list(set(_actions + self.keys()))
            else:
                _actions = self.keys()
            _actions.sort()
            actions = {}
            for a in _actions:
                actions[a] = self.properties[a]
                if docs:
                    actions[a]['doc'] = self.doc(a, version)
                if code:
                    actions[a]['code'] = self.code(a, version)
            response = json_response({'actions': actions})
            return render_to_json_response(response)
        self.register(api)

    def doc(self, name, version=None):
        if version:
            f = self.versions[version].get(name, self.get(name))
        else:
            f = self[name]
        return trim(f.__doc__)

    def code(self, name, version=None):
        if version:
            f = self.versions[version].get(name, self.get(name))
        else:
            f = self[name]
        if name != 'api' and hasattr(f, 'func_closure') and f.func_closure:
            fc = filter(lambda c: hasattr(c.cell_contents, '__call__'), f.func_closure)
            f = fc[len(fc)-1].cell_contents
        info = f.func_code.co_filename[len(settings.PROJECT_ROOT)+1:]
        info = u'%s:%s' % (info, f.func_code.co_firstlineno)
        return info, trim(inspect.getsource(f))

    def register(self, method, action=None, cache=True, version=None):
        if not action:
            action = method.func_name
        if version:
            if not version in self.versions:
                self.versions[version] = {}
            self.versions[version][action] = method
        else:
            self[action] = method
        self.properties[action] = {'cache': cache}

    def unregister(self, action):
        if action in self:
            del self[action]

actions = ApiActions()

def error(request):
    '''
    this action is used to test api error codes, it should return a 503 error
    '''
    success = error_is_success
    return render_to_json_response({})
actions.register(error)
13 Shared/lib/python3.4/site-packages/ox/django/api/urls.py Normal file
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

from django.conf.urls import patterns

import views

import actions
actions.autodiscover()

urlpatterns = patterns("",
    (r'^$', views.api),
)
44 Shared/lib/python3.4/site-packages/ox/django/api/views.py Normal file
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division, with_statement

from django.shortcuts import render_to_response
from django.template import RequestContext
from django.conf import settings

from ..shortcuts import render_to_json_response, json_response

from actions import actions

def api(request):
    if request.META['REQUEST_METHOD'] == "OPTIONS":
        response = render_to_json_response({'status': {'code': 200,
                                                       'text': 'use POST'}})
        response['Access-Control-Allow-Origin'] = '*'
        return response
    if not 'action' in request.POST:
        methods = actions.keys()
        api = []
        for f in sorted(methods):
            api.append({'name': f,
                        'doc': actions.doc(f).replace('\n', '<br>\n')})
        context = RequestContext(request, {
            'api': api,
            'settings': settings,
            'sitename': settings.SITENAME
        })
        return render_to_response('api.html', context)
    action = request.POST['action']
    version = getattr(request, 'version', None)
    if version:
        f = actions.versions.get(version, {}).get(action, actions.get(action))
    else:
        f = actions.get(action)
    if f:
        response = f(request)
    else:
        response = render_to_json_response(json_response(status=400,
                                                         text='Unknown action %s' % action))
    response['Access-Control-Allow-Origin'] = '*'
    return response
32 Shared/lib/python3.4/site-packages/ox/django/decorators.py Normal file
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

try:
    from django.contrib.auth.decorators import wraps
except:
    from django.utils.functional import wraps
from shortcuts import render_to_json_response

def login_required_json(function=None):
    """
    Decorator for views that checks that the user is logged in,
    returning a json error if not.
    """

    def _wrapped_view(request, *args, **kwargs):
        if request.user.is_authenticated():
            return function(request, *args, **kwargs)
        return render_to_json_response({'status': {'code': 401, 'text': 'login required'}})
    return wraps(function)(_wrapped_view)

def admin_required_json(function=None):
    """
    Decorator for views that checks that the user is logged in as an admin,
    returning a json error if not.
    """

    def _wrapped_view(request, *args, **kwargs):
        if request.user.is_authenticated() and request.user.get_profile().get_level() == 'admin':
            return function(request, *args, **kwargs)
        return render_to_json_response({'status': {'code': 403, 'text': 'permission denied'}})
    return wraps(function)(_wrapped_view)
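A brief, illustrative sketch of how these decorators would be applied (not part of the commit; the view name and payload are hypothetical):

    from ox.django.decorators import login_required_json
    from ox.django.shortcuts import json_response, render_to_json_response

    @login_required_json
    def save_settings(request):
        # anonymous callers get {'status': {'code': 401, 'text': 'login required'}}
        return render_to_json_response(json_response(text='saved'))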
108 Shared/lib/python3.4/site-packages/ox/django/fields.py Normal file
@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import time
import datetime

from django.db import models
from django.utils import datetime_safe

from ox.utils import json


def to_json(python_object):
    if isinstance(python_object, datetime.datetime):
        if python_object.year < 1900:
            tt = python_object.timetuple()
            value = '%d-%02d-%02dT%02d:%02d%02dZ' % tuple(list(tt)[:6])
        else:
            value = python_object.strftime('%Y-%m-%dT%H:%M:%SZ')
        return {'__class__': 'datetime.datetime',
                '__value__': value}
    if isinstance(python_object, datetime_safe.datetime):
        return {'__class__': 'datetime.datetime',
                '__value__': python_object.strftime('%Y-%m-%dT%H:%M:%SZ')}
    if isinstance(python_object, time.struct_time):
        return {'__class__': 'time.asctime',
                '__value__': time.asctime(python_object)}
    try:
        if isinstance(python_object, bytes):
            return {'__class__': 'bytes',
                    '__value__': list(python_object)}
    except:
        pass
    raise TypeError(repr(python_object) + ' is not JSON serializable')

def from_json(json_object):
    if '__class__' in json_object:
        if json_object['__class__'] == 'bytes':
            return bytes(json_object['__value__'])
        if json_object['__class__'] == 'datetime_safe.datetime' \
                or json_object['__class__'] == 'datetime.datetime':
            return datetime_safe.datetime.strptime(json_object['__value__'], '%Y-%m-%dT%H:%M:%SZ')
        if json_object['__class__'] == 'time.asctime':
            return time.strptime(json_object['__value__'])
    return json_object

class DictField(models.TextField):
    """DictField is a textfield that contains JSON-serialized dictionaries."""

    # Used so to_python() is called
    __metaclass__ = models.SubfieldBase

    def to_python(self, value):
        """Convert our string value to python after we load it from the DB"""
        if value == None:
            return value
        if isinstance(value, dict):
            return value
        try:
            value = json.loads(value, object_hook=from_json)
        except: #this is required to load fixtures
            value = eval(value)
        assert isinstance(value, dict)
        return value

    def get_db_prep_save(self, value, connection):
        """Convert our JSON object to a string before we save"""
        if value == None:
            return value
        if isinstance(value, basestring):
            value = eval(value)
        assert isinstance(value, dict)
        value = json.dumps(value, default=to_json)
        return super(DictField, self).get_db_prep_save(value, connection=connection)

class TupleField(models.TextField):
    """TupleField is a textfield that contains JSON-serialized tuples."""

    # Used so to_python() is called
    __metaclass__ = models.SubfieldBase

    def to_python(self, value):
        """Convert our string value to JSON after we load it from the DB"""
        if isinstance(value, tuple):
            return value

        try:
            value = json.loads(value, object_hook=from_json)
        except: #this is required to load fixtures
            value = eval(value)
        assert isinstance(value, list)
        return tuple(value)

    def get_db_prep_save(self, value, connection):
        """Convert our JSON object to a string before we save"""
        if isinstance(value, basestring):
            value = eval(value)
        if isinstance(value, list):
            value = tuple(value)
        assert isinstance(value, tuple)
        value = json.dumps(value, default=to_json)
        return super(TupleField, self).get_db_prep_save(value, connection=connection)

try:
    from south.modelsinspector import add_introspection_rules
    add_introspection_rules([], ["^ox.django\.fields\.DictField"])
    add_introspection_rules([], ["^ox.django\.fields\.TupleField"])
except:
    pass
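An illustrative model using these fields (not part of the commit; the model and field names are hypothetical):

    from django.db import models
    from ox.django.fields import DictField

    class Clip(models.Model):
        # stored as JSON text in the database, handed back as a Python dict,
        # with datetimes round-tripped through to_json()/from_json()
        data = DictField(default="{}")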
58 Shared/lib/python3.4/site-packages/ox/django/http.py Normal file
@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import os
import mimetypes
from datetime import datetime, timedelta
from urllib import quote

from django.http import HttpResponse, Http404
from django.conf import settings


def HttpFileResponse(path, content_type=None, filename=None):
    if not os.path.exists(path):
        raise Http404
    if not content_type:
        content_type = mimetypes.guess_type(path)[0]
    if not content_type:
        content_type = 'application/octet-stream'

    if getattr(settings, 'XACCELREDIRECT', False):
        response = HttpResponse()
        response['Content-Length'] = os.stat(path).st_size

        for PREFIX in ('STATIC', 'MEDIA'):
            root = getattr(settings, PREFIX+'_ROOT', '')
            url = getattr(settings, PREFIX+'_URL', '')
            if root and path.startswith(root):
                path = url + path[len(root)+1:]
        if isinstance(path, unicode):
            path = path.encode('utf-8')
        response['X-Accel-Redirect'] = path
        if content_type:
            response['Content-Type'] = content_type
    elif getattr(settings, 'XSENDFILE', False):
        response = HttpResponse()
        if isinstance(path, unicode):
            path = path.encode('utf-8')
        response['X-Sendfile'] = path
        if content_type:
            response['Content-Type'] = content_type
        response['Content-Length'] = os.stat(path).st_size
    else:
        response = HttpResponse(open(path), content_type=content_type)
    if filename:
        if isinstance(filename, unicode):
            filename = filename.encode('utf-8')
        response['Content-Disposition'] = "attachment; filename*=UTF-8''%s" % quote(filename)

    response['Expires'] = datetime.strftime(datetime.utcnow() + timedelta(days=1), "%a, %d-%b-%Y %H:%M:%S GMT")

    def allow_access():
        for key in ('X-Accel-Redirect', 'X-Sendfile'):
            if key in response:
                del response[key]
        response['Access-Control-Allow-Origin'] = '*'
    response.allow_access = allow_access
    return response
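A sketch of using the helper above to hand file delivery off to the web server (not part of the commit; the settings values, view, and path are illustrative):

    # settings.py: XACCELREDIRECT = True, plus STATIC_ROOT / STATIC_URL as usual,
    # so the view emits an X-Accel-Redirect header instead of streaming the file itself
    from ox.django.http import HttpFileResponse

    def download(request):
        return HttpFileResponse('/srv/static/report.pdf', filename='report.pdf')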
15 Shared/lib/python3.4/site-packages/ox/django/middleware.py Normal file
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

from shortcuts import HttpErrorJson, render_to_json_response

class ExceptionMiddleware(object):
    def process_exception(self, request, exception):
        if isinstance(exception, HttpErrorJson):
            return render_to_json_response(exception.response)
        return None

class ChromeFrameMiddleware(object):
    def process_response(self, request, response):
        response['X-UA-Compatible'] = 'chrome=1'
        return response
113 Shared/lib/python3.4/site-packages/ox/django/monitor.py Normal file
@@ -0,0 +1,113 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import os
import sys
import time
import signal
import threading
import atexit
import Queue

_interval = 1.0
_times = {}
_files = []

_running = False
_queue = Queue.Queue()
_lock = threading.Lock()

def _restart(path):
    _queue.put(True)
    prefix = 'monitor (pid=%d):' % os.getpid()
    print >> sys.stderr, '%s Change detected to \'%s\'.' % (prefix, path)
    print >> sys.stderr, '%s Triggering process restart.' % prefix
    os.kill(os.getpid(), signal.SIGINT)

def _modified(path):
    try:
        # If path doesn't denote a file and we were previously
        # tracking it, then it has been removed or the file type
        # has changed so force a restart. If not previously
        # tracking the file then we can ignore it as probably
        # pseudo reference such as when file extracted from a
        # collection of modules contained in a zip file.

        if not os.path.isfile(path):
            return path in _times

        # Check for when file last modified.

        mtime = os.stat(path).st_mtime
        if path not in _times:
            _times[path] = mtime

        # Force restart when modification time has changed, even
        # if time now older, as that could indicate older file
        # has been restored.

        if mtime != _times[path]:
            return True
    except:
        # If any exception occurred, it is likely that the file has
        # been removed just before stat(), so force a restart.

        return True

    return False

def _monitor():
    while 1:
        # Check modification times on all files in sys.modules.

        for module in sys.modules.values():
            if not hasattr(module, '__file__'):
                continue
            path = getattr(module, '__file__')
            if not path:
                continue
            if os.path.splitext(path)[1] in ['.pyc', '.pyo', '.pyd']:
                path = path[:-1]
            if _modified(path):
                return _restart(path)

        # Check modification times on files which have
        # specifically been registered for monitoring.

        for path in _files:
            if _modified(path):
                return _restart(path)

        # Go to sleep for specified interval.

        try:
            return _queue.get(timeout=_interval)
        except:
            pass

_thread = threading.Thread(target=_monitor)
_thread.setDaemon(True)

def _exiting():
    try:
        _queue.put(True)
    except:
        pass
    _thread.join()

atexit.register(_exiting)

def track(path):
    if not path in _files:
        _files.append(path)

def start(interval=1.0):
    global _interval
    if interval < _interval:
        _interval = interval

    global _running
    _lock.acquire()
    if not _running:
        _running = True
        _thread.start()
    _lock.release()
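A sketch of how this monitor would be wired into a development process (not part of the commit; the tracked path is a hypothetical example):

    from ox.django import monitor

    monitor.track('/etc/myapp/settings.ini')  # hypothetical extra file to watch
    monitor.start(interval=1.0)               # background thread polls sys.modules and tracked
                                               # files, sending SIGINT to restart on any change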
73 Shared/lib/python3.4/site-packages/ox/django/query.py Normal file
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

from django.db.models.sql import Query
from django.db.models.sql.compiler import SQLCompiler
from django.db import connections
import django.db.models.query

'''
models.py:
-----------------------------------
from ox.django.query import QuerySet

class Manager(models.Manager):
    def get_query_set(self):
        return QuerySet(self.model)

class Model(models.Model):
    ...
    objects = Manager()
'''

class SQLCompiler(SQLCompiler):

    def get_ordering(self):
        result, group_by = super(SQLCompiler, self).get_ordering()
        if self.query.nulls_last and len(result):
            if self.connection.vendor == 'sqlite':
                _result = []
                for r in result:
                    if r.endswith(' DESC'):
                        _r = r[:-len(' DESC')]
                    elif r.endswith(' ASC'):
                        _r = r[:-len(' ASC')]
                    _result.append(_r + ' IS NULL')
                    _result.append(r)

                result = _result
            else:
                result = map(lambda e: e + ' NULLS LAST', result)
        return result, group_by

class Query(Query):
    nulls_last = False

    def clone(self, *args, **kwargs):
        obj = super(Query, self).clone(*args, **kwargs)
        obj.nulls_last = self.nulls_last
        return obj

    def get_compiler(self, using=None, connection=None):
        if using is None and connection is None:
            raise ValueError("Need either using or connection")
        if using:
            connection = connections[using]
        # Check that the compiler will be able to execute the query
        for alias, aggregate in self.aggregate_select.items():
            connection.ops.check_aggregate_support(aggregate)

        return SQLCompiler(self, connection, using)


class QuerySet(django.db.models.query.QuerySet):

    def __init__(self, model=None, query=None, using=None, **kwargs):
        super(QuerySet, self).__init__(model=model, query=query, using=None, **kwargs)
        self.query = query or Query(self.model)

    def order_by(self, *args, **kwargs):
        nulls_last = kwargs.pop('nulls_last', False)
        obj = super(QuerySet, self).order_by(*args, **kwargs)
        obj.query.nulls_last = nulls_last
        return obj
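Building on the models.py snippet in the docstring above, an illustrative call that exercises the nulls_last extension (not part of the commit; the field name is hypothetical):

    # with Model.objects backed by the custom Manager/QuerySet shown above,
    # rows whose 'published' value is NULL sort after all non-NULL rows
    items = Model.objects.all().order_by('-published', nulls_last=True)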
51 Shared/lib/python3.4/site-packages/ox/django/shortcuts.py Normal file
@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import datetime
from django.utils import datetime_safe
from django.http import HttpResponse, Http404
try:
    import simplejson as json
except ImportError:
    from django.utils import simplejson as json
from django.conf import settings

class HttpErrorJson(Http404):
    def __init__(self, response):
        self.response = response

def json_response(data=None, status=200, text='ok'):
    if not data:
        data = {}
    return {'status': {'code': status, 'text': text}, 'data': data}

def _to_json(python_object):
    if isinstance(python_object, datetime.datetime):
        if python_object.year < 1900:
            tt = python_object.timetuple()
            return '%d-%02d-%02dT%02d:%02d%02dZ' % tuple(list(tt)[:6])
        return python_object.strftime('%Y-%m-%dT%H:%M:%SZ')
    if isinstance(python_object, datetime_safe.datetime):
        return python_object.strftime('%Y-%m-%dT%H:%M:%SZ')
    raise TypeError(u'%s %s is not JSON serializable' % (repr(python_object), type(python_object)))

def render_to_json_response(dictionary, content_type="text/json", status=200):
    indent = None
    if settings.DEBUG:
        content_type = "text/javascript"
        indent = 2
    if getattr(settings, 'JSON_DEBUG', False):
        print json.dumps(dictionary, indent=2, default=_to_json, ensure_ascii=False).encode('utf-8')

    return HttpResponse(json.dumps(dictionary, indent=indent, default=_to_json,
                                   ensure_ascii=False).encode('utf-8'), content_type=content_type, status=status)

def get_object_or_404_json(klass, *args, **kwargs):
    from django.shortcuts import _get_queryset
    queryset = _get_queryset(klass)
    try:
        return queryset.get(*args, **kwargs)
    except queryset.model.DoesNotExist:
        response = {'status': {'code': 404,
                               'text': '%s not found' % queryset.model._meta.object_name}}
        raise HttpErrorJson(response)
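A minimal sketch of a view returning the {'status': ..., 'data': ...} envelope these helpers produce (not part of the commit; the view is hypothetical):

    from ox.django.shortcuts import json_response, render_to_json_response

    def ping(request):
        # responds with {"status": {"code": 200, "text": "ok"}, "data": {"pong": true}}
        return render_to_json_response(json_response(data={'pong': True}))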
48 Shared/lib/python3.4/site-packages/ox/django/utils.py Normal file
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from django.utils.datetime_safe import datetime
from django.http import HttpResponse, Http404
from django.core.servers.basehttp import FileWrapper
from django.conf import settings

import mimetypes
import os

def basic_sendfile(fname, download_name=None):
    if not os.path.exists(fname):
        raise Http404

    wrapper = FileWrapper(open(fname, "r"))

    content_type = mimetypes.guess_type(fname)[0]
    response = HttpResponse(wrapper, content_type=content_type)
    response['Content-Length'] = os.path.getsize(fname)

    if download_name:
        response['Content-Disposition'] = "attachment; filename=%s" % download_name

    return response

def x_sendfile(fname, download_name=None):
    if not os.path.exists(fname):
        raise Http404

    content_type = mimetypes.guess_type(fname)[0]
    response = HttpResponse('', content_type=content_type)
    response['Content-Length'] = os.path.getsize(fname)
    response['X-Sendfile'] = fname

    if download_name:
        response['Content-Disposition'] = "attachment; filename=%s" % download_name

    return response

try:
    __sendfile = getattr(settings, 'SENDFILE', False)
except:
    __sendfile = False
if __sendfile == 'x_sendfile':
    sendfile = x_sendfile
else:
    sendfile = basic_sendfile
67 Shared/lib/python3.4/site-packages/ox/django/views.py Normal file
@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import cookielib
import urllib2
from StringIO import StringIO

from celery.utils import get_full_cls_name
from celery.backends import default_backend

from django.http import HttpResponse
from django.conf import settings

from shortcuts import json_response
import ox


def task_status(request, task_id):
    response = json_response(status=200, text='ok')
    status = default_backend.get_status(task_id)
    res = default_backend.get_result(task_id)
    response['data'] = {
        'id': task_id,
        'status': status,
        'result': res
    }
    if status in default_backend.EXCEPTION_STATES:
        traceback = default_backend.get_traceback(task_id)
        response['data'].update({'result': str(res.args[0]),
                                 'exc': get_full_cls_name(res.__class__),
                                 'traceback': traceback})
    return response

class SessionCookieJar(cookielib.LWPCookieJar):
    def save(self):
        return "#LWP-Cookies-2.0\n" + self.as_lwp_str()

    def load(self, data, ignore_discard=True, ignore_expires=True):
        f = StringIO(data)
        self._really_load(f, 'memory', ignore_discard, ignore_expires)

def api_proxy(request):
    '''
    settings.OXAPI_URL =...
    from ox.django.views import api_proxy
    urlpatterns = patterns('',
        url(r'^api/$', api_proxy)
    '''
    url = settings.OXAPI_URL
    cj = SessionCookieJar()
    if 'cj' in request.session:
        cj.load(request.session['cj'])
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [
        ('User-Agent', request.META.get('HTTP_USER_AGENT'))
    ]
    form = ox.MultiPartForm()
    for key in request.POST:
        form.add_field(key, request.POST[key])
    r = urllib2.Request(url)
    body = str(form)
    r.add_header('Content-type', form.get_content_type())
    r.add_header('Content-length', len(body))
    r.add_data(body)
    f = opener.open(r)
    response = HttpResponse(f.read())
    request.session['cj'] = cj.save()
    return response
9 Shared/lib/python3.4/site-packages/ox/django/widgets.py Normal file
@@ -0,0 +1,9 @@
import django.newforms as forms
from string import Template
from django.utils.safestring import mark_safe

class FirefoggWidget(forms.FileInput):
    def render(self, name, value, attrs=None):
        tpl = Template(u"""<h1>This should be a Firefogg widget for $name, current value: $value</h1>""")
        return mark_safe(tpl.substitute(name=name, value=value))
341 Shared/lib/python3.4/site-packages/ox/file.py Normal file
@@ -0,0 +1,341 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import division, with_statement, print_function
import os
import hashlib
import re
import shutil
import struct
import subprocess
import sqlite3

from .utils import json

__all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs']

EXTENSIONS = {
    'audio': [
        'aac', 'aif', 'aiff',
        'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma'
    ],
    'image': [
        'bmp', 'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp'
    ],
    'subtitle': [
        'idx', 'srt', 'sub'
    ],
    'video': [
        '3gp',
        'avi', 'divx', 'dv', 'flv', 'm2t', 'm4v', 'mkv', 'mov', 'mp4',
        'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'vob', 'webm', 'wmv',
        'mod', 'tod',  # http://en.wikipedia.org/wiki/MOD_and_TOD
        'mxf', 'ts'
    ],
}

def cmd(program):
    local = os.path.expanduser('~/.ox/bin/%s' % program)
    if os.path.exists(local):
        program = local
    return program

def _get_file_cache():
    import ox.cache
    path = ox.cache.cache_path()
    if path.startswith('fs:'):
        path = path[3:]
    return os.path.join(path, 'files.sqlite')

def cache(filename, type='oshash'):
    conn = sqlite3.connect(_get_file_cache(), timeout=10)
    conn.text_factory = str
    conn.row_factory = sqlite3.Row

    if not cache.init:
        c = conn.cursor()
        c.execute('CREATE TABLE IF NOT EXISTS cache (path varchar(1024) unique, oshash varchar(16), sha1 varchar(42), size int, mtime int, info text)')
        c.execute('CREATE INDEX IF NOT EXISTS cache_oshash ON cache (oshash)')
        c.execute('CREATE INDEX IF NOT EXISTS cache_sha1 ON cache (sha1)')
        conn.commit()
        cache.init = True
    c = conn.cursor()
    c.execute('SELECT oshash, sha1, info, size, mtime FROM cache WHERE path = ?', (filename, ))
    stat = os.stat(filename)
    row = None
    h = None
    sha1 = None
    info = ''
    for row in c:
        if stat.st_size == row['size'] and int(stat.st_mtime) == int(row['mtime']):
            value = row[type]
            if value:
                if type == 'info':
                    value = json.loads(value)
                return value
        h = row['oshash']
        sha1 = row['sha1']
        info = row['info']
    if type == 'oshash':
        value = h = oshash(filename, cached=False)
    elif type == 'sha1':
        value = sha1 = sha1sum(filename, cached=False)
    elif type == 'info':
        value = avinfo(filename, cached=False)
        info = json.dumps(value)
    t = (filename, h, sha1, stat.st_size, int(stat.st_mtime), info)
    with conn:
        sql = u'INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?)'
        c.execute(sql, t)
    return value
cache.init = None

def cleanup_cache():
    conn = sqlite3.connect(_get_file_cache(), timeout=10)
    conn.text_factory = str
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
    c.execute('SELECT path FROM cache')
    paths = [r[0] for r in c]
    for path in paths:
        if not os.path.exists(path):
            c.execute('DELETE FROM cache WHERE path = ?', (path, ))
    conn.commit()
    c.execute('VACUUM')
    conn.commit()

def sha1sum(filename, cached=False):
    if cached:
        return cache(filename, 'sha1')
    sha1 = hashlib.sha1()
    with open(filename) as f:
        for chunk in iter(lambda: f.read(128*sha1.block_size), ''):
            sha1.update(chunk)
    return sha1.hexdigest()

'''
os hash - http://trac.opensubtitles.org/projects/opensubtitles/wiki/HashSourceCodes
plus modification for files < 64k, buffer is filled with file data and padded with 0
'''
def oshash(filename, cached=True):
    if cached:
        return cache(filename, 'oshash')
    try:
        longlongformat = 'q'  # long long
        bytesize = struct.calcsize(longlongformat)

        f = open(filename, "rb")

        filesize = os.path.getsize(filename)
        hash = filesize
        if filesize < 65536:
            for x in range(int(filesize/bytesize)):
                buffer = f.read(bytesize)
                (l_value,) = struct.unpack(longlongformat, buffer)
                hash += l_value
                hash = hash & 0xFFFFFFFFFFFFFFFF  # to remain as 64bit number
        else:
            for x in range(int(65536/bytesize)):
                buffer = f.read(bytesize)
                (l_value,) = struct.unpack(longlongformat, buffer)
                hash += l_value
                hash = hash & 0xFFFFFFFFFFFFFFFF  # to remain as 64bit number
            f.seek(max(0, filesize-65536), 0)
            for x in range(int(65536/bytesize)):
                buffer = f.read(bytesize)
                (l_value,) = struct.unpack(longlongformat, buffer)
                hash += l_value
                hash = hash & 0xFFFFFFFFFFFFFFFF
        f.close()
        returnedhash = "%016x" % hash
        return returnedhash
    except(IOError):
        return "IOError"

def avinfo(filename, cached=True):
    if cached:
        return cache(filename, 'info')
    if os.path.getsize(filename):
        ffmpeg2theora = cmd('ffmpeg2theora')
        p = subprocess.Popen([ffmpeg2theora], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        info, error = p.communicate()
        version = info.split('\n')[0].split(' - ')[0].split(' ')[-1]
        if version < '0.27':
            raise EnvironmentError('version of ffmpeg2theora needs to be 0.27 or later, found %s' % version)
        p = subprocess.Popen([ffmpeg2theora, '--info', filename],
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        info, error = p.communicate()
        try:
            info = json.loads(info)
        except:
            #remove metadata, can be broken
            reg = re.compile('"metadata": {.*?},', re.DOTALL)
            info = re.sub(reg, '', info)
            info = json.loads(info)
        if 'video' in info:
            for v in info['video']:
                if not 'display_aspect_ratio' in v and 'width' in v:
                    v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height'])
                    v['pixel_aspect_ratio'] = '1:1'
        if len(info.get('audio', [])) > 1:
            if 'metadata' in info['audio'][0]:
                for stream in info['audio']:
                    language = stream.get('metadata', {}).get('language')
                    if language and language != 'und':
                        stream['language'] = language[0]
            else:
                ffmpeg = cmd('ffmpeg')
                p = subprocess.Popen([ffmpeg, '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                stdout, stderr = p.communicate()
                languages = [re.compile('\((.+?)\):').findall(l) for l in stderr.split('\n') if 'Stream' in l and 'Audio' in l]
                for i, stream in enumerate(info['audio']):
                    language = languages[i]
                    if language and language[0] != 'und':
                        stream['language'] = language[0]
        return info

    return {'path': filename, 'size': 0}

def ffprobe(filename):
    p = subprocess.Popen([
        cmd('ffprobe'),
        '-show_format',
        '-show_streams',
        '-print_format',
        'json',
        '-i', filename

    ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    info, error = p.communicate()
    ffinfo = json.loads(info)

    def fix_value(key, value):
        if key == 'r_frame_rate':
            value = value.replace('/', ':')
        elif key == 'bit_rate':
            value = float(value) / 1000
        elif key == 'duration':
            value = float(value)
        elif key == 'size':
            value = int(value)
        return value

    info = {}
    for key in ('duration', 'size', 'bit_rate'):
        info[{
            'bit_rate': 'bitrate'
        }.get(key, key)] = fix_value(key, ffinfo['format'][key])
    info['audio'] = []
    info['video'] = []
    info['metadata'] = ffinfo['format'].get('tags', {})
    for s in ffinfo['streams']:
        tags = s.pop('tags', {})
        language = None
        for t in tags:
            if t == 'language':
                language = tags[t]
            else:
                info['metadata'][t] = tags[t]
        if s.get('codec_type') in ('audio', 'video'):
            stream = {}
            if language and language != 'und':
                stream['language'] = language
            keys = [
                'codec_name',
                'width',
                'height',
                'bit_rate',
                'index',
                'display_aspect_ratio',
                'sample_rate',
                'channels',
            ]
            if s['codec_type'] == 'video':
                keys += [
                    'sample_aspect_ratio',
                    'r_frame_rate',
                    'pix_fmt',
                ]

            for key in keys:
                if key in s:
                    stream[{
                        'codec_name': 'codec',
                        'bit_rate': 'bitrate',
                        'index': 'id',
                        'r_frame_rate': 'framerate',
                        'sample_rate': 'samplerate',
                        'pix_fmt': 'pixel_format',
                    }.get(key, key)] = fix_value(key, s[key])
            info[s['codec_type']].append(stream)
        else:
            pass
            #print s
    for v in info['video']:
        if not 'display_aspect_ratio' in v and 'width' in v:
            v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height'])
            v['pixel_aspect_ratio'] = '1:1'
    info['oshash'] = oshash(filename)
    info['path'] = os.path.basename(filename)
    return info

def makedirs(path):
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except OSError as e:
            if e.errno != 17:
                raise

def copy_file(source, target, verbose=False):
    if verbose:
        print('copying', source, 'to', target)
    write_path(target)
    shutil.copyfile(source, target)

def read_file(file, verbose=False):
    if verbose:
        print('reading', file)
    f = open(file)
    data = f.read()
    f.close()
    return data

def read_json(file, verbose=False):
    if verbose:
        print('reading', file)
    with open(file) as fd:
        data = json.load(fd)
    return data

def write_file(file, data, verbose=False):
    if verbose:
        print('writing', file)
    write_path(file)
    f = open(file, 'w')
    f.write(data)
    f.close()
    return len(data)

def write_image(file, image, verbose=False):
    if verbose:
        print('writing', file)
    write_path(file)
    image.save(file)

def write_json(file, data, ensure_ascii=True, indent=0, sort_keys=False, verbose=False):
    data = json.dumps(data, ensure_ascii=ensure_ascii, indent=indent, sort_keys=sort_keys)
    write_file(file, data if ensure_ascii else data.encode('utf-8'), verbose=verbose)

def write_link(source, target, verbose=False):
    if verbose:
        print('linking', source, 'to', target)
    write_path(target)
    if os.path.exists(target):
        os.unlink(target)
    os.symlink(source, target)

def write_path(file):
    path = os.path.split(file)[0]
    if path and not os.path.exists(path):
        os.makedirs(path)
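A sketch of the hashing and probing helpers above on a local file (not part of the commit; the path is a placeholder, and avinfo requires ffmpeg2theora 0.27 or later on the PATH or in ~/.ox/bin):

    import ox.file

    h = ox.file.oshash('/path/to/movie.mkv', cached=False)     # 16-character hex OpenSubtitles-style hash
    info = ox.file.avinfo('/path/to/movie.mkv', cached=False)  # dict parsed from `ffmpeg2theora --info`
    print(h, sorted(info.keys()))
    # cached=True routes both calls through the files.sqlite cache keyed on path, size and mtime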
335 Shared/lib/python3.4/site-packages/ox/fixunicode.py Normal file
@@ -0,0 +1,335 @@
# vi:si:et:sw=4:sts=4:ts=4
# -*- coding: utf-8 -*-
# from http://blog.lumino.so/2012/08/20/fix-unicode-mistakes-with-python/
# MIT
from __future__ import print_function

import unicodedata

from six import unichr

__all__ = ['fix_bad_unicode']

def fix_bad_unicode(text):
    """
    Something you will find all over the place, in real-world text, is text
    that's mistakenly encoded as utf-8, decoded in some ugly format like
    latin-1 or even Windows codepage 1252, and encoded as utf-8 again.

    This causes your perfectly good Unicode-aware code to end up with garbage
    text because someone else (or maybe "someone else") made a mistake.

    This function looks for the evidence of that having happened and fixes it.
    It determines whether it should replace nonsense sequences of single-byte
    characters that were really meant to be UTF-8 characters, and if so, turns
    them into the correctly-encoded Unicode character that they were meant to
    represent.

    The input to the function must be Unicode. It's not going to try to
    auto-decode bytes for you -- then it would just create the problems it's
    supposed to fix.

    >>> fix_bad_unicode(u'único')
    'único'

    >>> fix_bad_unicode('This text is fine already :þ')
    'This text is fine already :þ'

    Because these characters often come from Microsoft products, we allow
    for the possibility that we get not just Unicode characters 128-255, but
    also Windows's conflicting idea of what characters 128-160 are.

    >>> fix_bad_unicode('This — should be an em dash')
    'This — should be an em dash'

    We might have to deal with both Windows characters and raw control
    characters at the same time, especially when dealing with characters like
    \x81 that have no mapping in Windows.

    >>> fix_bad_unicode('This text is sad .â\x81”.')
    'This text is sad .⁔.'

    This function even fixes multiple levels of badness:

    >>> wtf = '\xc3\xa0\xc2\xb2\xc2\xa0_\xc3\xa0\xc2\xb2\xc2\xa0'
    >>> fix_bad_unicode(wtf)
    'ಠ_ಠ'

    However, it has safeguards against fixing sequences of letters and
    punctuation that can occur in valid text:

    >>> fix_bad_unicode('not such a fan of Charlotte Brontë…”')
    'not such a fan of Charlotte Brontë…”'

    Cases of genuine ambiguity can sometimes be addressed by finding other
    characters that are not double-encoding, and expecting the encoding to
    be consistent:

    >>> fix_bad_unicode('AHÅ™, the new sofa from IKEA®')
    'AHÅ™, the new sofa from IKEA®'

    Finally, we handle the case where the text is in a single-byte encoding
    that was intended as Windows-1252 all along but read as Latin-1:

    >>> fix_bad_unicode('This text was never Unicode at all\x85')
    'This text was never Unicode at all…'
    """
    if not isinstance(text, str):
        raise TypeError("This isn't even decoded into Unicode yet. "
                        "Decode it first.")
    if len(text) == 0:
        return text

    maxord = max(ord(char) for char in text)
    tried_fixing = []
    if maxord < 128:
        # Hooray! It's ASCII!
        return text
    else:
        attempts = [(text, text_badness(text) + len(text))]
        if maxord < 256:
            tried_fixing = reinterpret_latin1_as_utf8(text)
            tried_fixing2 = reinterpret_latin1_as_windows1252(text)
            attempts.append((tried_fixing, text_cost(tried_fixing)))
            attempts.append((tried_fixing2, text_cost(tried_fixing2)))
        elif all(ord(char) in WINDOWS_1252_CODEPOINTS for char in text):
            tried_fixing = reinterpret_windows1252_as_utf8(text)
            attempts.append((tried_fixing, text_cost(tried_fixing)))
        else:
            # We can't imagine how this would be anything but valid text.
            return text

        # Sort the results by badness
        attempts.sort(key=lambda x: x[1])
        #print attempts
        goodtext = attempts[0][0]
        if goodtext == text:
            return goodtext
        else:
            return fix_bad_unicode(goodtext)


def reinterpret_latin1_as_utf8(wrongtext):
    newbytes = wrongtext.encode('latin-1', 'replace')
    return newbytes.decode('utf-8', 'replace')


def reinterpret_windows1252_as_utf8(wrongtext):
    altered_bytes = []
    for char in wrongtext:
        if ord(char) in WINDOWS_1252_GREMLINS:
|
||||
altered_bytes.append(char.encode('WINDOWS_1252'))
|
||||
else:
|
||||
altered_bytes.append(char.encode('latin-1', 'replace'))
|
||||
return b''.join(altered_bytes).decode('utf-8', 'replace')
|
||||
|
||||
|
||||
def reinterpret_latin1_as_windows1252(wrongtext):
|
||||
"""
|
||||
Maybe this was always meant to be in a single-byte encoding, and it
|
||||
makes the most sense in Windows-1252.
|
||||
"""
|
||||
return wrongtext.encode('latin-1').decode('WINDOWS_1252', 'replace')
|
||||
|
||||
|
||||
def text_badness(text):
|
||||
'''
|
||||
Look for red flags that text is encoded incorrectly:
|
||||
|
||||
Obvious problems:
|
||||
- The replacement character \ufffd, indicating a decoding error
|
||||
- Unassigned or private-use Unicode characters
|
||||
|
||||
Very weird things:
|
||||
- Adjacent letters from two different scripts
|
||||
- Letters in scripts that are very rarely used on computers (and
|
||||
therefore, someone who is using them will probably get Unicode right)
|
||||
- Improbable control characters, such as 0x81
|
||||
|
||||
Moderately weird things:
|
||||
- Improbable single-byte characters, such as ƒ or ¬
|
||||
- Letters in somewhat rare scripts
|
||||
'''
|
||||
assert isinstance(text, str)
|
||||
errors = 0
|
||||
very_weird_things = 0
|
||||
weird_things = 0
|
||||
prev_letter_script = None
|
||||
for pos in range(len(text)):
|
||||
char = text[pos]
|
||||
index = ord(char)
|
||||
if index < 256:
|
||||
# Deal quickly with the first 256 characters.
|
||||
weird_things += SINGLE_BYTE_WEIRDNESS[index]
|
||||
if SINGLE_BYTE_LETTERS[index]:
|
||||
prev_letter_script = 'latin'
|
||||
else:
|
||||
prev_letter_script = None
|
||||
else:
|
||||
category = unicodedata.category(char)
|
||||
if category == 'Co':
|
||||
# Unassigned or private use
|
||||
errors += 1
|
||||
elif index == 0xfffd:
|
||||
# Replacement character
|
||||
errors += 1
|
||||
elif index in WINDOWS_1252_GREMLINS:
|
||||
lowchar = char.encode('WINDOWS_1252').decode('latin-1')
|
||||
weird_things += SINGLE_BYTE_WEIRDNESS[ord(lowchar)] - 0.5
|
||||
|
||||
if category.startswith('L'):
|
||||
# It's a letter. What kind of letter? This is typically found
|
||||
# in the first word of the letter's Unicode name.
|
||||
name = unicodedata.name(char)
|
||||
scriptname = name.split()[0]
|
||||
freq, script = SCRIPT_TABLE.get(scriptname, (0, 'other'))
|
||||
if prev_letter_script:
|
||||
if script != prev_letter_script:
|
||||
very_weird_things += 1
|
||||
if freq == 1:
|
||||
weird_things += 2
|
||||
elif freq == 0:
|
||||
very_weird_things += 1
|
||||
prev_letter_script = script
|
||||
else:
|
||||
prev_letter_script = None
|
||||
|
||||
return 100 * errors + 10 * very_weird_things + weird_things
|
||||
|
||||
|
||||
def text_cost(text):
|
||||
"""
|
||||
Assign a cost function to the length plus weirdness of a text string.
|
||||
"""
|
||||
return text_badness(text) + len(text)
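# Illustrative sketch, not part of the original module (the helper name is
# hypothetical): text_cost() is what fix_bad_unicode() uses to rank candidate
# reinterpretations, so mojibake like 'único' loses to its repaired form.
def _example_cost_ranking():
    mojibake = 'único'
    candidates = [mojibake, reinterpret_latin1_as_utf8(mojibake)]
    return sorted(candidates, key=text_cost)[0]  # -> 'único'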
|
||||
|
||||
#######################################################################
|
||||
# The rest of this file is esoteric info about characters, scripts, and their
|
||||
# frequencies.
|
||||
#
|
||||
# Start with an inventory of "gremlins", which are characters from all over
|
||||
# Unicode that Windows has instead assigned to the control characters
|
||||
# 0x80-0x9F. We might encounter them in their Unicode forms and have to figure
|
||||
# out what they were originally.
|
||||
|
||||
WINDOWS_1252_GREMLINS = [
|
||||
# adapted from http://effbot.org/zone/unicode-gremlins.htm
|
||||
0x0152, # LATIN CAPITAL LIGATURE OE
|
||||
0x0153, # LATIN SMALL LIGATURE OE
|
||||
0x0160, # LATIN CAPITAL LETTER S WITH CARON
|
||||
0x0161, # LATIN SMALL LETTER S WITH CARON
|
||||
0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
0x017E, # LATIN SMALL LETTER Z WITH CARON
|
||||
0x017D, # LATIN CAPITAL LETTER Z WITH CARON
|
||||
0x0192, # LATIN SMALL LETTER F WITH HOOK
|
||||
0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||
0x02DC, # SMALL TILDE
|
||||
0x2013, # EN DASH
|
||||
0x2014, # EM DASH
|
||||
0x201A, # SINGLE LOW-9 QUOTATION MARK
|
||||
0x201C, # LEFT DOUBLE QUOTATION MARK
|
||||
0x201D, # RIGHT DOUBLE QUOTATION MARK
|
||||
0x201E, # DOUBLE LOW-9 QUOTATION MARK
|
||||
0x2018, # LEFT SINGLE QUOTATION MARK
|
||||
0x2019, # RIGHT SINGLE QUOTATION MARK
|
||||
0x2020, # DAGGER
|
||||
0x2021, # DOUBLE DAGGER
|
||||
0x2022, # BULLET
|
||||
0x2026, # HORIZONTAL ELLIPSIS
|
||||
0x2030, # PER MILLE SIGN
|
||||
0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||
0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||
0x20AC, # EURO SIGN
|
||||
0x2122, # TRADE MARK SIGN
|
||||
]
|
||||
|
||||
# a list of Unicode characters that might appear in Windows-1252 text
|
||||
WINDOWS_1252_CODEPOINTS = list(range(256)) + WINDOWS_1252_GREMLINS
|
||||
|
||||
# Rank the characters typically represented by a single byte -- that is, in
|
||||
# Latin-1 or Windows-1252 -- by how weird it would be to see them in running
|
||||
# text.
|
||||
#
|
||||
# 0 = not weird at all
|
||||
# 1 = rare punctuation or rare letter that someone could certainly
|
||||
# have a good reason to use. All Windows-1252 gremlins are at least
|
||||
# weirdness 1.
|
||||
# 2 = things that probably don't appear next to letters or other
|
||||
# symbols, such as math or currency symbols
|
||||
# 3 = obscure symbols that nobody would go out of their way to use
|
||||
# (includes symbols that were replaced in ISO-8859-15)
|
||||
# 4 = why would you use this?
|
||||
# 5 = unprintable control character
|
||||
#
|
||||
# The Portuguese letter "Ã" (0xc3) is marked as weird because it would usually
|
||||
# appear in the middle of a word in actual Portuguese, and meanwhile it
|
||||
# appears in the mis-encodings of many common characters.
|
||||
|
||||
SINGLE_BYTE_WEIRDNESS = (
|
||||
# 0 1 2 3 4 5 6 7 8 9 a b c d e f
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 5, 5, 5, # 0x00
|
||||
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, # 0x10
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x20
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x30
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x40
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x50
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x60
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, # 0x70
|
||||
2, 5, 1, 4, 1, 1, 3, 3, 4, 3, 1, 1, 1, 5, 1, 5, # 0x80
|
||||
5, 1, 1, 1, 1, 3, 1, 1, 4, 1, 1, 1, 1, 5, 1, 1, # 0x90
|
||||
1, 0, 2, 2, 3, 2, 4, 2, 4, 2, 2, 0, 3, 1, 1, 4, # 0xa0
|
||||
2, 2, 3, 3, 4, 3, 3, 2, 4, 4, 4, 0, 3, 3, 3, 0, # 0xb0
|
||||
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xc0
|
||||
1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, # 0xd0
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xe0
|
||||
1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, # 0xf0
|
||||
)
|
||||
|
||||
# Pre-cache the Unicode data saying which of these first 256 characters are
|
||||
# letters. We'll need it often.
|
||||
SINGLE_BYTE_LETTERS = [
|
||||
unicodedata.category(unichr(i)).startswith('L')
|
||||
for i in range(256)
|
||||
]
|
||||
|
||||
# A table telling us how to interpret the first word of a letter's Unicode
|
||||
# name. The number indicates how frequently we expect this script to be used
|
||||
# on computers. Many scripts not included here are assumed to have a frequency
|
||||
# of "0" -- if you're going to write in Linear B using Unicode, you're
|
||||
# probably aware enough of encoding issues to get it right.
|
||||
#
|
||||
# The lowercase name is a general category -- for example, Han characters and
|
||||
# Hiragana characters are very frequently adjacent in Japanese, so they all go
|
||||
# into category 'cjk'. Letters of different categories are assumed not to
|
||||
# appear next to each other often.
|
||||
SCRIPT_TABLE = {
|
||||
'LATIN': (3, 'latin'),
|
||||
'CJK': (2, 'cjk'),
|
||||
'ARABIC': (2, 'arabic'),
|
||||
'CYRILLIC': (2, 'cyrillic'),
|
||||
'GREEK': (2, 'greek'),
|
||||
'HEBREW': (2, 'hebrew'),
|
||||
'KATAKANA': (2, 'cjk'),
|
||||
'HIRAGANA': (2, 'cjk'),
|
||||
'HIRAGANA-KATAKANA': (2, 'cjk'),
|
||||
'HANGUL': (2, 'cjk'),
|
||||
'DEVANAGARI': (2, 'devanagari'),
|
||||
'THAI': (2, 'thai'),
|
||||
'FULLWIDTH': (2, 'cjk'),
|
||||
'MODIFIER': (2, None),
|
||||
'HALFWIDTH': (1, 'cjk'),
|
||||
'BENGALI': (1, 'bengali'),
|
||||
'LAO': (1, 'lao'),
|
||||
'KHMER': (1, 'khmer'),
|
||||
'TELUGU': (1, 'telugu'),
|
||||
'MALAYALAM': (1, 'malayalam'),
|
||||
'SINHALA': (1, 'sinhala'),
|
||||
'TAMIL': (1, 'tamil'),
|
||||
'GEORGIAN': (1, 'georgian'),
|
||||
'ARMENIAN': (1, 'armenian'),
|
||||
'KANNADA': (1, 'kannada'), # mostly used for looks of disapproval
|
||||
'MASCULINE': (1, 'latin'),
|
||||
'FEMININE': (1, 'latin')
|
||||
}
|
||||
|
||||
99
Shared/lib/python3.4/site-packages/ox/form.py
Normal file
@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2014
|
||||
from __future__ import with_statement, print_function
|
||||
|
||||
import itertools
|
||||
import mimetypes
|
||||
import random
|
||||
import sys
|
||||
|
||||
|
||||
__all__ = ['MultiPartForm']
|
||||
|
||||
# from /usr/lib/python3.4/email/generator.py
|
||||
# Helper used by Generator._make_boundary
|
||||
_width = len(repr(sys.maxsize-1))
|
||||
_fmt = '%%0%dd' % _width
|
||||
|
||||
def _make_boundary():
|
||||
# Craft a random boundary.
|
||||
token = random.randrange(sys.maxsize)
|
||||
boundary = ('=' * 15) + (_fmt % token) + '=='
|
||||
return boundary
|
||||
|
||||
class MultiPartForm(object):
|
||||
"""Accumulate the data to be used when posting a form."""
|
||||
|
||||
def __init__(self):
|
||||
self.form_fields = []
|
||||
self.files = []
|
||||
self.boundary = _make_boundary()
|
||||
return
|
||||
|
||||
def get_content_type(self):
|
||||
return 'multipart/form-data; boundary=%s' % self.boundary
|
||||
|
||||
def add_field(self, name, value):
|
||||
"""Add a simple field to the form data."""
|
||||
# Python 3: str is already unicode; only decode bytes input so the
# multipart body can be assembled as text in __str__ below.
if isinstance(name, bytes):
name = name.decode('utf-8')
if isinstance(value, bytes):
value = value.decode('utf-8')
|
||||
self.form_fields.append((name, value))
|
||||
return
|
||||
|
||||
def add_file(self, fieldname, filename, fileHandle, mimetype=None):
|
||||
"""Add a file to be uploaded."""
|
||||
# Python 3: as in add_field, decode bytes input instead of encoding str.
if isinstance(fieldname, bytes):
fieldname = fieldname.decode('utf-8')
if isinstance(filename, bytes):
filename = filename.decode('utf-8')
|
||||
|
||||
if hasattr(fileHandle, 'read'):
|
||||
body = fileHandle.read()
|
||||
else:
|
||||
body = fileHandle
|
||||
if mimetype is None:
|
||||
mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
|
||||
self.files.append((fieldname, filename, mimetype, body))
|
||||
return
|
||||
|
||||
def __str__(self):
|
||||
"""Return a string representing the form data, including attached files."""
|
||||
# Build a list of lists, each containing "lines" of the
|
||||
# request. Each part is separated by a boundary string.
|
||||
# Once the list is built, return a string where each
|
||||
# line is separated by '\r\n'.
|
||||
parts = []
|
||||
part_boundary = '--' + self.boundary
|
||||
|
||||
# Add the form fields
|
||||
parts.extend(
|
||||
[ part_boundary,
|
||||
'Content-Disposition: form-data; name="%s"' % name,
|
||||
'',
|
||||
value,
|
||||
]
|
||||
for name, value in self.form_fields
|
||||
)
|
||||
|
||||
# Add the files to upload
|
||||
parts.extend(
|
||||
[ part_boundary,
|
||||
'Content-Disposition: file; name="%s"; filename="%s"' % \
|
||||
(field_name, filename),
|
||||
'Content-Type: %s' % content_type,
|
||||
'',
|
||||
body,
|
||||
]
|
||||
for field_name, filename, content_type, body in self.files
|
||||
)
|
||||
|
||||
# Flatten the list and add closing boundary marker,
|
||||
# then return CR+LF separated data
|
||||
flattened = list(itertools.chain(*parts))
|
||||
flattened.append('--' + self.boundary + '--')
|
||||
flattened.append('')
|
||||
return '\r\n'.join(flattened)
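# Illustrative sketch, not part of the original module (field and file names
# are made up): str(form) yields the multipart body and get_content_type()
# the matching Content-Type header value.
def _example_multipart_form():
    form = MultiPartForm()
    form.add_field('action', 'upload')
    form.add_file('file', 'notes.txt', 'hello world', mimetype='text/plain')
    return form.get_content_type(), str(form)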
|
||||
|
||||
457
Shared/lib/python3.4/site-packages/ox/format.py
Normal file
@ -0,0 +1,457 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import math
|
||||
import re
|
||||
import string
|
||||
|
||||
|
||||
def toAZ(num):
|
||||
"""
|
||||
Converts an integer to a bijective base 26 string using A-Z
|
||||
|
||||
>>> for i in range(1, 1000): assert fromAZ(toAZ(i)) == i
|
||||
|
||||
>>> toAZ(1)
|
||||
'A'
|
||||
|
||||
>>> toAZ(4461)
|
||||
'FOO'
|
||||
|
||||
>>> toAZ(1234567890)
|
||||
'CYWOQVJ'
|
||||
"""
|
||||
if num < 1: raise ValueError("must supply a positive integer")
|
||||
digits = string.ascii_uppercase
|
||||
az = ''
|
||||
while num != 0:
|
||||
num, r = divmod(num, 26)
|
||||
u, r = divmod(r - 1, 26)
|
||||
num += u
|
||||
az = digits[r] + az
|
||||
return az
|
||||
|
||||
def fromAZ(num):
|
||||
"""
|
||||
Converts a bijective base 26 string to an integer
|
||||
|
||||
>>> fromAZ('A')
|
||||
1
|
||||
>>> fromAZ('AA')
|
||||
27
|
||||
>>> fromAZ('AAA')
|
||||
703
|
||||
>>> fromAZ('FOO')
|
||||
4461
|
||||
"""
|
||||
num = num.replace('-','')
|
||||
digits = string.ascii_uppercase
|
||||
r = 0
|
||||
for exp, char in enumerate(reversed(num)):
|
||||
r = r + (pow(26, exp) * (digits.index(char) + 1))
|
||||
return r
|
||||
|
||||
def to26(q):
|
||||
"""
|
||||
Converts an integer to base 26
|
||||
|
||||
>>> for i in range(0, 1000): assert from26(to26(i)) == i
|
||||
|
||||
>>> to26(0)
|
||||
'A'
|
||||
|
||||
>>> to26(347485647)
|
||||
'BDGKMAP'
|
||||
"""
|
||||
if q < 0: raise ValueError("must supply a positive integer")
|
||||
base26 = string.ascii_uppercase
|
||||
converted = []
|
||||
while q != 0:
|
||||
q, r = divmod(q, 26)
|
||||
l = base26[r]
|
||||
converted.insert(0, l)
|
||||
return "".join(converted) or 'A'
|
||||
|
||||
def from26(q):
|
||||
"""
|
||||
Converts a base 26 string to an integer
|
||||
>>> from26('A')
|
||||
0
|
||||
"""
|
||||
base26 = string.ascii_uppercase
|
||||
q = q.replace('-','')
|
||||
r = 0
|
||||
for i in q:
|
||||
r = r * 26 + base26.index(i.upper())
|
||||
return r
|
||||
|
||||
def to32(q):
|
||||
"""
|
||||
Converts an integer to base 32
|
||||
We exclude 4 of the 26 letters: I L O U.
|
||||
http://www.crockford.com/wrmg/base32.html
|
||||
|
||||
>>> for i in range(0, 1000): assert from32(to32(i)) == i
|
||||
|
||||
>>> to32(0)
|
||||
'0'
|
||||
|
||||
>>> to32(347485647)
|
||||
'ABCDEF'
|
||||
|
||||
>>> to32(555306645)
|
||||
'GHJKMN'
|
||||
|
||||
>>> to32(800197332334559)
|
||||
'PQRSTVWXYZ'
|
||||
|
||||
>>> to32(32)
|
||||
'10'
|
||||
|
||||
>>> to32(119292)
|
||||
'3MFW'
|
||||
|
||||
>>> to32(939387374)
|
||||
'VZVTFE'
|
||||
|
||||
>>> to32(-1)
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: must supply a positive integer
|
||||
"""
|
||||
|
||||
if q < 0: raise ValueError("must supply a positive integer")
|
||||
letters = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
|
||||
converted = []
|
||||
while q != 0:
|
||||
q, r = divmod(q, 32)
|
||||
l = letters[r]
|
||||
converted.insert(0, l)
|
||||
return "".join(converted) or '0'
|
||||
|
||||
def from32(q):
|
||||
"""
|
||||
Converts a base 32 string to an integer
|
||||
We exclude 4 of the 26 letters: I L O U.
|
||||
http://www.crockford.com/wrmg/base32.html
|
||||
|
||||
>>> from32('A')
|
||||
10
|
||||
|
||||
>>> from32('i')
|
||||
1
|
||||
|
||||
>>> from32('Li1l')
|
||||
33825
|
||||
|
||||
>>> from32('10')
|
||||
32
|
||||
"""
|
||||
_32map = {
|
||||
'0': 0,
|
||||
'O': 0,
|
||||
'1': 1,
|
||||
'I': 1,
|
||||
'L': 1,
|
||||
'2': 2,
|
||||
'3': 3,
|
||||
'4': 4,
|
||||
'5': 5,
|
||||
'6': 6,
|
||||
'7': 7,
|
||||
'8': 8,
|
||||
'9': 9,
|
||||
'A': 10,
|
||||
'B': 11,
|
||||
'C': 12,
|
||||
'D': 13,
|
||||
'E': 14,
|
||||
'F': 15,
|
||||
'G': 16,
|
||||
'H': 17,
|
||||
'J': 18,
|
||||
'K': 19,
|
||||
'M': 20,
|
||||
'N': 21,
|
||||
'P': 22,
|
||||
'Q': 23,
|
||||
'R': 24,
|
||||
'S': 25,
|
||||
'T': 26,
|
||||
'V': 27,
|
||||
'W': 28,
|
||||
'X': 29,
|
||||
'Y': 30,
|
||||
'Z': 31,
|
||||
}
|
||||
base32 = ('0123456789' + string.ascii_uppercase)[:32]
|
||||
q = q.replace('-','')
|
||||
q = ''.join([base32[_32map[i.upper()]] for i in q])
|
||||
return int(q, 32)
|
||||
|
||||
def to36(q):
|
||||
"""
|
||||
Converts an integer to base 36 (a useful scheme for human-sayable IDs
|
||||
like 'fuck' (739172), 'shit' (1329077) or 'hitler' (1059538851)).
|
||||
|
||||
>>> to36(35)
|
||||
'z'
|
||||
>>> to36(119292)
|
||||
'2k1o'
|
||||
>>> int(to36(939387374), 36)
|
||||
939387374
|
||||
>>> to36(0)
|
||||
'0'
|
||||
>>> to36(-393)
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: must supply a positive integer
|
||||
"""
|
||||
if q < 0: raise ValueError("must supply a positive integer")
|
||||
letters = "0123456789abcdefghijklmnopqrstuvwxyz"
|
||||
converted = []
|
||||
while q != 0:
|
||||
q, r = divmod(q, 36)
|
||||
converted.insert(0, letters[r])
|
||||
return "".join(converted) or '0'
|
||||
|
||||
def from36(q):
|
||||
return int(q, 36)
|
||||
|
||||
def int_value(strValue, default=u''):
|
||||
"""
|
||||
>>> int_value('abc23')
|
||||
u'23'
|
||||
|
||||
>>> int_value(' abc23')
|
||||
u'23'
|
||||
|
||||
>>> int_value('ab')
|
||||
u''
|
||||
"""
|
||||
try:
|
||||
val = re.compile(r'(\d+)').findall(str(strValue).strip())[0]
|
||||
except:
|
||||
val = default
|
||||
return val
|
||||
|
||||
def float_value(strValue, default=u''):
|
||||
"""
|
||||
>>> float_value('abc23.4')
|
||||
u'23.4'
|
||||
|
||||
>>> float_value(' abc23.4')
|
||||
u'23.4'
|
||||
|
||||
>>> float_value('ab')
|
||||
u''
|
||||
"""
|
||||
try:
|
||||
val = re.compile(r'([\d.]+)').findall(str(strValue).strip())[0]
|
||||
except:
|
||||
val = default
|
||||
return val
|
||||
|
||||
def format_number(number, longName, shortName):
|
||||
"""
|
||||
Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)
|
||||
|
||||
>>> format_number(123, 'Byte', 'B')
|
||||
'123 Bytes'
|
||||
|
||||
>>> format_number(1234, 'Byte', 'B')
|
||||
'1 KB'
|
||||
|
||||
>>> format_number(1234567, 'Byte', 'B')
|
||||
'1.2 MB'
|
||||
|
||||
>>> format_number(1234567890, 'Byte', 'B')
|
||||
'1.15 GB'
|
||||
|
||||
>>> format_number(1234567890123456789, 'Byte', 'B')
|
||||
'1,096.5166 PB'
|
||||
|
||||
>>> format_number(-1234567890123456789, 'Byte', 'B')
|
||||
'-1,096.5166 PB'
|
||||
|
||||
"""
|
||||
if abs(number) < 1024:
|
||||
return '%s %s%s' % (format_thousands(number), longName, number != 1 and 's' or '')
|
||||
prefix = ['K', 'M', 'G', 'T', 'P']
|
||||
for i in range(5):
|
||||
if abs(number) < math.pow(1024, i + 2) or i == 4:
|
||||
n = number / math.pow(1024, i + 1)
|
||||
return '%s %s%s' % (format_thousands('%.*f' % (i, n)), prefix[i], shortName)
|
||||
|
||||
def format_thousands(number, separator = ','):
|
||||
"""
|
||||
Return the number with separators (1,000,000)
|
||||
|
||||
>>> format_thousands(1)
|
||||
'1'
|
||||
>>> format_thousands(1000)
|
||||
'1,000'
|
||||
>>> format_thousands(1000000)
|
||||
'1,000,000'
|
||||
"""
|
||||
string = str(number).split('.')
|
||||
l = []
|
||||
for i, character in enumerate(reversed(string[0])):
|
||||
if i and (not (i % 3)):
|
||||
l.insert(0, separator)
|
||||
l.insert(0, character)
|
||||
string[0] = ''.join(l)
|
||||
return '.'.join(string)
|
||||
|
||||
def format_bits(number):
|
||||
return format_number(number, 'bit', 'b')
|
||||
|
||||
def format_bytes(number):
|
||||
return format_number(number, 'byte', 'B')
|
||||
|
||||
def format_pixels(number):
|
||||
return format_number(number, 'pixel', 'px')
|
||||
|
||||
def format_currency(amount, currency="$"):
|
||||
if amount:
|
||||
temp = "%.2f" % amount
|
||||
profile=re.compile(r"(\d)(\d\d\d[.,])")
|
||||
while 1:
|
||||
temp, count = re.subn(profile,r"\1,\2",temp)
|
||||
if not count:
|
||||
break
|
||||
if temp.startswith('-'):
|
||||
return "-"+ currency + temp[1:-3]
|
||||
return currency + temp[:-3]
|
||||
else:
|
||||
return ""
|
||||
|
||||
def plural(amount, unit, plural='s'):
|
||||
'''
|
||||
>>> plural(1, 'unit')
|
||||
'1 unit'
|
||||
>>> plural(2, 'unit')
|
||||
'2 units'
|
||||
'''
|
||||
if abs(amount) != 1:
|
||||
if plural == 's':
|
||||
unit = unit + plural
|
||||
else: unit = plural
|
||||
return "%s %s" % (format_thousands(amount), unit)
|
||||
|
||||
def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
|
||||
'''
|
||||
verbosity
|
||||
0: D:HH:MM:SS
|
||||
1: Dd Hh Mm Ss
|
||||
2: D days H hours M minutes S seconds
|
||||
years
|
||||
True: 366 days are 1 year 1 day
|
||||
False: 366 days are 366 days
|
||||
hours
|
||||
True: 30 seconds are 00:00:30
|
||||
False: 30 seconds are 00:30
|
||||
milliseconds
|
||||
True: always display milliseconds
|
||||
False: never display milliseconds
|
||||
>>> format_duration(1000 * 60 * 60 * 24 * 366)
|
||||
'1:001:00:00:00.000'
|
||||
>>> format_duration(1000 * 60 * 60 * 24 * 366, years=False)
|
||||
'366:00:00:00.000'
|
||||
>>> format_duration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
|
||||
'1 year 2 seconds 3 milliseconds'
|
||||
>>> format_duration(1000 * 30, hours=False, milliseconds=False)
|
||||
'00:30'
|
||||
'''
|
||||
if not ms and ms != 0:
|
||||
return ''
|
||||
if years:
|
||||
y = int(ms / 31536000000)
|
||||
d = int(ms % 31536000000 / 86400000)
|
||||
else:
|
||||
d = int(ms / 86400000)
|
||||
h = int(ms % 86400000 / 3600000)
|
||||
m = int(ms % 3600000 / 60000)
|
||||
s = int(ms % 60000 / 1000)
|
||||
ms = ms % 1000
|
||||
if verbosity == 0:
|
||||
if years and y:
|
||||
duration = "%d:%03d:%02d:%02d:%02d" % (y, d, h, m, s)
|
||||
elif d:
|
||||
duration = "%d:%02d:%02d:%02d" % (d, h, m, s)
|
||||
elif hours or h:
|
||||
duration = "%02d:%02d:%02d" % (h, m, s)
|
||||
else:
|
||||
duration = "%02d:%02d" % (m, s)
|
||||
if milliseconds:
|
||||
duration += ".%03d" % ms
|
||||
else:
|
||||
if verbosity == 1:
|
||||
durations = ["%sd" % d, "%sh" % h, "%sm" % m, "%ss" % s]
|
||||
if years:
|
||||
durations.insert(0, "%sy" % y)
|
||||
if milliseconds:
|
||||
durations.append("%sms" % ms)
|
||||
else:
|
||||
durations = [plural(d, 'day'), plural(h,'hour'),
|
||||
plural(m, 'minute'), plural(s, 'second')]
|
||||
if years:
|
||||
durations.insert(0, plural(y, 'year'))
|
||||
if milliseconds:
|
||||
durations.append(plural(ms, 'millisecond'))
|
||||
durations = filter(lambda x: not x.startswith('0'), durations)
|
||||
duration = ' '.join(durations)
|
||||
return duration
|
||||
|
||||
def ms2runtime(ms, shortenLong=False):
|
||||
# deprecated - use format_duration
|
||||
'''
|
||||
>>> ms2runtime(5000)
|
||||
'5 seconds'
|
||||
>>> ms2runtime(500000)
|
||||
'8 minutes 20 seconds'
|
||||
>>> ms2runtime(50000000)
|
||||
'13 hours 53 minutes 20 seconds'
|
||||
>>> ms2runtime(50000000-20000)
|
||||
'13 hours 53 minutes'
|
||||
'''
|
||||
if shortenLong and ms > 1000 * 60 * 60 * 24 * 464:
|
||||
return format_duration(ms, verbosity=1, milliseconds=False)
|
||||
return format_duration(ms, verbosity=2, milliseconds=False)
|
||||
|
||||
def ms2playtime(ms, hours=False):
|
||||
# deprecated - use format_duration
|
||||
'''
|
||||
>>> ms2playtime(5000)
|
||||
'00:05'
|
||||
>>> ms2playtime(500000)
|
||||
'08:20'
|
||||
>>> ms2playtime(50000000)
|
||||
'13:53:20'
|
||||
'''
|
||||
return format_duration(ms, hours=False, years=False, milliseconds=False)
|
||||
|
||||
def ms2time(ms):
|
||||
# deprecated - use format_duration
|
||||
'''
|
||||
>>> ms2time(44592123)
|
||||
'12:23:12.123'
|
||||
'''
|
||||
return format_duration(ms, years=False)
|
||||
|
||||
def time2ms(timeString):
|
||||
'''
|
||||
>>> time2ms('12:23:12.123')
|
||||
44592123
|
||||
'''
|
||||
ms = 0.0
|
||||
p = timeString.split(':')
|
||||
for i in range(len(p)):
|
||||
_p = p[i]
|
||||
if _p.endswith('.'): _p =_p[:-1]
|
||||
ms = ms * 60 + float(_p)
|
||||
return int(ms * 1000)
|
||||
|
||||
def shift_time(offset, timeString):
|
||||
newTime = time2ms(timeString) + offset
|
||||
return ms2time(newTime)
|
||||
|
||||
1958
Shared/lib/python3.4/site-packages/ox/geo.py
Normal file
File diff suppressed because it is too large
405
Shared/lib/python3.4/site-packages/ox/html.py
Normal file
@ -0,0 +1,405 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2008
|
||||
import sys
|
||||
import re
|
||||
import string
|
||||
from six.moves.html_entities import name2codepoint
|
||||
from six import unichr
|
||||
|
||||
|
||||
# Configuration for add_links() function
|
||||
LEADING_PUNCTUATION = ['(', '<', '&lt;']
|
||||
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;', "'", '"']
|
||||
|
||||
# list of possible strings used for bullets in bulleted lists
|
||||
DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
|
||||
|
||||
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
|
||||
word_split_re = re.compile(r'(\s+)')
|
||||
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
|
||||
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
|
||||
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
|
||||
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
|
||||
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
|
||||
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
|
||||
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
|
||||
trailing_empty_content_re = re.compile(r'(?:<p>(?: |\s|<br \/>)*?</p>\s*)+\Z')
|
||||
if sys.version_info[0] == 2:
|
||||
del x # Temporary variable
|
||||
|
||||
def escape(html):
|
||||
'''
|
||||
Returns the given HTML with ampersands, quotes and carets encoded
|
||||
|
||||
>>> escape('html "test" & <brothers>')
|
||||
'html "test" & <brothers>'
|
||||
'''
|
||||
if not isinstance(html, str):
|
||||
html = str(html)
|
||||
return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;')
|
||||
|
||||
def linebreaks(value):
|
||||
'''
|
||||
Converts newlines into <p> and <br />
|
||||
'''
|
||||
value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
|
||||
paras = re.split('\n{2,}', value)
|
||||
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
|
||||
return '\n\n'.join(paras)
|
||||
|
||||
def strip_tags(value):
|
||||
"""
|
||||
Returns the given HTML with all tags stripped
|
||||
|
||||
>>> strip_tags('some <h2>title</h2> <script>asdfasdf</script>')
|
||||
'some title asdfasdf'
|
||||
"""
|
||||
return re.sub(r'<[^>]*?>', '', value)
|
||||
|
||||
stripTags = strip_tags
|
||||
|
||||
def strip_spaces_between_tags(value):
|
||||
"Returns the given HTML with spaces between tags normalized to a single space"
|
||||
return re.sub(r'>\s+<', '> <', value)
|
||||
|
||||
def strip_entities(value):
|
||||
"Returns the given HTML with all entities (&something;) stripped"
|
||||
return re.sub(r'&(?:\w+|#\d);', '', value)
|
||||
|
||||
def fix_ampersands(value):
|
||||
"Returns the given HTML with all unencoded ampersands encoded correctly"
|
||||
return unencoded_ampersands_re.sub('&', value)
|
||||
|
||||
def add_links(text, trim_url_limit=None, nofollow=False):
|
||||
"""
|
||||
Converts any URLs in text into clickable links. Works on http://, https:// and
|
||||
www. links. Links can have trailing punctuation (periods, commas, close-parens)
|
||||
and leading punctuation (opening parens) and it'll still do the right thing.
|
||||
|
||||
If trim_url_limit is not None, the URLs in link text will be limited to
|
||||
trim_url_limit characters.
|
||||
|
||||
If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
|
||||
"""
|
||||
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
|
||||
words = word_split_re.split(text)
|
||||
nofollow_attr = nofollow and ' rel="nofollow"' or ''
|
||||
for i, word in enumerate(words):
|
||||
match = punctuation_re.match(word)
|
||||
if match:
|
||||
lead, middle, trail = match.groups()
|
||||
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
|
||||
len(middle) > 0 and middle[0] in string.ascii_letters + string.digits and \
|
||||
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
|
||||
middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
|
||||
if middle.startswith('http://') or middle.startswith('https://'):
|
||||
middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
|
||||
if '@' in middle and not middle.startswith('www.') and not ':' in middle \
|
||||
and simple_email_re.match(middle):
|
||||
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
|
||||
if lead + middle + trail != word:
|
||||
words[i] = lead + middle + trail
|
||||
return ''.join(words)
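# Illustrative sketch, not part of the original module (the URL and address
# are made up): add_links() wraps recognised URLs and e-mail addresses in
# anchor tags, optionally trimming the link text and adding rel="nofollow".
def _example_add_links():
    text = 'docs at http://example.com/manual, mail to info@example.com'
    return add_links(text, trim_url_limit=20, nofollow=True)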
|
||||
|
||||
urlize = add_links
|
||||
|
||||
def clean_html(text):
|
||||
"""
|
||||
Cleans the given HTML. Specifically, it does the following:
|
||||
* Converts <b> and <i> to <strong> and <em>.
|
||||
* Encodes all ampersands correctly.
|
||||
* Removes all "target" attributes from <a> tags.
|
||||
* Removes extraneous HTML, such as presentational tags that open and
|
||||
immediately close and <br clear="all">.
|
||||
* Converts hard-coded bullets into HTML unordered lists.
|
||||
* Removes stuff like "<p> </p>", but only if it's at the
|
||||
bottom of the text.
|
||||
"""
|
||||
from .text import normalize_newlines
|
||||
text = normalize_newlines(text)
|
||||
text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
|
||||
text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
|
||||
text = fix_ampersands(text)
|
||||
# Remove all target="" attributes from <a> tags.
|
||||
text = link_target_attribute_re.sub('\\1', text)
|
||||
# Trim stupid HTML such as <br clear="all">.
|
||||
text = html_gunk_re.sub('', text)
|
||||
# Convert hard-coded bullets into HTML unordered lists.
|
||||
def replace_p_tags(match):
|
||||
s = match.group().replace('</p>', '</li>')
|
||||
for d in DOTS:
|
||||
s = s.replace('<p>%s' % d, '<li>')
|
||||
return '<ul>\n%s\n</ul>' % s
|
||||
text = hard_coded_bullets_re.sub(replace_p_tags, text)
|
||||
# Remove stuff like "<p> </p>", but only if it's at the bottom of the text.
|
||||
text = trailing_empty_content_re.sub('', text)
|
||||
return text
|
||||
|
||||
# This pattern matches a character entity reference (a decimal numeric
|
||||
# reference, a hexadecimal numeric reference, or a named reference).
|
||||
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
|
||||
|
||||
def decode_html(html):
|
||||
"""
|
||||
>>> decode_html('me & you and $&%')
|
||||
u'me & you and $&%'
|
||||
>>> decode_html('€')
|
||||
u'\u20ac'
|
||||
>>> decode_html('Anniversary of Daoud&#39;s Republic')
|
||||
u"Anniversary of Daoud's Republic"
|
||||
"""
|
||||
if isinstance(html, bytes):
|
||||
html = html.decode('utf-8')
|
||||
uchr = unichr
|
||||
def entitydecode(match, uchr=uchr):
|
||||
entity = match.group(1)
|
||||
if entity == '#x80':
|
||||
return u'€'
|
||||
elif entity.startswith('#x'):
|
||||
return uchr(int(entity[2:], 16))
|
||||
elif entity.startswith('#'):
|
||||
return uchr(int(entity[1:]))
|
||||
elif entity in name2codepoint:
|
||||
return uchr(name2codepoint[entity])
|
||||
elif entity == 'apos':
|
||||
return "'"
|
||||
else:
|
||||
return match.group(0)
|
||||
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
|
||||
|
||||
def highlight(text, query, hlClass="hl"):
|
||||
"""
|
||||
>>> highlight('me & you and $&%', 'and')
|
||||
'me & you <span class="hl">and</span> $&%'
|
||||
"""
|
||||
if query:
|
||||
text = text.replace('<br />', '|')
|
||||
query = re.escape(query).replace('\ ', '.')
|
||||
m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
|
||||
for i in m:
|
||||
text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
|
||||
text = text.replace('|', '<br />')
|
||||
return text
|
||||
|
||||
def escape_html(value):
|
||||
'''
|
||||
>>> escape_html(u'<script> foo')
u'&lt;script&gt; foo'
>>> escape_html(u'&lt;script&gt; foo')
u'&lt;script&gt; foo'
|
||||
'''
|
||||
return escape(decode_html(value))
|
||||
|
||||
def sanitize_html(html, tags=None, global_attributes=[]):
|
||||
'''
|
||||
>>> sanitize_html('http://foo.com, bar')
|
||||
u'<a href="http://foo.com">http://foo.com</a>, bar'
|
||||
>>> sanitize_html('http://foo.com/foobar?foo, bar')
|
||||
u'<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar'
|
||||
>>> sanitize_html('(see: www.foo.com)')
|
||||
u'(see: <a href="http://www.foo.com">www.foo.com</a>)'
|
||||
>>> sanitize_html('foo@bar.com')
|
||||
u'<a href="mailto:foo@bar.com">foo@bar.com</a>'
|
||||
>>> sanitize_html(sanitize_html('foo@bar.com'))
|
||||
u'<a href="mailto:foo@bar.com">foo@bar.com</a>'
|
||||
>>> sanitize_html('<a href="http://foo.com" onmouseover="alert()">foo</a>')
|
||||
u'<a href="http://foo.com">foo</a>'
|
||||
>>> sanitize_html('<a href="javascript:alert()">foo</a>')
|
||||
u'<a href="javascript:alert()">foo'
|
||||
>>> sanitize_html('[http://foo.com foo]')
|
||||
u'<a href="http://foo.com">foo</a>'
|
||||
>>> sanitize_html('<div style="direction: rtl">foo</div>')
|
||||
u'<div style="direction: rtl">foo</div>'
|
||||
>>> sanitize_html('<script>alert()</script>')
|
||||
u'&lt;script&gt;alert()&lt;/script&gt;'
|
||||
>>> sanitize_html("'foo' < 'bar' && \"foo\" > \"bar\"")
|
||||
u'\'foo\' < \'bar\' && "foo" > "bar"'
|
||||
>>> sanitize_html('<b>foo')
|
||||
u'<b>foo</b>'
|
||||
>>> sanitize_html('<b>foo</b></b>')
|
||||
u'<b>foo</b>'
|
||||
>>> sanitize_html('Anniversary of Daoud&#39;s Republic')
|
||||
u"Anniversary of Daoud's Republic"
|
||||
'''
|
||||
if not tags:
|
||||
valid_url = '^((https?:\/\/|\/|mailto:).*?)'
|
||||
tags = [
|
||||
# inline formatting
|
||||
{'name': 'b'},
|
||||
{'name': 'bdi'},
|
||||
{'name': 'code'},
|
||||
{'name': 'em'},
|
||||
{'name': 'i'},
|
||||
{'name': 'q'},
|
||||
{'name': 's'},
|
||||
{'name': 'span'},
|
||||
{'name': 'strong'},
|
||||
{'name': 'sub'},
|
||||
{'name': 'sup'},
|
||||
{'name': 'u'},
|
||||
# block formatting
|
||||
{'name': 'blockquote'},
|
||||
{'name': 'cite'},
|
||||
{
|
||||
'name': 'div',
|
||||
'optional': ['style'],
|
||||
'validation': {
|
||||
'style': '^direction: rtl$'
|
||||
}
|
||||
},
|
||||
{'name': 'h1'},
|
||||
{'name': 'h2'},
|
||||
{'name': 'h3'},
|
||||
{'name': 'h4'},
|
||||
{'name': 'h5'},
|
||||
{'name': 'h6'},
|
||||
{'name': 'p'},
|
||||
{'name': 'pre'},
|
||||
# lists
|
||||
{'name': 'li'},
|
||||
{'name': 'ol'},
|
||||
{'name': 'ul'},
|
||||
# tables
|
||||
{'name': 'table'},
|
||||
{'name': 'tbody'},
|
||||
{'name': 'td'},
|
||||
{'name': 'tfoot'},
|
||||
{'name': 'th'},
|
||||
{'name': 'thead'},
|
||||
{'name': 'tr'},
|
||||
# other
|
||||
{'name': '[]'},
|
||||
{
|
||||
'name': 'a',
|
||||
'required': ['href'],
|
||||
'validation': {
|
||||
'href': valid_url
|
||||
}
|
||||
},
|
||||
{'name': 'br'},
|
||||
{
|
||||
'name': 'iframe',
|
||||
'optional': ['width', 'height'],
|
||||
'required': ['src'],
|
||||
'validation': {
|
||||
'width': '^\d+$',
|
||||
'height': '^\d+$',
|
||||
'src': valid_url
|
||||
}
|
||||
},
|
||||
{
|
||||
'name': 'img',
|
||||
'optional': ['width', 'height'],
|
||||
'required': ['src'],
|
||||
'validation': {
|
||||
'width': '^\d+$',
|
||||
'height': '^\d+$',
|
||||
'src': valid_url
|
||||
},
|
||||
},
|
||||
{'name': 'figure'},
|
||||
{'name': 'figcaption'}
|
||||
]
|
||||
|
||||
tag_re = re.compile('<(/)?([^\ /]+)(.*?)(/)?>')
|
||||
attr_re = re.compile('([^=\ ]+)="([^"]+)"')
|
||||
|
||||
escaped = {}
|
||||
level = 0
|
||||
non_closing_tags = ['img', 'br']
|
||||
required_attributes = {}
|
||||
validation = {}
|
||||
valid_attributes = {}
|
||||
valid_tags = set([tag['name'] for tag in tags if tag['name'] != '[]'])
|
||||
|
||||
for tag in tags:
|
||||
valid_attributes[tag['name']] = tag.get('required', []) \
|
||||
+ tag.get('optional', []) \
|
||||
+ global_attributes
|
||||
required_attributes[tag['name']] = tag.get('required', [])
|
||||
validation[tag['name']] = tag.get('validation', {})
|
||||
|
||||
if '[]' in validation:
|
||||
html = re.sub(
|
||||
re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
|
||||
'<a href="\\1">\\3</a>', html);
|
||||
|
||||
parts = split_tags(html)
|
||||
for i, part in enumerate(parts):
|
||||
is_tag = i % 2
|
||||
if is_tag:
|
||||
t = tag_re.findall(part)
|
||||
if not t:
|
||||
parts[i] = escape_html(decode_html(part))
|
||||
continue
|
||||
closing, name, attributes, end = t[0]
|
||||
closing = closing != ''
|
||||
a = attr_re.findall(attributes)
|
||||
attrs = dict(a)
|
||||
|
||||
if not closing and not name in non_closing_tags:
|
||||
level += 1
|
||||
|
||||
if not attrs and attributes or name not in valid_tags:
|
||||
valid = False
|
||||
else:
|
||||
valid = True
|
||||
for key in set(attrs) - set(valid_attributes[name]):
|
||||
del attrs[key]
|
||||
for key in required_attributes[tag['name']]:
|
||||
if not key in attrs:
|
||||
valid = False
|
||||
|
||||
if valid:
|
||||
for attr in attrs:
|
||||
if attr in validation[name]:
|
||||
if not re.compile(validation[name][attr]).findall(attrs[attr]):
|
||||
valid = False
|
||||
break
|
||||
|
||||
if valid and closing:
|
||||
valid = not escaped.get(level)
|
||||
else:
|
||||
escaped[level] = not valid
|
||||
if closing:
|
||||
level -= 1
|
||||
if valid:
|
||||
parts[i] = '<%s%s%s>' % (
|
||||
('/' if closing else ''),
|
||||
name,
|
||||
(' ' + ' '.join(['%s="%s"' % (key, attrs[key]) for key, value in a if key in attrs])
|
||||
if not closing and attrs else '')
|
||||
)
|
||||
else:
|
||||
parts[i] = escape_html(decode_html(part))
|
||||
else:
|
||||
parts[i] = escape_html(decode_html(part))
|
||||
html = ''.join(parts)
|
||||
html = html.replace('\n\n', '<br/><br/>')
|
||||
html = add_links(html)
|
||||
return sanitize_fragment(html)
|
||||
|
||||
def split_tags(string):
|
||||
tags = []
|
||||
def collect(match):
|
||||
tags.append(match.group(0))
|
||||
return '\0'
|
||||
strings = re.sub('<[^<>]+>', collect, string).split('\0')
|
||||
tags.append('')
|
||||
return [item for sublist in zip(strings, tags) for item in sublist][:-1]
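# Illustrative sketch, not part of the original module (hypothetical input):
# split_tags() alternates text and tag chunks, so the sanitizer above can
# treat the odd-numbered parts as tags.
def _example_split_tags():
    return split_tags('a <b>bold</b> word')
    # -> ['a ', '<b>', 'bold', '</b>', ' word']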
|
||||
|
||||
def sanitize_fragment(html):
|
||||
'''
|
||||
#html5lib reorders arguments, so not usable
|
||||
import html5lib
|
||||
return html5lib.parseFragment(html).toxml().decode('utf-8')
|
||||
'''
|
||||
if not html:
|
||||
return u''
|
||||
import lxml.html
|
||||
body = lxml.html.document_fromstring(html).find('body')
|
||||
html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
|
||||
if html.startswith('<p>') and html.endswith('</p>'):
|
||||
html = html[3:-4]
|
||||
return html
|
||||
246
Shared/lib/python3.4/site-packages/ox/image.py
Normal file
@ -0,0 +1,246 @@
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
|
||||
from __future__ import division
|
||||
from hashlib import sha1
|
||||
# Pillow installs these modules inside the PIL package on Python 3
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
ZONE_INDEX = []
|
||||
for pixel_index in range(64):
|
||||
x, y = pixel_index % 8, int(pixel_index / 8)
|
||||
ZONE_INDEX.append(int(x / 2) + int(y / 4) * 4)
|
||||
|
||||
def drawText(image, position, text, font_file, font_size, color):
|
||||
draw = ImageDraw.Draw(image)
|
||||
font = ImageFont.truetype(font_file, font_size, encoding='unic')
|
||||
draw.text(position, text, fill=color, font=font)
|
||||
return draw.textsize(text, font=font)
|
||||
|
||||
def getHSL(rgb):
|
||||
rgb = [x / 255 for x in rgb]  # list, not a one-shot map iterator, since it is indexed below
|
||||
maximum = max(rgb)
|
||||
minimum = min(rgb)
|
||||
hsl = [0.0, 0.0, 0.0]
|
||||
hsl[2] = (maximum + minimum) / 2
|
||||
if maximum == minimum:
|
||||
hsl[0] = 0.0
|
||||
hsl[1] = 0.0
|
||||
else:
|
||||
if maximum == rgb[0]:
|
||||
hsl[0] = (60 * (rgb[1] - rgb[2]) / (maximum - minimum) + 360) % 360
|
||||
elif maximum == rgb[1]:
|
||||
hsl[0] = 60 * (rgb[2] - rgb[0]) / (maximum - minimum) + 120
|
||||
else:
|
||||
hsl[0] = 60 * (rgb[0] - rgb[1]) / (maximum - minimum) + 240
|
||||
if hsl[2] <= 0.5:
|
||||
hsl[1] = (maximum - minimum) / (2 * hsl[2])
|
||||
else:
|
||||
hsl[1] = (maximum - minimum) / (2 - 2 * hsl[2])
|
||||
return tuple(hsl)
|
||||
|
||||
def getImageHash(image_file, mode):
|
||||
image = Image.open(image_file).convert('RGB').resize((8, 8), Image.ANTIALIAS)
|
||||
image_hash = 0
|
||||
if mode == 'color':
|
||||
# divide the image into 8 zones:
|
||||
# 0 0 1 1 2 2 3 3
|
||||
# 0 0 1 1 2 2 3 3
|
||||
# 0 0 1 1 2 2 3 3
|
||||
# 0 0 1 1 2 2 3 3
|
||||
# 4 4 5 5 6 6 7 7
|
||||
# 4 4 5 5 6 6 7 7
|
||||
# 4 4 5 5 6 6 7 7
|
||||
# 4 4 5 5 6 6 7 7
|
||||
image_data = image.getdata()
|
||||
zone_values = [[] for i in range(8)]
|
||||
for pixel_index, pixel_value in enumerate(image_data):
|
||||
zone_values[ZONE_INDEX[pixel_index]].append(pixel_value)
|
||||
for zone_index, pixel_values in enumerate(zone_values):
|
||||
# get the mean for each color channel
|
||||
mean = [int(round(sum(x) / 8)) for x in zip(*pixel_values)]  # list so it can be indexed below
|
||||
# store the mean color of each zone as an 8-bit value:
|
||||
# RRRGGGBB
|
||||
color_index = sum((
|
||||
int(mean[0] / 32) << 5,
|
||||
int(mean[1] / 32) << 2,
|
||||
int(mean[2] / 64)
|
||||
))
|
||||
image_hash += color_index * pow(2, zone_index * 8)
|
||||
elif mode == 'shape':
|
||||
# pixels brighter than the mean register as 1,
|
||||
# pixels equal to or darker than the mean as 0
|
||||
image_data = image.convert('L').getdata()
|
||||
image_mean = sum(image_data) / 64
|
||||
for pixel_index, pixel_value in enumerate(image_data):
|
||||
if pixel_value > image_mean:
|
||||
image_hash += pow(2, pixel_index)
|
||||
image_hash = hex(image_hash)[2:].upper()
|
||||
if image_hash.endswith('L'):
|
||||
image_hash = image_hash[:-1]
|
||||
image_hash = '0' * (16 - len(image_hash)) + image_hash
|
||||
return image_hash
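# Illustrative sketch, not part of the original module ('frame.jpg' is a
# hypothetical path): the two modes give a coarse colour fingerprint and a
# brightness-shape fingerprint of the same image, each as a 16-digit hex string.
def _example_image_hashes(path='frame.jpg'):
    return getImageHash(path, 'color'), getImageHash(path, 'shape')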
|
||||
|
||||
def getImageHeat(image_file):
|
||||
image = Image.open(image_file).convert('RGB').resize((16, 16), Image.ANTIALIAS)
|
||||
pixel = image.load()
|
||||
image_heat = 0
|
||||
for y in range(image.size[1]):
|
||||
for x in range(image.size[0]):
|
||||
pixel_heat = []
|
||||
for y_ in range(max(y - 1, 0), min(y + 2, image.size[1])):
|
||||
for x_ in range(max(x - 1, 0), min(x + 2, image.size[0])):
|
||||
if x != x_ or y != y_:
|
||||
for c in range(3):
|
||||
pixel_heat.append(abs(pixel[x, y][c] - pixel[x_, y_][c]))
|
||||
image_heat += sum(pixel_heat) / len(pixel_heat)
|
||||
return image_heat / 256
|
||||
|
||||
def getImageHSL(image_file):
|
||||
image = Image.open(image_file).convert('RGB').resize((1, 1), Image.ANTIALIAS)
|
||||
return getHSL(image.getpixel((0, 0)))
|
||||
|
||||
def getRGB(hsl):
|
||||
hsl = list(hsl)
|
||||
hsl[0] /= 360
|
||||
rgb = [0, 0, 0]
|
||||
if hsl[1] == 0:
|
||||
rgb = [hsl[2], hsl[2], hsl[2]]
|
||||
else:
|
||||
if hsl[2] < 1/2:
|
||||
v2 = hsl[2] * (1 + hsl[1])
|
||||
else:
|
||||
v2 = hsl[1] + hsl[2] - (hsl[1] * hsl[2])
|
||||
v1 = 2 * hsl[2] - v2
|
||||
for i in range(3):
|
||||
v3 = hsl[0] + (1 - i) * 1/3;
|
||||
if v3 < 0:
|
||||
v3 += 1
|
||||
elif v3 > 1:
|
||||
v3 -= 1
|
||||
if v3 < 1/6:
|
||||
rgb[i] = v1 + ((v2 - v1) * 6 * v3)
|
||||
elif v3 < 1/2:
|
||||
rgb[i] = v2
|
||||
elif v3 < 2/3:
|
||||
rgb[i] = v1 + ((v2 - v1) * 6 * (2/3 - v3))
|
||||
else:
|
||||
rgb[i] = v1
|
||||
return tuple(map(lambda x: int(x * 255), rgb))
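# Illustrative sketch, not part of the original module (hypothetical helper):
# getHSL() and getRGB() round-trip a fully saturated colour.
def _example_hsl_roundtrip():
    return getRGB(getHSL((255, 0, 0)))  # -> (255, 0, 0)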
|
||||
|
||||
def getTextSize(image, text, font_file, font_size):
|
||||
draw = ImageDraw.Draw(image)
|
||||
font = ImageFont.truetype(font_file, font_size, encoding='unic')
|
||||
return draw.textsize(text, font=font)
|
||||
|
||||
def wrapText(text, max_width, max_lines, font_file, font_size):
|
||||
# wraps text to max_width and max_lines
|
||||
def get_min_width():
|
||||
# returns the width of the longest non-hyphenated word
|
||||
min_width = 0
|
||||
for word in words:
|
||||
width = get_width(word)
|
||||
if width <= max_width and width > min_width:
|
||||
min_width = width
|
||||
return min_width
|
||||
def get_width(string):
|
||||
return draw.textsize(string, font=font)[0]
|
||||
image = Image.new('RGB', (1, 1))
|
||||
draw = ImageDraw.Draw(image)
|
||||
font = ImageFont.truetype(font_file, font_size, encoding='unic')
|
||||
ellipsis = '…'  # already a unicode str on Python 3
|
||||
separators = ['-', '+', '/', ':']
|
||||
if get_width(text) <= max_width:
|
||||
# text fits in one line
|
||||
lines = [text]
|
||||
else:
|
||||
lines = ['']
|
||||
words = []
|
||||
spaces = []
|
||||
test_words = text.split(' ')
|
||||
for word in test_words:
|
||||
if get_width(word) <= max_width:
|
||||
# word fits in one line
|
||||
words.append(word)
|
||||
spaces.append(' ')
|
||||
else:
|
||||
# word does not fit in one line
|
||||
position = 0
|
||||
test_word = word
|
||||
for separator in separators:
|
||||
test_word = test_word.replace(separator, ' ')
|
||||
parts = test_word.split(' ')
|
||||
for i, part in enumerate(parts):
|
||||
words.append(part)
|
||||
if i < len(parts) - 1:
|
||||
position += len(part) + 1
|
||||
spaces.append(word[position - 1])
|
||||
else:
|
||||
spaces.append(' ')
|
||||
if max_lines:
|
||||
# test if the same number of lines can be achieved with shorter
|
||||
# lines, without hyphenating words that are not yet hyphenated
|
||||
best_lines = len(wrapText(text, max_width, 0, font_file, font_size))
|
||||
test_lines = best_lines
|
||||
min_width = get_min_width()
|
||||
while test_lines == best_lines and max_width >= min_width:
|
||||
max_width -= 1
|
||||
test_lines = len(wrapText(text, max_width, 0, font_file, font_size))
|
||||
max_width += 1
|
||||
for i, word in enumerate(words):
|
||||
line = len(lines) - 1
|
||||
word_width = get_width(word)
|
||||
if word_width <= max_width:
|
||||
# word fits in one line
|
||||
test = (lines[line] + word + spaces[i]).strip()
|
||||
if get_width(test) <= max_width:
|
||||
# word fits in current line
|
||||
lines[line] = test + (' ' if spaces[i] == ' ' else '')
|
||||
elif max_lines == 0 or line < max_lines - 1:
|
||||
# word fits in next line
|
||||
lines.append(word + spaces[i])
|
||||
else:
|
||||
# word does not fit in last line
|
||||
test = lines[line].strip() + ellipsis
|
||||
if get_width(test) <= max_width:
|
||||
# ellipsis fits in last line
|
||||
lines[line] = test
|
||||
else:
|
||||
# ellipsis does not fit in last line
|
||||
test_words = lines[line].split(' ')
|
||||
while get_width(test) > max_width:
|
||||
test_words.pop()
|
||||
test = ' '.join(test_words) + ellipsis
|
||||
if test == ellipsis:
|
||||
# ellipsis does not fit after first word of last line
|
||||
test = lines[line][:-1] + ellipsis
|
||||
while get_width(test) > max_width:
|
||||
test = test[:-2] + ellipsis
|
||||
lines[line] = test
|
||||
break
|
||||
else:
|
||||
# word does not fit in one line
|
||||
chars = list(word)
|
||||
for char in chars:
|
||||
line = len(lines) - 1
|
||||
test = (lines[line] + char + '-').strip()
|
||||
if get_width(test) <= max_width:
|
||||
# char fits in current line
|
||||
lines[line] = test[:-1]
|
||||
elif max_lines == 0 or line < max_lines - 1:
|
||||
# char fits in next line
|
||||
if test[-3] == ' ':
|
||||
lines[line] = test[:-3]
|
||||
else:
|
||||
lines[line] = test[:-2] + '-'
|
||||
lines.append(char)
|
||||
else:
|
||||
# char does not fit in last line
|
||||
test = lines[line] + char + ellipsis
|
||||
while get_width(test) > max_width:
|
||||
test = test[:-2] + ellipsis
|
||||
lines[line] = test
|
||||
lines[line] += ' '
|
||||
return lines
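# Illustrative sketch, not part of the original module (the font path is
# hypothetical): wrapText() measures lines with a TrueType font and returns at
# most max_lines lines, hyphenating or ellipsizing words that do not fit.
def _example_wrap_text(font_file='/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf'):
    return wrapText('A fairly long caption that will not fit on one line',
                    max_width=120, max_lines=3, font_file=font_file, font_size=12)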
|
||||
246
Shared/lib/python3.4/site-packages/ox/iso.py
Normal file
@ -0,0 +1,246 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2008
|
||||
|
||||
_iso639_languages = [
|
||||
("Unknown", "", "", "und"),
|
||||
("Afar", "", "aa", "aar"),
|
||||
("Abkhazian", "", "ab", "abk"),
|
||||
("Afrikaans", "", "af", "afr"),
|
||||
("Akan", "", "ak", "aka"),
|
||||
("Albanian", "", "sq", "sqi"),
|
||||
("Amharic", "", "am", "amh"),
|
||||
("Arabic", "", "ar", "ara"),
|
||||
("Aragonese", "", "an", "arg"),
|
||||
("Armenian", "", "hy", "hye"),
|
||||
("Assamese", "", "as", "asm"),
|
||||
("Avaric", "", "av", "ava"),
|
||||
("Avestan", "", "ae", "ave"),
|
||||
("Aymara", "", "ay", "aym"),
|
||||
("Azerbaijani", "", "az", "aze"),
|
||||
("Bashkir", "", "ba", "bak"),
|
||||
("Bambara", "", "bm", "bam"),
|
||||
("Basque", "", "eu", "eus"),
|
||||
("Belarusian", "", "be", "bel"),
|
||||
("Bengali", "", "bn", "ben"),
|
||||
("Bihari", "", "bh", "bih"),
|
||||
("Bislama", "", "bi", "bis"),
|
||||
("Bosnian", "", "bs", "bos"),
|
||||
("Breton", "", "br", "bre"),
|
||||
("Bulgarian", "", "bg", "bul"),
|
||||
("Burmese", "", "my", "mya"),
|
||||
("Catalan", "", "ca", "cat"),
|
||||
("Chamorro", "", "ch", "cha"),
|
||||
("Chechen", "", "ce", "che"),
|
||||
("Chinese", "", "zh", "zho"),
|
||||
("Church Slavic", "", "cu", "chu"),
|
||||
("Chuvash", "", "cv", "chv"),
|
||||
("Cornish", "", "kw", "cor"),
|
||||
("Corsican", "", "co", "cos"),
|
||||
("Cree", "", "cr", "cre"),
|
||||
("Czech", "", "cs", "ces"),
|
||||
("Danish", "Dansk", "da", "dan"),
|
||||
("Divehi", "", "dv", "div"),
|
||||
("Dutch", "Nederlands", "nl", "nld"),
|
||||
("Dzongkha", "", "dz", "dzo"),
|
||||
("English", "English", "en", "eng"),
|
||||
("Esperanto", "", "eo", "epo"),
|
||||
("Estonian", "", "et", "est"),
|
||||
("Ewe", "", "ee", "ewe"),
|
||||
("Faroese", "", "fo", "fao"),
|
||||
("Fijian", "", "fj", "fij"),
|
||||
("Finnish", "Suomi", "fi", "fin"),
|
||||
("French", "Francais", "fr", "fra"),
|
||||
("Western Frisian", "", "fy", "fry"),
|
||||
("Fulah", "", "ff", "ful"),
|
||||
("Georgian", "", "ka", "kat"),
|
||||
("German", "Deutsch", "de", "deu"),
|
||||
("Gaelic (Scots)", "", "gd", "gla"),
|
||||
("Irish", "", "ga", "gle"),
|
||||
("Galician", "", "gl", "glg"),
|
||||
("Manx", "", "gv", "glv"),
|
||||
("Greek, Modern", "", "el", "ell"),
|
||||
("Guarani", "", "gn", "grn"),
|
||||
("Gujarati", "", "gu", "guj"),
|
||||
("Haitian", "", "ht", "hat"),
|
||||
("Hausa", "", "ha", "hau"),
|
||||
("Hebrew", "", "he", "heb"),
|
||||
("Herero", "", "hz", "her"),
|
||||
("Hindi", "", "hi", "hin"),
|
||||
("Hiri Motu", "", "ho", "hmo"),
|
||||
("Hungarian", "Magyar", "hu", "hun"),
|
||||
("Igbo", "", "ig", "ibo"),
|
||||
("Icelandic", "Islenska", "is", "isl"),
|
||||
("Ido", "", "io", "ido"),
|
||||
("Sichuan Yi", "", "ii", "iii"),
|
||||
("Inuktitut", "", "iu", "iku"),
|
||||
("Interlingue", "", "ie", "ile"),
|
||||
("Interlingua", "", "ia", "ina"),
|
||||
("Indonesian", "", "id", "ind"),
|
||||
("Inupiaq", "", "ik", "ipk"),
|
||||
("Italian", "Italiano", "it", "ita"),
|
||||
("Javanese", "", "jv", "jav"),
|
||||
("Japanese", "", "ja", "jpn"),
|
||||
("Kalaallisut (Greenlandic)", "", "kl", "kal"),
|
||||
("Kannada", "", "kn", "kan"),
|
||||
("Kashmiri", "", "ks", "kas"),
|
||||
("Kanuri", "", "kr", "kau"),
|
||||
("Kazakh", "", "kk", "kaz"),
|
||||
("Central Khmer", "", "km", "khm"),
|
||||
("Kikuyu", "", "ki", "kik"),
|
||||
("Kinyarwanda", "", "rw", "kin"),
|
||||
("Kirghiz", "", "ky", "kir"),
|
||||
("Komi", "", "kv", "kom"),
|
||||
("Kongo", "", "kg", "kon"),
|
||||
("Korean", "", "ko", "kor"),
|
||||
("Kuanyama", "", "kj", "kua"),
|
||||
("Kurdish", "", "ku", "kur"),
|
||||
("Lao", "", "lo", "lao"),
|
||||
("Latin", "", "la", "lat"),
|
||||
("Latvian", "", "lv", "lav"),
|
||||
("Limburgan", "", "li", "lim"),
|
||||
("Lingala", "", "ln", "lin"),
|
||||
("Lithuanian", "", "lt", "lit"),
|
||||
("Luxembourgish", "", "lb", "ltz"),
|
||||
("Luba-Katanga", "", "lu", "lub"),
|
||||
("Ganda", "", "lg", "lug"),
|
||||
("Macedonian", "", "mk", "mkd"),
|
||||
("Marshallese", "", "mh", "mah"),
|
||||
("Malayalam", "", "ml", "mal"),
|
||||
("Maori", "", "mi", "mri"),
|
||||
("Marathi", "", "mr", "mar"),
|
||||
("Malay", "", "ms", "msa"),
|
||||
("Malagasy", "", "mg", "mlg"),
|
||||
("Maltese", "", "mt", "mlt"),
|
||||
("Moldavian", "", "mo", "mol"),
|
||||
("Mongolian", "", "mn", "mon"),
|
||||
("Nauru", "", "na", "nau"),
|
||||
("Navajo", "", "nv", "nav"),
|
||||
("Ndebele, South", "", "nr", "nbl"),
|
||||
("Ndebele, North", "", "nd", "nde"),
|
||||
("Ndonga", "", "ng", "ndo"),
|
||||
("Nepali", "", "ne", "nep"),
|
||||
("Norwegian Nynorsk", "", "nn", "nno"),
|
||||
("Norwegian Bokmål", "", "nb", "nob"),
|
||||
("Norwegian", "Norsk", "no", "nor"),
|
||||
("Chichewa; Nyanja", "", "ny", "nya"),
|
||||
("Occitan (post 1500); Provençal", "", "oc", "oci"),
|
||||
("Ojibwa", "", "oj", "oji"),
|
||||
("Oriya", "", "or", "ori"),
|
||||
("Oromo", "", "om", "orm"),
|
||||
("Ossetian; Ossetic", "", "os", "oss"),
|
||||
("Panjabi", "", "pa", "pan"),
|
||||
("Persian", "", "fa", "fas"),
|
||||
("Pali", "", "pi", "pli"),
|
||||
("Polish", "", "pl", "pol"),
|
||||
("Portuguese", "Portugues", "pt", "por"),
|
||||
("Pushto", "", "ps", "pus"),
|
||||
("Quechua", "", "qu", "que"),
|
||||
("Romansh", "", "rm", "roh"),
|
||||
("Romanian", "", "ro", "ron"),
|
||||
("Rundi", "", "rn", "run"),
|
||||
("Russian", "", "ru", "rus"),
|
||||
("Sango", "", "sg", "sag"),
|
||||
("Sanskrit", "", "sa", "san"),
|
||||
("Serbian", "", "sr", "srp"),
|
||||
("Croatian", "Hrvatski", "hr", "hrv"),
|
||||
("Sinhala", "", "si", "sin"),
|
||||
("Slovak", "", "sk", "slk"),
|
||||
("Slovenian", "", "sl", "slv"),
|
||||
("Northern Sami", "", "se", "sme"),
|
||||
("Samoan", "", "sm", "smo"),
|
||||
("Shona", "", "sn", "sna"),
|
||||
("Sindhi", "", "sd", "snd"),
|
||||
("Somali", "", "so", "som"),
|
||||
("Sotho, Southern", "", "st", "sot"),
|
||||
("Spanish", "Espanol", "es", "spa"),
|
||||
("Sardinian", "", "sc", "srd"),
|
||||
("Swati", "", "ss", "ssw"),
|
||||
("Sundanese", "", "su", "sun"),
|
||||
("Swahili", "", "sw", "swa"),
|
||||
("Swedish", "Svenska", "sv", "swe"),
|
||||
("Tahitian", "", "ty", "tah"),
|
||||
("Tamil", "", "ta", "tam"),
|
||||
("Tatar", "", "tt", "tat"),
|
||||
("Telugu", "", "te", "tel"),
|
||||
("Tajik", "", "tg", "tgk"),
|
||||
("Tagalog", "", "tl", "tgl"),
|
||||
("Thai", "", "th", "tha"),
|
||||
("Tibetan", "", "bo", "bod"),
|
||||
("Tigrinya", "", "ti", "tir"),
|
||||
("Tonga (Tonga Islands)", "", "to", "ton"),
|
||||
("Tswana", "", "tn", "tsn"),
|
||||
("Tsonga", "", "ts", "tso"),
|
||||
("Turkmen", "", "tk", "tuk"),
|
||||
("Turkish", "", "tr", "tur"),
|
||||
("Twi", "", "tw", "twi"),
|
||||
("Uighur", "", "ug", "uig"),
|
||||
("Ukrainian", "", "uk", "ukr"),
|
||||
("Urdu", "", "ur", "urd"),
|
||||
("Uzbek", "", "uz", "uzb"),
|
||||
("Venda", "", "ve", "ven"),
|
||||
("Vietnamese", "", "vi", "vie"),
|
||||
("Volapük", "", "vo", "vol"),
|
||||
("Welsh", "", "cy", "cym"),
|
||||
("Walloon", "", "wa", "wln"),
|
||||
("Wolof", "", "wo", "wol"),
|
||||
("Xhosa", "", "xh", "xho"),
|
||||
("Yiddish", "", "yi", "yid"),
|
||||
("Yoruba", "", "yo", "yor"),
|
||||
("Zhuang", "", "za", "zha"),
|
||||
("Zulu", "", "zu", "zul"),
|
||||
]
|
||||
|
||||
def codeToLang(code):
|
||||
if code:
|
||||
code = code.lower()
|
||||
if len(code) == 2:
|
||||
for l in _iso639_languages:
|
||||
if l[2] == code:
|
||||
return l[0]
|
||||
elif len(code) == 3:
|
||||
for l in _iso639_languages:
|
||||
if l[3] == code:
|
||||
return l[0]
|
||||
return None
|
||||
|
||||
def langTo3Code(lang):
|
||||
if lang:
|
||||
lang = langEnglishName(lang)
|
||||
if lang:
|
||||
lang=lang.lower()
|
||||
for l in _iso639_languages:
|
||||
if l[0].lower() == lang:
|
||||
return l[3]
|
||||
return None
|
||||
|
||||
def langTo2Code(lang):
|
||||
if lang:
|
||||
lang = langEnglishName(lang)
|
||||
if lang:
|
||||
lang=lang.lower()
|
||||
for l in _iso639_languages:
|
||||
if l[0].lower() == lang:
|
||||
return l[2]
|
||||
return None
|
||||
|
||||
def langCode2To3(code):
|
||||
return langTo3Code(codeToLang(code))
|
||||
|
||||
def langCode3To2(code):
|
||||
return langTo2Code(codeToLang(code))
|
||||
|
||||
def langEnglishName(lang):
|
||||
lang = lang.lower()
|
||||
for l in _iso639_languages:
|
||||
if l[1].lower() == lang or l[0].lower() == lang:
|
||||
return l[0]
|
||||
return None
|
||||
|
||||
def languages2Letter():
|
||||
languages = []
|
||||
for l in _iso639_languages:
|
||||
if l[2]:
|
||||
languages.append(l[2])
|
||||
return languages
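# Usage sketch (illustrative addition, not part of the original module):
# round-trips through the ISO 639 helpers defined above.
def _iso639_examples():
    assert codeToLang('de') == 'German'
    assert langTo3Code('German') == 'deu'
    assert langTo2Code('Suomi') == 'fi'  # native names resolve via langEnglishName()
    assert langCode2To3('fr') == 'fra'
    assert langCode3To2('ron') == 'ro'
    assert 'fi' in languages2Letter()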
|
||||
|
||||
183
Shared/lib/python3.4/site-packages/ox/js.py
Normal file
@@ -0,0 +1,183 @@
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
|
||||
from ox.utils import json
|
||||
|
||||
def minify(source, comment=''):
|
||||
# see https://github.com/douglascrockford/JSMin/blob/master/README
|
||||
def get_next_non_whitespace_token():
|
||||
pass
|
||||
tokens = tokenize(source)
|
||||
length = len(tokens)
|
||||
minified = '/*' + comment + '*/' if comment else ''
|
||||
for i, token in enumerate(tokens):
|
||||
if token['type'] in ['linebreak', 'whitespace']:
|
||||
prevToken = None if i == 0 else tokens[i - 1]
|
||||
next = i + 1
|
||||
while next < length and tokens[next]['type'] in ['comment', 'linebreak', 'whitespace']:
|
||||
next += 1
|
||||
nextToken = None if next == length else tokens[next]
|
||||
if token['type'] == 'linebreak':
|
||||
# replace a linebreak between two tokens that are identifiers or
|
||||
# numbers or strings or unary operators or grouping operators
|
||||
# with a single newline, otherwise remove it
|
||||
if prevToken and nextToken\
|
||||
and (prevToken['type'] in ['identifier', 'number', 'string']\
|
||||
or prevToken['value'] in ['++', '--', ')', ']', '}'])\
|
||||
and (nextToken['type'] in ['identifier', 'number', 'string']\
|
||||
or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']):
|
||||
minified += '\n'
|
||||
elif token['type'] == 'whitespace':
|
||||
# replace whitespace between two tokens that are identifiers or
|
||||
# numbers, or between a token that ends with "+" or "-" and one that
|
||||
# begins with "+" or "-", with a single space, otherwise remove it
|
||||
if prevToken and nextToken\
|
||||
and ((prevToken['type'] in ['identifier', 'number']\
|
||||
and nextToken['type'] in ['identifier', 'number'])
|
||||
or (prevToken['value'] in ['+', '-', '++', '--']
|
||||
and nextToken['value'] in ['+', '-', '++', '--'])):
|
||||
minified += ' '
|
||||
elif token['type'] != 'comment':
|
||||
# remove comments and leave all other tokens untouched
|
||||
minified += token['value']
|
||||
return minified
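# Usage sketch (illustrative addition, not part of the original module):
# comments and insignificant whitespace are dropped, newlines are kept only
# where removing them could change the meaning of the script.
def _minify_example():
    source = 'var x = 1; // count\nvar y = x + 1;'
    return minify(source)  # -> 'var x=1;var y=x+1;'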
|
||||
|
||||
def parse_JSONC(source):
|
||||
return json.loads(minify(source))
|
||||
|
||||
def tokenize(source):
|
||||
# see https://github.com/mozilla/narcissus/blob/master/lib/jslex.js
|
||||
IDENTIFIER = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_'
|
||||
KEYWORD = [
|
||||
'break',
|
||||
'case', 'catch', 'class', 'const', 'continue',
|
||||
'debugger', 'default', 'delete', 'do',
|
||||
'else', 'enum', 'export', 'extends',
|
||||
'finally', 'for', 'function',
|
||||
'if', 'implements', 'import', 'in', 'instanceof', 'interface',
|
||||
'let', 'module',
|
||||
'new',
|
||||
'package', 'private', 'protected', 'public',
|
||||
'return',
|
||||
'super', 'switch', 'static',
|
||||
'this', 'throw', 'try', 'typeof',
|
||||
'var', 'void',
|
||||
'yield',
|
||||
'while', 'with'
|
||||
]
|
||||
LINEBREAK = '\n\r'
|
||||
NUMBER = '01234567890'
|
||||
OPERATOR = [
|
||||
# arithmetic
|
||||
'+', '-', '*', '/', '%', '++', '--',
|
||||
# assignment
|
||||
'=', '+=', '-=', '*=', '/=', '%=',
|
||||
'&=', '|=', '^=', '<<=', '>>=', '>>>=',
|
||||
# bitwise
|
||||
'&', '|', '^', '~', '<<', '>>', '>>>',
|
||||
# comparison
|
||||
'==', '!=', '===', '!==', '>', '>=', '<', '<=',
|
||||
# conditional
|
||||
'?', ':',
|
||||
# grouping
|
||||
'(', ')', '[', ']', '{', '}',
|
||||
# logical
|
||||
'&&', '||', '!',
|
||||
# other
|
||||
'.', ',', ';'
|
||||
]
|
||||
REGEXP = 'abcdefghijklmnopqrstuvwxyz'
|
||||
STRING = '\'"'
|
||||
WHITESPACE = ' \t'
|
||||
def is_regexp():
|
||||
# checks if a forward slash is the beginning of a regexp,
|
||||
# as opposed to the beginning of an operator
|
||||
i = len(tokens) - 1
|
||||
# scan back to the previous significant token,
|
||||
# or to the beginning of the source
|
||||
while i >= 0 and tokens[i]['type'] in ['comment', 'linebreak', 'whitespace']:
|
||||
i -= 1
|
||||
if i == -1:
|
||||
# source begins with forward slash
|
||||
is_regexp = True
|
||||
else:
|
||||
token = tokens[i]
|
||||
is_regexp = (
|
||||
token['type'] == 'identifier' and token['value'] in KEYWORD
|
||||
) or (
|
||||
token['type'] == 'operator' and not token['value'] in ['++', '--', ')', ']', '}']
|
||||
)
|
||||
return is_regexp
|
||||
column = 1
|
||||
cursor = 0
|
||||
length = len(source)
|
||||
tokens = []
|
||||
line = 1
|
||||
while cursor < length:
|
||||
char = source[cursor]
|
||||
start = cursor
|
||||
cursor += 1
|
||||
if char == '/' and cursor < length - 1 and source[cursor] in '/*':
|
||||
type = 'comment'
|
||||
cursor += 1
|
||||
while cursor < length:
|
||||
cursor += 1
|
||||
if source[start + 1] == '/' and source[cursor] == '\n':
|
||||
break
|
||||
elif source[start + 1] == '*' and source[cursor:cursor + 2] == '*/':
|
||||
cursor += 2
|
||||
break
|
||||
elif char in IDENTIFIER:
|
||||
type = 'identifier'
|
||||
while cursor < length and source[cursor] in IDENTIFIER + NUMBER:
|
||||
cursor += 1
|
||||
elif char in LINEBREAK:
|
||||
type = 'linebreak'
|
||||
while cursor < length and source[cursor] in LINEBREAK:
|
||||
cursor += 1
|
||||
elif char in NUMBER:
|
||||
type = 'number'
|
||||
while cursor < length and source[cursor] in NUMBER + '.':
|
||||
cursor += 1
|
||||
elif char == '/' and is_regexp():
|
||||
type = 'regexp'
|
||||
while cursor < length and source[cursor] != '/':
|
||||
cursor += (2 if source[cursor] == '\\' else 1)
|
||||
cursor += 1
|
||||
while cursor < length and source[cursor] in REGEXP:
|
||||
cursor += 1
|
||||
elif char in OPERATOR:
|
||||
type = 'operator'
|
||||
if cursor < length:
|
||||
string = char + source[cursor]
|
||||
while cursor < length and string in OPERATOR:
|
||||
cursor += 1
|
||||
string += source[cursor]
|
||||
elif char in STRING:
|
||||
type = 'string'
|
||||
while cursor < length and source[cursor] != source[start]:
|
||||
cursor += (2 if source[cursor] == '\\' else 1)
|
||||
cursor += 1
|
||||
elif char in WHITESPACE:
|
||||
type = 'whitespace'
|
||||
while cursor < length and source[cursor] in WHITESPACE:
|
||||
cursor += 1
|
||||
value = source[start:cursor]
|
||||
tokens.append({
|
||||
'column': column,
|
||||
'line': line,
|
||||
'type': type,
|
||||
'value': value
|
||||
})
|
||||
if type == 'comment':
|
||||
lines = value.split('\n')
|
||||
column = len(lines[-1])
|
||||
line += len(lines) - 1
|
||||
elif type == 'linebreak':
|
||||
column = 1
line += len(value)
|
||||
else:
|
||||
column += len(value)
|
||||
return tokens
|
||||
21
Shared/lib/python3.4/site-packages/ox/jsonc.py
Normal file
@@ -0,0 +1,21 @@
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import with_statement, print_function
|
||||
|
||||
from .js import minify
|
||||
from .utils import json
|
||||
|
||||
|
||||
def load(f):
|
||||
return loads(f.read())
|
||||
|
||||
def loads(source):
|
||||
try:
|
||||
minified = minify(source)
|
||||
return json.loads(minified)
|
||||
except json.JSONDecodeError as e:
|
||||
s = minified.split('\n')
|
||||
context = s[e.lineno-1][max(0, e.colno-1):e.colno+30]
|
||||
msg = e.msg + ' at ' + context
|
||||
raise json.JSONDecodeError(msg, minified, e.pos)
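# Usage sketch (illustrative addition, not part of the original module):
# JSONC is plain JSON plus JavaScript-style comments, which loads() strips
# via ox.js.minify before handing the result to json.loads.
def _jsonc_example():
    source = '{\n  "size": 1080,  // pixels\n  "fps": 25\n}'
    return loads(source)  # -> {'size': 1080, 'fps': 25}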
|
||||
28
Shared/lib/python3.4/site-packages/ox/location.py
Normal file
@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import math
|
||||
|
||||
|
||||
def center(lat_sw, lng_sw, lat_ne=None, lng_ne=None):
|
||||
if not lat_ne and not lng_ne:
|
||||
return min(lat_sw, lng_sw) + abs(lat_sw-lng_sw)/2
|
||||
else:
|
||||
return (center(lat_sw,lng_sw), center(lat_ne, lng_ne))
|
||||
|
||||
def area(lat_sw, lng_sw, lat_ne, lng_ne):
|
||||
return (lat_ne - lat_sw) * (lng_ne - lng_sw)
|
||||
|
||||
def latlngspan2latlng(lat, lng, latSpan, lngSpan):
|
||||
return dict(
|
||||
lat_sw = lat - latSpan, lng_sw = lng - lngSpan,
|
||||
lat_ne = lat + latSpan, lng_ne = lng + lngSpan
|
||||
)
|
||||
|
||||
def parse_location_string(location_string):
|
||||
l = location_string.split('+')
|
||||
if len(l) == 1:
|
||||
l = location_string.split(';')
|
||||
l = [i.strip() for i in l]
|
||||
l = [i for i in l if i]
|
||||
return l
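# Usage sketch (illustrative addition, not part of the original module):
# coordinates are plain floats in degrees, bounding boxes are given as
# lat_sw, lng_sw, lat_ne, lng_ne.
def _location_examples():
    assert center(10.0, 20.0) == 15.0
    assert area(10.0, 20.0, 12.0, 24.0) == 8.0
    assert parse_location_string('Berlin + Paris') == ['Berlin', 'Paris']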
|
||||
|
||||
486
Shared/lib/python3.4/site-packages/ox/movie.py
Normal file
@@ -0,0 +1,486 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2012
|
||||
|
||||
from __future__ import division
|
||||
|
||||
import hashlib
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
from .normalize import normalize_name
|
||||
from .text import get_sort_name, find_re
|
||||
from .file import EXTENSIONS
|
||||
|
||||
__all__ = ['parse_movie_path', 'create_movie_path', 'get_oxid']
|
||||
|
||||
LANGUAGES = ['en', 'fr', 'de', 'es', 'it']
|
||||
|
||||
'''
|
||||
Naming scheme:
|
||||
X/[Group, The; Lastname, Firstname/]The Title[ (YEAR[-[YEAR]])]/
|
||||
The Title[ ([SXX][EYY[+ZZ|-ZZ]])[ Episode Title]][.Version][.Part XY[.Part Title]][.en][.fr].xyz
|
||||
'''
|
||||
|
||||
def format_path(data, directory_key='director'):
|
||||
def format_underscores(string):
|
||||
return re.sub('^\.|\.$|:|/|\?|<|>', '_', string)
|
||||
director = data['directorSort'] or ['Unknown Director']
|
||||
title = data['seriesTitle' if data['isEpisode'] else 'title'] or 'Untitled'
|
||||
year = data['seriesYear' if data['isEpisode'] else 'year'] or None
|
||||
parts = list(map(format_underscores, filter(lambda x: x != None, [
|
||||
u'; '.join(director[:10]),
|
||||
u'%s%s' % (title, u' (%s)' % year if year else ''),
|
||||
u'%s%s%s%s%s%s' % (
|
||||
data['title'] or 'Untitled',
|
||||
u'.%s' % data['version'] if data['version'] else '',
|
||||
u'.Part %s' % data['part'] if data['part'] else '',
|
||||
u'.%s' % data['partTitle'] if data['partTitle'] else '',
|
||||
u'.%s' % data['language'] if data['language'] else '',
|
||||
u'.%s' % data['extension'] if data['extension'] else ''
|
||||
)
|
||||
])))
|
||||
if data.get('subdirectory'):
|
||||
parts.insert(-1, data['subdirectory'])
|
||||
return unicodedata.normalize('NFD', u'/'.join(parts))
|
||||
|
||||
def parse_item_files(files):
|
||||
# parses a list of file objects associated with one item (file objects
|
||||
# as returned by parse_path, but extended with 'path' and 'time')
|
||||
# and returns a list of version objects (in case of english-only subtitles,
|
||||
# version[i]['files'][j]['normalizedPath'] will be modified)
|
||||
def get_file_key(file):
|
||||
return '\n'.join([
|
||||
file['version'] or '',
|
||||
file['part'] or '',
|
||||
file['language'] or '',
|
||||
file['extension'] or ''
|
||||
])
|
||||
def get_version_key(file, extension=True):
|
||||
return '%s/%s-part/%s' % (
|
||||
file['version'] or '',
|
||||
'single' if file['part'] == None else 'multi',
|
||||
file['extension'] if extension else ''
|
||||
)
|
||||
# filter out duplicate files (keep shortest path, sorted alphabetically)
|
||||
# since same version+part+language+extension can still differ in part title,
|
||||
# ''/'en' or 'mpg'/'mpeg', or have an unparsed section in their path
|
||||
unique_files = []
|
||||
duplicate_files = []
|
||||
for key in [get_file_key(file) for file in files]:
|
||||
key_files = sorted(
|
||||
sorted([file for file in files if get_file_key(file) == key]),
|
||||
key=lambda x: len(x['path'])
|
||||
)
|
||||
unique_files.append(key_files[0])
|
||||
duplicate_files += key_files[1:]
|
||||
# determine versions ('version.single|multi-part.videoextension')
|
||||
version_files = {}
|
||||
time = {}
|
||||
video_files = [file for file in unique_files if file['type'] == 'video']
|
||||
versions = set([file['version'] for file in video_files])
|
||||
for version in versions:
|
||||
for file in [file for file in video_files if file['version'] == version]:
|
||||
version_key = get_version_key(file)
|
||||
version_files[version_key] = (version_files[version_key] if version_key in version_files else []) + [file]
|
||||
time[version_key] = sorted([time[version_key], file['time']])[-1] if version_key in time else file['time']
|
||||
# determine preferred video extension (newest)
|
||||
extension = {}
|
||||
for key in set(['/'.join(version_key.split('/')[:-1]) + '/' for version_key in version_files]):
|
||||
extensions = set([version_key.split('/')[-1] for version_key in version_files if version_key.startswith(key)])
|
||||
extension[key] = sorted(extensions, key=lambda x: time[key + x])[-1]
|
||||
# associate other (non-video) files
|
||||
other_files = [file for file in unique_files if file['type'] != 'video']
|
||||
versions = set([file['version'] for file in other_files])
|
||||
for version in versions:
|
||||
for file in [file for file in other_files if file['version'] == version]:
|
||||
key = get_version_key(file, extension=False)
|
||||
if key in extension:
|
||||
version_files[key + extension[key]].append(file)
|
||||
else:
|
||||
version_files[key] = (version_files[key] if key in version_files else []) + [file]
|
||||
extension[key] = ''
|
||||
# determine main files (video + srt)
|
||||
full = {}
|
||||
language = {}
|
||||
main_files = {}
|
||||
for version_key in version_files:
|
||||
parts = sorted(list(set([file['part'] for file in version_files[version_key]])))
|
||||
# determine if all parts have one video file
|
||||
video_files = [file for file in version_files[version_key] if file['type'] == 'video']
|
||||
full[version_key] = len(video_files) == len(parts)
|
||||
main_files[version_key] = video_files if full[version_key] else []
|
||||
# determine preferred subtitle language
|
||||
language[version_key] = None
|
||||
subtitle_files = [file for file in version_files[version_key] if file['extension'] == 'srt']
|
||||
for subtitle_language in sorted(
|
||||
list(set([file['language'] for file in subtitle_files])),
|
||||
key=lambda x: LANGUAGES.index(x) if x in LANGUAGES else x
|
||||
):
|
||||
language_files = [file for file in subtitle_files if file['language'] == subtitle_language]
|
||||
if len(subtitle_files) == len(parts):
|
||||
language[version_key] = subtitle_language
|
||||
main_files[version_key] += language_files
|
||||
break
|
||||
# determine main version (best srt language, then video time)
|
||||
main_version = None
|
||||
full_version_keys = sorted(
|
||||
[version_key for version_key in version_files if full[version_key]],
|
||||
key=lambda x: time[x],
|
||||
reverse=True
|
||||
)
|
||||
if full_version_keys:
|
||||
language_version_keys = sorted(
|
||||
[version_key for version_key in full_version_keys if language[version_key]],
|
||||
key=lambda x: LANGUAGES.index(language[x]) if language[x] in LANGUAGES else language[x]
|
||||
)
|
||||
main_version = language_version_keys[0] if language_version_keys else full_version_keys[0]
|
||||
# add duplicate files
|
||||
for file in duplicate_files:
|
||||
key = get_version_key(file, extension=False)
|
||||
version_key = '%s%s' % (key, extension[key] if key in extension else '')
|
||||
version_files[version_key] = (version_files[version_key] if version_key in version_files else []) + [file]
|
||||
# remove unneeded '.en'
|
||||
for version_key in version_files:
|
||||
for extension in EXTENSIONS['subtitle']:
|
||||
subtitle_files = [file for file in version_files[version_key] if file['extension'] == extension]
|
||||
subtitle_languages = list(set([file['language'] for file in subtitle_files]))
|
||||
if len(subtitle_languages) == 1 and subtitle_languages[0] == LANGUAGES[0]:
|
||||
for subtitle_file in subtitle_files:
|
||||
subtitle_file['normalizedPath'] = format_path(dict(subtitle_file, **{'language': None}))
|
||||
# return data
|
||||
data = []
|
||||
for version_key in version_files:
|
||||
data.append({
|
||||
'files': sorted(
|
||||
[dict(file, isMainFile=file in main_files[version_key]) for file in version_files[version_key]],
|
||||
key=lambda x: x['path']
|
||||
),
|
||||
'isFullVersion': full[version_key],
|
||||
'isMainVersion': version_key == main_version,
|
||||
'subtitleLanguage': language[version_key] if version_key in language else None,
|
||||
'version': version_key
|
||||
})
|
||||
return data
|
||||
|
||||
def parse_path(path, directory_key='director'):
|
||||
'''
|
||||
# all keys
|
||||
>>> parse_path('Frost, Mark; Lynch, David/Twin Peaks (1991)/Twin Peaks (S01E01) Pilot.European Version.Part 1.Welcome to Twin Peaks.en.fr.MPEG')['normalizedPath']
|
||||
'Frost, Mark; Lynch, David/Twin Peaks (1991)/Twin Peaks (S01E00) Pilot.European Version.Part 1.Welcome to Twin Peaks.en.fr.mpg'
|
||||
|
||||
# pop directory title off file name
|
||||
>>> parse_path("Unknown Director/www.xxx.com.._/www.xxx.com....Director's Cut.avi")['version']
|
||||
"Director's Cut"
|
||||
|
||||
# handle dots
|
||||
>>> parse_path("Unknown Director/Unknown Title (2000)/... Mr. .com....Director's Cut.srt")['version']
|
||||
"Director's Cut"
|
||||
|
||||
# multiple years, season zero, multiple episodes, dots in episode title and part title
|
||||
>>> parse_path('Groening, Matt/The Simpsons (1989-2012)/The Simpsons (S00E01-02) D.I.Y..Uncensored Version.Part 1.D.I.Y..de.avi')['normalizedPath']
|
||||
'Groening, Matt/The Simpsons (1989-2012)/The Simpsons (S01E01+02) D.I.Y..Uncensored Version.Part 1.D.I.Y..de.avi'
|
||||
|
||||
# handle underscores
|
||||
>>> parse_path('Unknown Director/_com_ 1_0 _ NaN.._/_com_ 1_0 _ NaN....avi')['title']
|
||||
'.com: 1/0 / NaN...'
|
||||
|
||||
# TODO: '.com.avi'
|
||||
'''
|
||||
def parse_title(string):
|
||||
return title, year
|
||||
def parse_type(string):
|
||||
for type in EXTENSIONS:
|
||||
if string in EXTENSIONS[type]:
|
||||
return type
|
||||
return None
|
||||
def parse_underscores(string):
|
||||
# '^_' or '_$' is '.'
|
||||
string = re.sub('^_', '.', string)
|
||||
string = re.sub('_$', '.', string)
|
||||
# '_.foo$' or '_ (' is '?'
|
||||
string = re.sub('_(?=(\.\w+$| \())', '?', string)
|
||||
# ' _..._ ' is '<...>'
|
||||
string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string)
|
||||
# 'foo_bar' or 'foo _ bar' is '/'
|
||||
string = re.sub('(?<=\w)_(?=\w)', '/', string)
|
||||
string = re.sub(' _ ', ' / ', string)
|
||||
# 'foo_ ' is ':'
|
||||
string = re.sub('(?<=\w)_ ', ': ', string)
|
||||
return string
|
||||
data = {}
|
||||
parts = [parse_underscores(x.strip()) for x in path.split('/')]
|
||||
# subdirectory
|
||||
if len(parts) > 4:
|
||||
data['subdirectory'] = '/'.join(parts[3:-1])
|
||||
parts = parts[:3] + parts[-1:]
|
||||
else:
|
||||
data['subdirectory'] = None
|
||||
length = len(parts)
|
||||
director, title, file = [
|
||||
parts[-3] if length > 2 else None,
|
||||
parts[-2] if length > 1 else None,
|
||||
parts[-1]
|
||||
]
|
||||
# directorSort, director
|
||||
data['directorSort'] = data['director'] = []
|
||||
if director:
|
||||
data['directorSort'] = [x for x in director.split('; ') if x != 'Unknown Director']
|
||||
data['director'] = [' '.join(reversed(x.split(', '))) for x in data['directorSort']]
|
||||
# title, year
|
||||
data['title'] = data['year'] = None
|
||||
if title:
|
||||
match = re.search(' \(\d{4}(-(\d{4})?)?\)$', title)
|
||||
data['title'] = title[:-len(match.group(0))] if match else title
|
||||
data['year'] = match.group(0)[2:-1] if match else None
|
||||
file_title = re.sub('[/:]', '_', data['title'])
|
||||
# (remove title from beginning of filename if the rest contains a dot)
|
||||
file = re.sub('^' + re.escape(file_title) + '(?=.*\.)', '', file)
|
||||
# (split by nospace+dot+word, but remove spaces preceding extension)
|
||||
parts = re.split('(?<!\s)\.(?=\w)', re.sub('\s+(?=.\w+$)', '', file))
|
||||
title, parts, extension = [
|
||||
parts[0],
|
||||
parts[1:-1],
|
||||
parts[-1] if len(parts) > 1 else None
|
||||
]
|
||||
if not data['title'] and title:
|
||||
data['title'] = title
|
||||
# season, episode, episodes, episodeTitle
|
||||
data['season'] = data['episode'] = data['episodeTitle'] = None
|
||||
data['episodes'] = []
|
||||
match = re.search(' \((S\d{2})?(E\d{2}([+-]\d{2})?)?\)(.+)?', title)
|
||||
if match:
|
||||
if match.group(1):
|
||||
data['season'] = int(match.group(1)[1:])
|
||||
if match.group(2):
|
||||
if len(match.group(2)) == 3:
|
||||
data['episode'] = int(match.group(2)[1:])
|
||||
else:
|
||||
data['episodes'] = list(range(int(match.group(2)[1:3]), int(match.group(2)[-2:]) + 1))
|
||||
if match.group(4):
|
||||
data['episodeTitle'] = match.group(4)[1:]
|
||||
while data['episodeTitle'] and len(parts) and re.search('^\w+\.*$', parts[0]) and not re.search('^[a-z]{2}$', parts[0]):
|
||||
data['episodeTitle'] += '.%s' % parts.pop(0)
|
||||
# isEpisode, seriesTitle, seriesYear
|
||||
data['isEpisode'] = False
|
||||
data['seriesTitle'] = data['seriesYear'] = None
|
||||
if data['season'] != None or data['episode'] != None or data['episodes']:
|
||||
data['isEpisode'] = True
|
||||
data['seriesTitle'] = data['title']
|
||||
season = 'S%02d' % data['season'] if data['season'] != None else ''
|
||||
episode = ''
|
||||
if data['episode'] != None:
|
||||
episode = 'E%02d' % data['episode']
|
||||
elif data['episodes']:
|
||||
episode = 'E%02d%s%02d' % (
|
||||
data['episodes'][0], '+' if len(data['episodes']) == 2 else '-', data['episodes'][-1]
|
||||
)
|
||||
episodeTitle = ' %s' % data['episodeTitle'] if data['episodeTitle'] else ''
|
||||
data['title'] += ' (%s%s)%s' % (season, episode, episodeTitle)
|
||||
data['seriesYear'] = data['year']
|
||||
data['year'] = None
|
||||
# version
|
||||
data['version'] = parts.pop(0) if len(parts) and re.search('^[A-Z0-9]', parts[0]) and not re.search('^Part .', parts[0]) else None
|
||||
# part
|
||||
data['part'] = parts.pop(0)[5:] if len(parts) and re.search('^Part .', parts[0]) else None
|
||||
# partTitle
|
||||
data['partTitle'] = parts.pop(0) if len(parts) and re.search('^[A-Z0-9]', parts[0]) and data['part'] else None
|
||||
while data['partTitle'] and len(parts) and not re.search('^[a-z]{2}$', parts[0]):
|
||||
data['partTitle'] += '.%s' % parts.pop(0)
|
||||
# language
|
||||
data['language'] = parts.pop(0) if len(parts) and re.search('^[a-z]{2}$', parts[0]) else None
|
||||
# extension
|
||||
data['extension'] = re.sub('^mpeg$', 'mpg', extension.lower()) if extension else None
|
||||
# type
|
||||
data['type'] = parse_type(data['extension'])
|
||||
# normalizedPath
|
||||
data['normalizedPath'] = format_path(data)
|
||||
return data
|
||||
|
||||
|
||||
def parse_movie_path(path):
|
||||
"""
|
||||
"A/Abrams, J.J.; Lieber, Jeffrey; Lindelof, Damon/Lost (2004)/Lost.Season 3.Episode 21.Greatest Hits.avi"
|
||||
"B/Balada, Ivan/Metrum (1967)/Metrum.Part 1.en.srt"
|
||||
"N/Nakata, Hideo/L - Change the World (2008)/L - Change the World.Part 2.srt"
|
||||
"R/Reitz, Edgar/Heimat (1984-2006)/Heimat.Season 2.Episode 8.The Wedding.Part 2.avi"
|
||||
"F/Feuillade, Louis/Les vampires (1915)/Les vampires.Episode 10.Part 2.avi"
|
||||
title: 'Les vampires', year: '1915', episode: 10, part: 2
|
||||
|
||||
"G/Godard, Jean-Luc/Histoire(s) du cinema_ Toutes les histoires (1988)/Histoire(s) du cinema_ Toutes les histoires.avi"
|
||||
"G/Godard, Jean-Luc/Six fois deux (1976)/Six fois deux.Part 1A.Y a personne.avi"
|
||||
"G/Godard, Jean-Luc; Miéville, Anne-Marie/France_tour_detour_deux_enfants (1977)/France_tour_detour_deux_enfants.Part 5.Impression_Dictée.avi"
|
||||
|
||||
"L/Labarthe, André S_/Cinéastes de notre temps (1964-)/Cinéastes de notre temps.Episode.Jean Renoir le patron, première partie_ La Recherche du relatif.avi"
|
||||
"S/Scott, Ridley/Blade Runner (1982)/Blade Runner.Directors's Cut.avi"
|
||||
|
||||
or
|
||||
|
||||
T/Title (Year)/Title.avi
|
||||
"""
|
||||
episodeTitle = episodeYear = seriesTitle = None
|
||||
episodeDirector = []
|
||||
parts = path.split('/')
|
||||
|
||||
#title/year
|
||||
if len(parts) == 4:
|
||||
title = parts[2]
|
||||
elif len(parts) > 1:
|
||||
title = parts[1]
|
||||
else:
|
||||
title = parts[0]
|
||||
title = title.replace('_ ', ': ')
|
||||
if title.endswith('_'):
|
||||
title = title[:-1] + '.'
|
||||
if title.startswith('_'):
|
||||
title = '.' + title[1:]
|
||||
|
||||
year = find_re(title, '(\(\d{4}\))')
|
||||
if not year:
|
||||
year = find_re(title, '(\(\d{4}-\d*\))')
|
||||
if year and title.endswith(year):
|
||||
title = title[:-len(year)].strip()
|
||||
year = year[1:-1]
|
||||
if '-' in year:
|
||||
year = find_re(year, '\d{4}')
|
||||
|
||||
#director
|
||||
if len(parts) == 4:
|
||||
director = parts[1]
|
||||
if director.endswith('_'):
|
||||
director = "%s." % director[:-1]
|
||||
director = director.split('; ')
|
||||
director = [normalize_name(d).strip() for d in director]
|
||||
director = [d for d in director if d not in ('Unknown Director', 'Various Directors')]
|
||||
else:
|
||||
director = []
|
||||
|
||||
#extension/language
|
||||
fileparts = [x.replace('||', '. ') for x in parts[-1].replace('. ', '||').split('.')]
|
||||
extension = len(fileparts) > 1 and fileparts[-1] or ''
|
||||
|
||||
if len(fileparts) > 1 and len(fileparts[-2]) == 2:
|
||||
language = fileparts[-2]
|
||||
else:
|
||||
language = ''
|
||||
|
||||
#season/episode/episodeTitle
|
||||
match = re.compile('(.+?) \((S(\d+))?(E(\d+))?\)( (.+?))?\.').match(parts[-1])
|
||||
if match:
|
||||
seriesTitle = match.group(1)
|
||||
season = match.group(3)
|
||||
episode = match.group(5)
|
||||
episodeTitle = (match.group(6) or '').strip()
|
||||
if episode != None:
|
||||
episode = int(episode)
|
||||
if season != None:
|
||||
season = int(season)
|
||||
if episode and not season:
|
||||
season = 1
|
||||
else:
|
||||
season = find_re(parts[-1], '\.Season (\d+)\.')
|
||||
if season:
|
||||
season = int(season)
|
||||
else:
|
||||
season = None
|
||||
|
||||
episode = find_re(parts[-1], '\.Episode[s]* ([\d+]+)\.')
|
||||
if episode:
|
||||
episode = episode.split('+')[0]
|
||||
episode = int(episode)
|
||||
else:
|
||||
episode = None
|
||||
|
||||
if episode and 'Episode %d'%episode in fileparts:
|
||||
episodeTitle = fileparts.index('Episode %d' % episode) + 1
|
||||
episodeTitle = fileparts[episodeTitle]
|
||||
if episodeTitle == extension or episodeTitle.startswith('Part'):
|
||||
episodeTitle = None
|
||||
|
||||
if not season and 'Episode' in fileparts:
|
||||
episodeTitle = fileparts.index('Episode') + 1
|
||||
episodeTitle = fileparts[episodeTitle]
|
||||
if episodeTitle == extension or episodeTitle.startswith('Part'):
|
||||
episodeTitle = None
|
||||
else:
|
||||
season = 1
|
||||
|
||||
if season:
|
||||
seriesTitle = title
|
||||
title = u'%s (S%02d)' % (seriesTitle, season)
|
||||
if isinstance(episode, int):
|
||||
title = u'%s (S%02dE%02d)' % (seriesTitle, season, episode)
|
||||
if episodeTitle:
|
||||
title = u'%s %s' % (title, episodeTitle)
|
||||
|
||||
#part
|
||||
part = find_re(parts[-1], '\.Part (\d+)\.')
|
||||
if part:
|
||||
part = int(part)
|
||||
else:
|
||||
part = 0
|
||||
|
||||
return {
|
||||
'director': director,
|
||||
'episodeDirector': episodeDirector,
|
||||
'episode': episode,
|
||||
'episodeTitle': episodeTitle,
|
||||
'episodeYear': episodeYear,
|
||||
'extension': extension,
|
||||
'language': language,
|
||||
'part': part,
|
||||
'season': season,
|
||||
'seriesTitle': seriesTitle,
|
||||
'title': title,
|
||||
'year': year,
|
||||
}
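# Usage sketch (illustrative addition, not part of the original module),
# using one of the example paths from the docstring above.
def _parse_movie_path_example():
    info = parse_movie_path("B/Balada, Ivan/Metrum (1967)/Metrum.Part 1.en.srt")
    assert info['title'] == 'Metrum'
    assert info['year'] == '1967'
    assert info['part'] == 1
    assert info['language'] == 'en'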
|
||||
|
||||
def create_movie_path(title, director, year,
|
||||
season, episode, episodeTitle, episodeDirector, episodeYear,
|
||||
part, language, extension):
|
||||
'''
|
||||
{
|
||||
title: '', director: [''], year: '',
|
||||
season: int, episode: int, episodeTitle: '', episodeDirector: [''], episodeYear: '',
|
||||
part: int, language: '', extension: '', extra: bool
|
||||
})
|
||||
'''
|
||||
partTitle = None
|
||||
director = '; '.join(map(get_sort_name, director))
|
||||
episodeDirector = '; '.join(map(get_sort_name, episodeDirector))
|
||||
filename = [title]
|
||||
if season:
|
||||
filename += ['Season %d' % season]
|
||||
if episode:
|
||||
filename += ['Episode %d' % episode]
|
||||
if episodeTitle:
|
||||
filename += [episodeTitle]
|
||||
if part:
|
||||
filename += ['Part %s' % part]
|
||||
if partTitle:
|
||||
filename += [partTitle]
|
||||
if extension:
|
||||
filename += [extension]
|
||||
filename = '.'.join(filename)
|
||||
path = os.path.join(director[0], director, '%s (%s)' % (title, year), filename)
|
||||
return path
|
||||
|
||||
def get_oxid(title, director=[], year='',
|
||||
season='', episode='', episode_title='', episode_director=[], episode_year=''):
|
||||
def get_hash(string):
|
||||
return hashlib.sha1(string.encode('utf-8')).hexdigest().upper()
|
||||
director = ', '.join(director)
|
||||
episode_director = ', '.join(episode_director)
|
||||
if not season and not episode and not episode_title:
|
||||
oxid = get_hash(director)[:8] + get_hash('\n'.join([title, str(year)]))[:8]
|
||||
else:
|
||||
oxid = get_hash('\n'.join([director, title, str(year), str(season)]))[:8] + \
|
||||
get_hash('\n'.join([str(episode), episode_director, episode_title, str(episode_year)]))[:8]
|
||||
return u'0x' + oxid
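# Usage sketch (illustrative addition, not part of the original module):
# oxids are deterministic, so the same title/director/year always yields
# the same id.
def _oxid_example():
    a = get_oxid('Psycho', ['Alfred Hitchcock'], 1960)
    b = get_oxid('Psycho', ['Alfred Hitchcock'], 1960)
    assert a == b and a.startswith('0x') and len(a) == 18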
|
||||
157
Shared/lib/python3.4/site-packages/ox/net.py
Normal file
@@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2008
|
||||
from __future__ import with_statement, print_function
|
||||
import os
|
||||
import gzip
|
||||
import re
|
||||
from six import BytesIO
|
||||
import struct
|
||||
from six.moves import urllib
|
||||
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
|
||||
|
||||
DEBUG = False
|
||||
# Default headers for HTTP requests.
|
||||
DEFAULT_HEADERS = {
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0',
|
||||
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-us,en;q=0.5',
|
||||
'Accept-Encoding': 'gzip'
|
||||
}
|
||||
|
||||
def status(url, data=None, headers=DEFAULT_HEADERS):
|
||||
try:
|
||||
f = open_url(url, data, headers)
|
||||
s = f.code
|
||||
except urllib.error.HTTPError as e:
|
||||
s = e.code
|
||||
return s
|
||||
|
||||
def exists(url, data=None, headers=DEFAULT_HEADERS):
|
||||
s = status(url, data, headers)
|
||||
if s >= 200 and s < 400:
|
||||
return True
|
||||
return False
|
||||
|
||||
def get_headers(url, data=None, headers=DEFAULT_HEADERS):
|
||||
try:
|
||||
f = open_url(url, data, headers)
|
||||
f.headers['Status'] = "%s" % f.code
|
||||
headers = f.headers
|
||||
f.close()
|
||||
except urllib.error.HTTPError as e:
|
||||
e.headers['Status'] = "%s" % e.code
|
||||
headers = e.headers
|
||||
return dict(headers)
|
||||
|
||||
def open_url(url, data=None, headers=DEFAULT_HEADERS):
|
||||
if isinstance(url, bytes):
|
||||
url = url.decode('utf-8')
|
||||
url = url.replace(' ', '%20')
|
||||
req = urllib.request.Request(url, data, headers)
|
||||
return urllib.request.urlopen(req)
|
||||
|
||||
def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
|
||||
if DEBUG:
|
||||
print('ox.net.read_url', url)
|
||||
f = open_url(url, data, headers)
|
||||
result = f.read()
|
||||
f.close()
|
||||
if f.headers.get('content-encoding', None) == 'gzip':
|
||||
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
|
||||
if unicode:
|
||||
ctype = f.headers.get('content-type', '').lower()
|
||||
if 'charset' in ctype:
|
||||
encoding = ctype.split('charset=')[-1]
|
||||
else:
|
||||
encoding = detect_encoding(result)
|
||||
if not encoding:
|
||||
encoding = 'latin-1'
|
||||
result = result.decode(encoding)
|
||||
if return_headers:
|
||||
f.headers['Status'] = "%s" % f.code
|
||||
headers = {}
|
||||
for key in f.headers:
|
||||
headers[key.lower()] = f.headers[key]
|
||||
return headers, result
|
||||
return result
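# Usage sketch (illustrative addition, not part of the original module):
# read_url() returns raw bytes by default and decoded text with unicode=True;
# example.com is just a placeholder URL.
def _read_url_example():
    return read_url('http://example.com/', unicode=True)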
|
||||
|
||||
def detect_encoding(data):
|
||||
data_lower = data.lower().decode('utf-8', 'ignore')
|
||||
charset = re.compile('content="text/html; charset=(.*?)"').findall(data_lower)
|
||||
if not charset:
|
||||
charset = re.compile('meta charset="(.*?)"').findall(data_lower)
|
||||
if charset:
|
||||
return charset[0].lower()
|
||||
detector = UniversalDetector()
|
||||
p = 0
|
||||
l = len(data)
|
||||
s = 1024
|
||||
while p < l:
|
||||
detector.feed(data[p:p+s])
|
||||
if detector.done:
|
||||
break
|
||||
p += s
|
||||
detector.close()
|
||||
return detector.result['encoding']
|
||||
|
||||
def save_url(url, filename, overwrite=False):
|
||||
if not os.path.exists(filename) or overwrite:
|
||||
dirname = os.path.dirname(filename)
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
data = read_url(url)
|
||||
f = open(filename, 'wb')
|
||||
f.write(data)
|
||||
f.close()
|
||||
|
||||
def oshash(url):
|
||||
def get_size(url):
|
||||
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
|
||||
req.get_method = lambda : 'HEAD'
|
||||
u = urllib.request.urlopen(req)
|
||||
if u.code != 200 or not 'Content-Length' in u.headers:
|
||||
raise IOError
|
||||
return int(u.headers['Content-Length'])
|
||||
|
||||
def get_range(url, start, end):
|
||||
headers = DEFAULT_HEADERS.copy()
|
||||
headers['Range'] = 'bytes=%s-%s' % (start, end)
|
||||
req = urllib.request.Request(url, headers=headers)
|
||||
u = urllib.request.urlopen(req)
|
||||
return u.read()
|
||||
|
||||
try:
|
||||
longlongformat = 'q' # long long
|
||||
bytesize = struct.calcsize(longlongformat)
|
||||
|
||||
filesize = get_size(url)
|
||||
hash = filesize
|
||||
head = get_range(url, 0, min(filesize, 65536))
|
||||
if filesize > 65536:
|
||||
tail = get_range(url, filesize-65536, filesize)
|
||||
if filesize < 65536:
|
||||
for offset in range(0, filesize, bytesize):
|
||||
buffer = head[offset:offset+bytesize]
|
||||
(l_value,)= struct.unpack(longlongformat, buffer)
|
||||
hash += l_value
|
||||
hash = hash & 0xFFFFFFFFFFFFFFFF #cut off 64bit overflow
|
||||
else:
|
||||
for offset in range(0, 65536, bytesize):
|
||||
buffer = head[offset:offset+bytesize]
|
||||
(l_value,)= struct.unpack(longlongformat, buffer)
|
||||
hash += l_value
|
||||
hash = hash & 0xFFFFFFFFFFFFFFFF #cut off 64bit overflow
|
||||
for offset in range(0, 65536, bytesize):
|
||||
buffer = tail[offset:offset+bytesize]
|
||||
(l_value,)= struct.unpack(longlongformat, buffer)
|
||||
hash += l_value
|
||||
hash = hash & 0xFFFFFFFFFFFFFFFF
|
||||
returnedhash = "%016x" % hash
|
||||
return returnedhash
|
||||
except(IOError):
|
||||
return "IOError"
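# Usage sketch (illustrative addition, not part of the original module):
# oshash() computes the OpenSubtitles-style 64-bit hash (file size plus the
# first and last 64k summed as little-endian uint64s) via HTTP range requests.
def _oshash_example():
    # placeholder URL; any server that answers HEAD and Range requests works
    return oshash('http://example.com/videos/movie.avi')  # 16-char hex string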
|
||||
|
||||
201
Shared/lib/python3.4/site-packages/ox/normalize.py
Normal file
@@ -0,0 +1,201 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2008
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
|
||||
"l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
|
||||
'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
|
||||
'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
|
||||
'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
|
||||
u'\xd4\xef', u'\xcf\xe9')
|
||||
|
||||
# Articles in a dictionary.
|
||||
_articlesDict = dict([(x, x) for x in _articles])
|
||||
_spArticles = []
|
||||
for article in _articles:
|
||||
if article[-1] not in ("'", '-'): article += ' '
|
||||
_spArticles.append(article)
|
||||
|
||||
_noarticles = (
|
||||
'los angeles',
|
||||
'i am ',
|
||||
'i be area',
|
||||
'i call ',
|
||||
'i come ',
|
||||
'i confess',
|
||||
'i hired ',
|
||||
'i killed ',
|
||||
'i know ',
|
||||
'i live ',
|
||||
'i love',
|
||||
'i married',
|
||||
'i never',
|
||||
'i shot',
|
||||
'i start',
|
||||
'i was',
|
||||
)
|
||||
|
||||
def canonical_title(title):
|
||||
"""Return the title in the canonic format 'Movie Title, The'.
|
||||
|
||||
>>> canonical_title('The Movie Title')
|
||||
'Movie Title, The'
|
||||
|
||||
>>> canonical_title('Los Angeles Plays Itself')
|
||||
'Los Angeles Plays Itself'
|
||||
"""
|
||||
try:
|
||||
if title.split(', ')[-1].lower() in _articlesDict: return title
|
||||
except IndexError: pass
|
||||
ltitle = title.lower()
|
||||
for start in _noarticles:
|
||||
if ltitle.startswith(start):
|
||||
return title
|
||||
for article in _spArticles:
|
||||
if ltitle.startswith(article):
|
||||
lart = len(article)
|
||||
title = '%s, %s' % (title[lart:], title[:lart])
|
||||
if article[-1] == ' ': title = title[:-1]
|
||||
break
|
||||
## XXX: an attempt using a dictionary lookup.
|
||||
##for artSeparator in (' ', "'", '-'):
|
||||
## article = _articlesDict.get(ltitle.split(artSeparator)[0])
|
||||
## if article is not None:
|
||||
## lart = len(article)
|
||||
## # check titles like "una", "I'm Mad" and "L'abbacchio".
|
||||
## if title[lart:] == '' or (artSeparator != ' ' and
|
||||
## title[lart:][1] != artSeparator): continue
|
||||
## title = '%s, %s' % (title[lart:], title[:lart])
|
||||
## if artSeparator == ' ': title = title[1:]
|
||||
## break
|
||||
return title
|
||||
|
||||
def normalize_title(title):
|
||||
"""Return the title in the normal "The Title" format.
|
||||
|
||||
>>> normalize_title('Movie Title, The')
|
||||
'The Movie Title'
|
||||
"""
|
||||
stitle = title.split(', ')
|
||||
if len(stitle) > 1 and stitle[-1].lower() in _articlesDict:
|
||||
sep = ' '
|
||||
if stitle[-1][-1] in ("'", '-'): sep = ''
|
||||
title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
|
||||
return title
|
||||
|
||||
def normalize_imdbid(imdbId):
|
||||
"""Return 7 digit imdbId.
|
||||
|
||||
>>> normalize_imdbid('http://www.imdb.com/title/tt0159206/')
|
||||
'0159206'
|
||||
>>> normalize_imdbid(159206)
|
||||
'0159206'
|
||||
>>> normalize_imdbid('tt0159206')
|
||||
'0159206'
|
||||
"""
|
||||
if isinstance(imdbId, str):
|
||||
imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
|
||||
elif isinstance(imdbId, int):
|
||||
imdbId = "%07d" % imdbId
|
||||
return imdbId
|
||||
|
||||
|
||||
# Common suffixes in surnames.
|
||||
_sname_suffixes = (
|
||||
'al', 'ben', 'da', 'de', 'del', 'den', 'der', 'des', 'di', 'dos', 'du',
|
||||
'e', 'el', 'la', 'le', 'the', 'vom', 'von', 'van', 'y'
|
||||
)
|
||||
|
||||
def canonical_name(name):
|
||||
"""Return the given name in canonical "Surname, Name" format.
|
||||
It assumes that name is in the 'Name Surname' format.
|
||||
|
||||
>>> canonical_name('Jean Luc Godard')
|
||||
'Godard, Jean Luc'
|
||||
|
||||
>>> canonical_name('Ivan Ivanov-Vano')
|
||||
'Ivanov-Vano, Ivan'
|
||||
|
||||
>>> canonical_name('Gus Van Sant')
|
||||
'Van Sant, Gus'
|
||||
|
||||
>>> canonical_name('Brian De Palma')
|
||||
'De Palma, Brian'
|
||||
"""
|
||||
|
||||
# XXX: some statistics (over 1852406 names):
|
||||
# - just a surname: 51921
|
||||
# - single surname, single name: 1792759
|
||||
# - composed surname, composed name: 7726
|
||||
# - composed surname, single name: 55623
|
||||
# (2: 49259, 3: 5502, 4: 551)
|
||||
# - single surname, composed name: 186604
|
||||
# (2: 178315, 3: 6573, 4: 1219, 5: 352)
|
||||
# Don't convert names already in the canonical format.
|
||||
if name in ('Unknown Director', ):
|
||||
return name
|
||||
if name.find(', ') != -1: return name
|
||||
sname = name.split(' ')
|
||||
snl = len(sname)
|
||||
if snl == 2:
|
||||
# Just a name and a surname: how boring...
|
||||
name = '%s, %s' % (sname[1], sname[0])
|
||||
elif snl > 2:
|
||||
lsname = [x.lower() for x in sname]
|
||||
if snl == 3: _indexes = (0, snl-2)
|
||||
else: _indexes = (0, snl-2, snl-3)
|
||||
# Check for common surname prefixes at the beginning and near the end.
|
||||
for index in _indexes:
|
||||
if lsname[index] not in _sname_suffixes: continue
|
||||
try:
|
||||
# Build the surname.
|
||||
surn = '%s %s' % (sname[index], sname[index+1])
|
||||
del sname[index]
|
||||
del sname[index]
|
||||
try:
|
||||
# Handle the "Jr." after the name.
|
||||
if lsname[index+2].startswith('jr'):
|
||||
surn += ' %s' % sname[index]
|
||||
del sname[index]
|
||||
except (IndexError, ValueError):
|
||||
pass
|
||||
name = '%s, %s' % (surn, ' '.join(sname))
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
else:
|
||||
name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
|
||||
return name
|
||||
|
||||
def normalize_name(name):
|
||||
"""Return a name in the normal "Name Surname" format.
|
||||
|
||||
>>> normalize_name('Godard, Jean Luc')
|
||||
'Jean Luc Godard'
|
||||
|
||||
>>> normalize_name('Ivanov-Vano, Ivan')
|
||||
'Ivan Ivanov-Vano'
|
||||
|
||||
>>> normalize_name('Van Sant, Gus')
|
||||
'Gus Van Sant'
|
||||
|
||||
>>> normalize_name('De Palma, Brian')
|
||||
'Brian De Palma'
|
||||
"""
|
||||
sname = name.split(', ')
|
||||
if len(sname) == 2:
|
||||
name = '%s %s' % (sname[1], sname[0])
|
||||
return name
|
||||
|
||||
def normalize_path(path):
|
||||
path = path.replace(':', '_').replace('/', '_')
|
||||
if path.endswith('.'): path = path[:-1] + '_'
|
||||
return path
|
||||
|
||||
def strip_accents(s):
|
||||
if isinstance(s, bytes):
s = s.decode('utf-8')
|
||||
return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
|
||||
|
||||
32
Shared/lib/python3.4/site-packages/ox/oembed.py
Normal file
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
|
||||
# ci:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from . import cache
|
||||
from .text import find_re
|
||||
from .utils import json, ET
|
||||
|
||||
def get_embed_code(url, maxwidth=None, maxheight=None):
|
||||
embed = {}
|
||||
header = cache.get_headers(url)
|
||||
if header.get('content-type', '').startswith('text/html'):
|
||||
html = cache.read_url(url)
|
||||
json_oembed = [l for l in re.compile('<link.*?>').findall(html) if 'json+oembed' in l]
|
||||
xml_oembed = [l for l in re.compile('<link.*?>').findall(html) if 'xml+oembed' in l]
|
||||
if json_oembed:
|
||||
oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
|
||||
if maxwidth:
|
||||
oembed_url += '&maxwidth=%d' % maxwidth
|
||||
if maxheight:
|
||||
oembed_url += '&maxheight=%d' % maxheight
|
||||
embed = json.loads(cache.read_url(oembed_url))
|
||||
elif xml_oembed:
|
||||
oembed_url = find_re(xml_oembed[0], 'href="(.*?)"')
|
||||
if maxwidth:
|
||||
oembed_url += '&maxwidth=%d' % maxwidth
|
||||
if maxheight:
|
||||
oembed_url += '&maxheight=%d' % maxheight
|
||||
data = cache.read_url(oembed_url)
|
||||
for e in ET.fromstring(data):
|
||||
embed[e.tag] = e.text
|
||||
return embed
|
||||
106
Shared/lib/python3.4/site-packages/ox/srt.py
Normal file
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import with_statement, division, print_function
|
||||
import chardet
|
||||
import re
|
||||
import codecs
|
||||
|
||||
import ox
|
||||
|
||||
|
||||
__all__ = []
|
||||
|
||||
|
||||
def _detect_encoding(fp):
|
||||
bomDict={ # bytepattern : name
|
||||
(0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
|
||||
(0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
|
||||
(0xFE, 0xFF, None, None): "utf_16_be",
|
||||
(0xFF, 0xFE, None, None): "utf_16_le",
|
||||
(0xEF, 0xBB, 0xBF, None): "utf_8",
|
||||
}
|
||||
|
||||
# go to beginning of file and get the first 4 bytes
|
||||
oldFP = fp.tell()
|
||||
fp.seek(0)
|
||||
(byte1, byte2, byte3, byte4) = tuple(bytearray(fp.read(4)))
|
||||
|
||||
# try bom detection using 4 bytes, 3 bytes, or 2 bytes
|
||||
bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
|
||||
if not bomDetection:
|
||||
bomDetection = bomDict.get((byte1, byte2, byte3, None))
|
||||
if not bomDetection:
|
||||
bomDetection = bomDict.get((byte1, byte2, None, None))
|
||||
## if BOM detected, we're done :-)
|
||||
fp.seek(oldFP)
|
||||
if bomDetection:
|
||||
return bomDetection
|
||||
encoding = 'latin-1'
|
||||
#more character detecting magick using http://chardet.feedparser.org/
|
||||
fp.seek(0)
|
||||
rawdata = fp.read()
|
||||
#if data can be decoded as utf-8 use that, try chardet otherwise
|
||||
#chardet detects utf-8 as ISO-8859-2 most of the time
|
||||
try:
|
||||
data = rawdata.decode('utf-8')
|
||||
encoding = 'utf-8'
|
||||
except:
|
||||
encoding = chardet.detect(rawdata)['encoding']
|
||||
fp.seek(oldFP)
|
||||
return encoding
|
||||
|
||||
|
||||
def load(filename, offset=0):
|
||||
'''
|
||||
filename path to an srt file
|
||||
offset in seconds shift all in/out points by offset
|
||||
|
||||
returns list with objects that have in,out,value and id
|
||||
'''
|
||||
srt = []
|
||||
|
||||
def parse_time(t):
|
||||
return offset + ox.time2ms(t.replace(',', '.')) / 1000
|
||||
|
||||
with open(filename, 'rb') as f:
|
||||
encoding = _detect_encoding(f)
|
||||
data = f.read()
|
||||
try:
|
||||
data = data.decode(encoding)
|
||||
except:
|
||||
try:
|
||||
data = data.decode('latin-1')
|
||||
except:
|
||||
print("failed to detect encoding, giving up")
|
||||
return srt
|
||||
|
||||
data = data.replace('\r\n', '\n')
|
||||
srts = re.compile('(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n', re.DOTALL)
|
||||
i = 0
|
||||
for s in srts.findall(data):
|
||||
_s = {'id': str(i),
|
||||
'in': parse_time(s[0]),
|
||||
'out': parse_time(s[1]),
|
||||
'value': s[2].strip()
|
||||
}
|
||||
srt.append(_s)
|
||||
i += 1
|
||||
return srt
|
||||
|
||||
def encode(data):
|
||||
'''
|
||||
encodes list of objects with in,out,value into srt
|
||||
result is utf-8 encoded bytestring
|
||||
'''
|
||||
srt = u''
|
||||
i = 1
|
||||
for s in data:
|
||||
srt += '%d\r\n%s --> %s\r\n%s\r\n\r\n' % (
|
||||
i,
|
||||
ox.format_duration(s['in']*1000, years=False).replace('.', ','),
|
||||
ox.format_duration(s['out']*1000, years=False).replace('.', ','),
|
||||
s['value'].replace('\n', '\r\n').strip()
|
||||
)
|
||||
i += 1
|
||||
return codecs.BOM_UTF8 + srt.encode('utf-8')
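# Usage sketch (illustrative addition, not part of the original module):
# in/out points are seconds, encode() returns a BOM-prefixed utf-8 bytestring,
# so the file has to be opened in binary mode.
def _srt_example():
    subtitles = [{'in': 0.0, 'out': 2.5, 'value': 'Hello'}]
    with open('/tmp/example.srt', 'wb') as f:  # placeholder output path
        f.write(encode(subtitles))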
|
||||
|
||||
593
Shared/lib/python3.4/site-packages/ox/text.py
Normal file
@@ -0,0 +1,593 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2008
|
||||
import math
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
ARTICLES = list(set([
|
||||
# def sg, def pl, indef sg, indef pl (each m/f/n)
|
||||
'der', 'die', 'das', 'ein', 'eine', # de
|
||||
'the', 'a', 'an', # en
|
||||
'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas', # es
|
||||
'le', "l'", 'la', 'les', 'un', 'une', 'des', # fr
|
||||
'il', 'lo', "l'", 'la', '_i', 'gli', 'le', # it
|
||||
'de', 'het', 'een', # nl
|
||||
'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas' # pt
|
||||
# some _disabled because of collisions
|
||||
]))
|
||||
# see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames
|
||||
# and http://en.wikipedia.org/wiki/List_of_Korean_family_names
|
||||
ASIAN_NAMES = [
|
||||
'chan', 'chang', 'chao',
|
||||
'chen', 'cheong', 'cheung',
|
||||
'chong', 'choo',
|
||||
'chu', 'chun',
|
||||
'hou', 'hsieh', 'hsu', 'hu', 'huang',
|
||||
'kuo',
|
||||
'li', 'liang', 'lin', 'liu',
|
||||
'_park',
|
||||
'sun', 'sung',
|
||||
'tsao',
|
||||
'wang', 'wong',
|
||||
'yang', 'yeong', 'yeung'
|
||||
]
|
||||
PREFIXES = [
|
||||
'al', 'bin', 'da', 'de', 'del', 'dem', 'den', 'der', 'di', 'dos', 'du',
|
||||
'e', 'el', 'la', 'san', 'the', 'van', 'vom', 'von', 'y', 'zu'
|
||||
]
|
||||
MIDFIXES = ['und']
|
||||
SUFFIXES = ['ii', 'iii', 'jr', 'jr.', 'ph.d.', 'phd', 'sr', 'sr.']
|
||||
|
||||
UA_ALIASES = {
|
||||
'browser': {
|
||||
'Chrome': '(CriOS|CrMo)',
|
||||
'Firefox': '(Fennec|Firebird|Iceweasel|Minefield|Namoroka|Phoenix|SeaMonkey|Shiretoko)',
|
||||
'Nokia Browser': '(OviBrowser)'
|
||||
},
|
||||
'robot': {},
|
||||
'system': {
|
||||
'BSD': '(FreeBSD|NetBSD|OpenBSD)',
|
||||
'Linux': '(CrOS|MeeGo|webOS)',
|
||||
'Unix': '(AIX|HP-UX|IRIX|SunOS)'
|
||||
}
|
||||
}
|
||||
UA_NAMES = {
|
||||
'browser': {
|
||||
'chromeframe': 'Chrome Frame',
|
||||
'FBForIPhone': 'WebKit',
|
||||
'Gecko': 'Mozilla',
|
||||
'IEMobile': 'Internet Explorer',
|
||||
'konqueror': 'Konqueror',
|
||||
'Mozilla': 'Netscape',
|
||||
'MSIE': 'Internet Explorer',
|
||||
'NokiaBrowser': 'Nokia Browser',
|
||||
'Trident': 'Internet Explorer'
|
||||
},
|
||||
'robot': {},
|
||||
'system': {
|
||||
'BB': 'BlackBerry',
|
||||
'CPU OS': 'iOS',
|
||||
'iPhone': 'iOS',
|
||||
'iPhone OS': 'iOS',
|
||||
'J2ME/MIDP': 'Java',
|
||||
'Mac_PowerPC': 'Mac OS',
|
||||
'Mac_PPC': 'Mac OS',
|
||||
'Macintosh': 'Mac OS',
|
||||
'PLAYSTATION': 'PlayStation',
|
||||
'S': 'Nokia',
|
||||
'Series': 'Nokia',
|
||||
'Win': 'Windows',
|
||||
'Windows Phone OS': 'Windows Phone',
|
||||
'X11': 'Linux'
|
||||
}
|
||||
}
|
||||
UA_REGEXPS = {
|
||||
'browser': [
|
||||
'(Camino)\/(\d+)',
|
||||
'(Chimera)\/(\d+)',
|
||||
'(chromeframe)\/(\d+)',
|
||||
'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
|
||||
'(Chromium)\/(\d+)', # before Chrome
|
||||
'(Chrome)\/(\d+)',
|
||||
'(FBForIPhone)',
|
||||
'(Firefox)\/(\d+)',
|
||||
'(Galeon)\/(\d+)',
|
||||
'(IEMobile)\/(\d+)',
|
||||
'(iCab) (\d+)',
|
||||
'(iCab)\/(\d+)',
|
||||
'(konqueror)\/(\d+)',
|
||||
'(Konqueror)\/(\d+)',
|
||||
'(Lynx)\/(\d+)',
|
||||
'(Netscape)\d?\/(\d+)',
|
||||
'(NokiaBrowser)\/(\d+)',
|
||||
'(OmniWeb)\/(\d+)',
|
||||
'(Opera)\/.+Version\/(\d+)',
|
||||
'(OviBrowser)\/(\d+)',
|
||||
'Version\/(\d+).+(Safari)',
|
||||
'(WebKit)\/(\d+)',
|
||||
'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
|
||||
'(Trident)\/.*?rv:(\d+)',
|
||||
'(Gecko)',
|
||||
'(Mozilla)\/(3|4)'
|
||||
],
|
||||
'robot': [
|
||||
'(BingPreview)\/(\d+)',
|
||||
'(Google Web Preview).+Chrome\/(\d+)',
|
||||
'(Googlebot)\/(\d+)',
|
||||
'(WebCrawler)\/(\d+)',
|
||||
'(Yahoo! Slurp)\/(\d+)'
|
||||
],
|
||||
'system': [
|
||||
'(Android) (\d+)',
|
||||
'(Android)',
|
||||
'(BB)(\d+)',
|
||||
'(BeOS)',
|
||||
'(BlackBerry) (\d+)',
|
||||
'(BlackBerry)',
|
||||
'(Darwin)',
|
||||
'(BSD) (FreeBSD|NetBSD|OpenBSD)',
|
||||
'(CPU OS) (\d+)',
|
||||
'(iPhone OS) (\d+)',
|
||||
'(iPhone)', # Opera
|
||||
'(J2ME\/MIDP)',
|
||||
'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
|
||||
'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
|
||||
'(Linux)',
|
||||
'(Mac OS X) (10.\d+)',
|
||||
'(Mac OS X)',
|
||||
'(Mac_PowerPC)',
|
||||
'(Mac_PPC)',
|
||||
'(Macintosh)',
|
||||
'Nintendo (Wii).+NX\/(\d+)',
|
||||
'(PLAYSTATION) (\d+)',
|
||||
'(PlayStation) Vita (\d+)',
|
||||
'(RIM Tablet OS) (\d+)',
|
||||
'(S)(60);',
|
||||
'(Series) ?(40|60)',
|
||||
'(Symbian OS)',
|
||||
'(SymbianOS)\/(\d+)',
|
||||
'(SymbOS)',
|
||||
'(OS\/2)',
|
||||
'(Unix) (AIX|HP-UX|IRIX|SunOS)',
|
||||
'(Unix)',
|
||||
'(Windows) (NT \d\.\d)',
|
||||
'(Windows Phone) (\d+)',
|
||||
'(Windows Phone OS) (\d+)',
|
||||
'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
|
||||
'(Win) (9x 4\.90)', # Firefox
|
||||
'(Win)(16)', # Firefox
|
||||
'(Win)(9\d)', # Firefox
|
||||
'(Win)(NT)', # Firefox
|
||||
'(Win)(NT4\.0)', # Firefox
|
||||
'(X11)'
|
||||
]
|
||||
}
|
||||
UA_VERSIONS = {
|
||||
'browser': {},
|
||||
'robot': {},
|
||||
'system': {
|
||||
'10.0': '10.0 (Cheetah)',
|
||||
'10.1': '10.1 (Puma)',
|
||||
'10.2': '10.2 (Jaguar)',
|
||||
'10.3': '10.3 (Panther)',
|
||||
'10.4': '10.4 (Tiger)',
|
||||
'10.5': '10.5 (Leopard)',
|
||||
'10.6': '10.6 (Snow Leopard)',
|
||||
'10.7': '10.7 (Lion)',
|
||||
'10.8': '10.8 (Mountain Lion)',
|
||||
'10.9': '10.9 (Mavericks)',
|
||||
'10.10': '10.10 (Yosemite)',
|
||||
'40': 'Series 40',
|
||||
'60': 'Series 60',
|
||||
'NT 3.1': 'NT 3.1 (3.1)',
|
||||
'NT 3.5': 'NT 3.5 (NT)',
|
||||
'NT 4.0': 'NT 4.0 (NT)',
|
||||
'NT 4.1': 'NT 4.1 (98)',
|
||||
'9x 4.90': 'NT 4.9 (ME)',
|
||||
'NT 5.0': 'NT 5.0 (2000)',
|
||||
'NT 5.1': 'NT 5.1 (XP)',
|
||||
'NT 5.2': 'NT 5.2 (2003)',
|
||||
'NT 6.0': 'NT 6.0 (Vista)',
|
||||
'NT 6.1': 'NT 6.1 (7)',
|
||||
'NT 6.2': 'NT 6.2 (8)',
|
||||
'NT 6.3': 'NT 6.3 (8.1)',
|
||||
'16': 'NT 3.1 (3.1)',
|
||||
'3.1': 'NT 3.1 (3.1)',
|
||||
'95': 'NT 4.0 (95)',
|
||||
'NT': 'NT 4.0 (NT)',
|
||||
'NT4.0': 'NT 4.0 (NT)',
|
||||
'98': 'NT 4.1 (98)',
|
||||
'ME': 'NT 4.9 (ME)',
|
||||
'2000': 'NT 5.0 (2000)',
|
||||
'XP': 'NT 5.1 (XP)',
|
||||
'2003': 'NT 5.2 (2003)'
|
||||
}
|
||||
}
|
||||
|
||||
def get_sort_name(name):
|
||||
"""
|
||||
|
||||
>>> get_sort_name('Alfred Hitchcock')
|
||||
'Hitchcock, Alfred'
|
||||
|
||||
>>> get_sort_name('Jean-Luc Godard')
|
||||
'Godard, Jean-Luc'
|
||||
|
||||
>>> get_sort_name('Rainer Werner Fassbinder')
|
||||
'Fassbinder, Rainer Werner'
|
||||
|
||||
>>> get_sort_name('Brian De Palma')
|
||||
'De Palma, Brian'
|
||||
|
||||
>>> get_sort_name('Johan van der Keuken')
|
||||
'van der Keuken, Johan'
|
||||
|
||||
>>> get_sort_name('Edward D. Wood Jr.')
|
||||
'Wood Jr., Edward D.'
|
||||
|
||||
>>> get_sort_name('Bing Wang')
|
||||
'Wang Bing'
|
||||
|
||||
>>> get_sort_name('Frank Capra III')
|
||||
'Capra III, Frank'
|
||||
|
||||
>>> get_sort_name('The Queen of England')
|
||||
'Queen of England, The'
|
||||
|
||||
>>> get_sort_name('Sham 69')
|
||||
'Sham 69'
|
||||
|
||||
>>> get_sort_name('Scorsese, Martin')
|
||||
'Scorsese, Martin'
|
||||
"""
|
||||
if not ' ' in name or ', ' in name:
|
||||
return name
|
||||
if name.lower().startswith('the '):
|
||||
return get_sort_title(name)
|
||||
def add_name():
|
||||
if len(first_names):
|
||||
last_names.insert(0, first_names.pop())
|
||||
def find_name(names):
|
||||
return len(first_names) and first_names[-1].lower() in names
|
||||
first_names = name.split(' ')
|
||||
last_names = []
|
||||
if re.search('^[0-9]+$', first_names[-1]):
|
||||
add_name()
|
||||
if find_name(SUFFIXES):
|
||||
add_name()
|
||||
add_name()
|
||||
if find_name(MIDFIXES):
|
||||
add_name()
|
||||
add_name()
|
||||
while find_name(PREFIXES):
|
||||
add_name()
|
||||
name = ' '.join(last_names)
|
||||
if len(first_names):
|
||||
separator = ' ' if last_names[0].lower() in ASIAN_NAMES else ', '
|
||||
name += separator + ' '.join(first_names)
|
||||
return name
|
||||
|
||||
def get_sort_title(title):
|
||||
"""
|
||||
|
||||
>>> get_sort_title('Themroc')
|
||||
'Themroc'
|
||||
|
||||
>>> get_sort_title('Die Hard')
|
||||
'Hard, Die'
|
||||
|
||||
>>> get_sort_title("L'atalante")
|
||||
"atalante, L'"
|
||||
|
||||
"""
|
||||
for article in ARTICLES:
|
||||
spaces = 0 if article.endswith("'") else 1
|
||||
if title.lower().startswith(article + ' ' * spaces):
|
||||
length = len(article)
|
||||
return title[length + spaces:] + ', ' + title[:length]
|
||||
return title
|
||||
|
||||
def find_re(string, regexp):
|
||||
result = re.compile(regexp, re.DOTALL).findall(string)
|
||||
if result:
|
||||
return result[0].strip()
|
||||
return ''
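# Example (illustrative, not from the original source): the first group of the
# first match is returned, or '' when nothing matches, e.g.
#   find_re('<b>42</b>', '<b>(.*?)</b>') == '42'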
|
||||
|
||||
def find_string(string, string0='', string1=''):
|
||||
"""Return the string between string0 and string1.
|
||||
|
||||
If string0 or string1 is left out, the beginning or end of the string is used.
|
||||
|
||||
>>> find_string('i am not there', string1=' not there')
|
||||
'i am'
|
||||
|
||||
>>> find_string('i am not there', 'i am ', ' there')
|
||||
'not'
|
||||
|
||||
>>> find_string('i am not there', 'i am not t')
|
||||
'here'
|
||||
|
||||
"""
|
||||
if string0:
|
||||
string0 = re.escape(string0)
|
||||
else:
|
||||
string0 = '^'
|
||||
if string1:
|
||||
string1 = re.escape(string1)
|
||||
else:
|
||||
string1 = '$'
|
||||
return find_re(string, string0 + '(.*?)' + string1)
|
||||
|
||||
def parse_useragent(useragent):
|
||||
data = {}
|
||||
for key in UA_REGEXPS:
|
||||
for alias, regexp in UA_ALIASES[key].items():
|
||||
alias = alias if key == 'browser' else alias + ' \\1'
|
||||
useragent = re.sub(regexp, alias, useragent)
|
||||
for regexp in UA_REGEXPS[key]:
|
||||
data[key] = {'name': '', 'version': '', 'string': ''}
|
||||
match = re.compile(regexp).search(useragent)
|
||||
if match:
|
||||
matches = list(match.groups())
|
||||
if len(matches) == 1:
|
||||
matches.append('')
|
||||
swap = re.match('^\d', matches[0]) or matches[1] == 'Linux'
|
||||
name = matches[1 if swap else 0]
|
||||
version = matches[0 if swap else 1].replace('_', '.')
|
||||
name = UA_NAMES[key][name] if name in UA_NAMES[key] else name
|
||||
version = UA_VERSIONS[key][version] if version in UA_VERSIONS[key] else version
|
||||
string = name
|
||||
if version:
|
||||
string = string + ' ' + (
|
||||
'(' + version + ')' if name in ['BSD', 'Linux', 'Unix'] else version
|
||||
)
|
||||
data[key] = {
|
||||
'name': name,
|
||||
'version': version,
|
||||
'string': string
|
||||
}
|
||||
break
|
||||
return data
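# Illustrative usage sketch (not part of the original module). The exact values
# depend on the UA_NAMES/UA_VERSIONS/UA_ALIASES tables above, so treat the
# results below as assumptions rather than guaranteed output:
#   info = parse_useragent('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 Chrome/41.0 Safari/537.36')
#   info['system']['string']   # something like 'Windows NT 6.1 (7)'
#   info['browser']['name']    # something like 'Chrome'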
|
||||
|
||||
def remove_special_characters(text):
|
||||
"""
|
||||
Removes special characters inserted by Word.
|
||||
"""
|
||||
text = text.replace(u'\u2013', '-')
|
||||
text = text.replace(u'\u2026O', "'")
|
||||
text = text.replace(u'\u2019', "'")
|
||||
text = text.replace(u'\x91', "'")  # assumed: cp1252 left single quote, character lost in display
|
||||
text = text.replace(u'\x92', "'")  # assumed: cp1252 right single quote, character lost in display
|
||||
text = text.replace(u'\x96', "-")  # assumed: cp1252 en dash, character lost in display
|
||||
return text
|
||||
|
||||
def wrap(text, width):
|
||||
"""
|
||||
A word-wrap function that preserves existing line breaks and most spaces in
|
||||
the text. Expects that existing line breaks are posix newlines (\n).
|
||||
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
|
||||
"""
|
||||
from functools import reduce  # reduce is no longer a builtin in Python 3
return reduce(lambda line, word, width=width: '%s%s%s' %
|
||||
(line,
|
||||
' \n'[(len(line[line.rfind('\n')+1:])
|
||||
+ len(word.split('\n',1)[0]
|
||||
) >= width)],
|
||||
word),
|
||||
text.split(' ')
|
||||
)
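# Example (illustrative): a space is replaced by a newline once a line would
# reach the given width, e.g. wrap('aaa bbb ccc', 7) == 'aaa bbb\nccc'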
|
||||
|
||||
def wrap_string(string, length=80, separator='\n', balance=False):
|
||||
'''
|
||||
>>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16)
|
||||
u"Anticonstitution\\nellement, Paris \\ns'eveille"
|
||||
>>> wrap_string(u'All you can eat', 12, '\\n', True)
|
||||
u'All you \\ncan eat'
|
||||
'''
|
||||
words = string.split(' ')
|
||||
if balance:
|
||||
# balance lines: test if same number of lines
|
||||
# can be achieved with a shorter line length
|
||||
lines = wrap_string(string, length, separator, False).split(separator)
|
||||
if len(lines) > 1:
|
||||
while length > max([len(x) for x in words]):
|
||||
length -= 1
|
||||
if len(wrap_string(string, length, separator, False).split(separator)) > len(lines):
|
||||
length += 1
|
||||
break
|
||||
lines = ['']
|
||||
for word in words:
|
||||
if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
|
||||
# word fits in current line
|
||||
lines[len(lines) - 1] += word + u' '
|
||||
else:
|
||||
if len(word) <= length:
|
||||
# word fits in next line
|
||||
lines.append(word + u' ')
|
||||
else:
|
||||
# word is longer than line
|
||||
position = length - len(lines[len(lines) - 1])
|
||||
lines[len(lines) - 1] += word[0:position]
|
||||
for i in range(position, len(word), length):
|
||||
lines.append(word[i:i+length])
|
||||
lines[len(lines) - 1] += u' '
|
||||
return separator.join(lines).strip()
|
||||
|
||||
def truncate_string(string, length, padding='...', position='right'):
|
||||
# >>> truncate_string('anticonstitutionellement', 16, '...', 'left')
|
||||
# '...utionellement'
|
||||
# >>> truncate_string('anticonstitutionellement', 16, '...', 'center')
|
||||
# 'anticon...lement'
|
||||
# >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
|
||||
# 'anticonstitut...'
|
||||
stringLength = len(string)
|
||||
paddingLength = len(padding)
|
||||
if stringLength > length:
|
||||
if position == 'left':
|
||||
string = '%s%s' % (padding, string[stringLength + paddingLength - length:])
|
||||
elif position == 'center':
|
||||
left = int(math.ceil(float(length - paddingLength) / 2))
|
||||
right = int(stringLength - math.floor(float(length - paddingLength) / 2))
|
||||
string = '%s%s%s' % (string[:left], padding, string[right:])
|
||||
elif position == 'right':
|
||||
string = '%s%s' % (string[:length - paddingLength], padding)
|
||||
return string
|
||||
|
||||
def truncate_words(s, num):
|
||||
"""Truncates a string after a certain number of chacters, but ends with a word
|
||||
|
||||
>>> truncate_words('Truncates a string after a certain number of chacters, but ends with a word', 23)
|
||||
'Truncates a string...'
|
||||
>>> truncate_words('Truncates a string', 23)
|
||||
'Truncates a string'
|
||||
|
||||
"""
|
||||
length = int(num)
|
||||
if len(s) <= length:
|
||||
return s
|
||||
words = s.split()
|
||||
ts = ""
|
||||
while words and len(ts) + len(words[0]) < length:
|
||||
ts += " " + words.pop(0)
|
||||
if words:
|
||||
ts += "..."
|
||||
return ts.strip()
|
||||
|
||||
def trim_string(string, num):
|
||||
"""Truncates a string after a certain number of chacters, adding ... at -10 characters
|
||||
|
||||
>>> trim_string('Truncates a string after a certain number of chacters', 23)
|
||||
'Truncates ...f chacters'
|
||||
>>> trim_string('Truncates a string', 23)
|
||||
'Truncates a string'
|
||||
"""
|
||||
if len(string) > num:
|
||||
string = string[:num - 13] + '...' + string[-10:]
|
||||
return string
|
||||
|
||||
def get_valid_filename(s):
|
||||
"""
|
||||
Returns the given string converted to a string that can be used for a clean
|
||||
filename. Specifically, leading and trailing spaces are removed;
|
||||
all non-filename-safe characters are replaced with underscores.
|
||||
|
||||
>>> get_valid_filename("john's portrait in 2004.jpg")
|
||||
'john_s_portrait_in_2004.jpg'
|
||||
"""
|
||||
s = s.strip()
|
||||
s = s.replace(' ', '_')
|
||||
s = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', s)
|
||||
s = s.replace('__', '_').replace('__', '_')
|
||||
return s
|
||||
|
||||
def get_text_list(list_, last_word='or'):
|
||||
"""
|
||||
>>> get_text_list([u'a', u'b', u'c', u'd'])
|
||||
u'a, b, c or d'
|
||||
>>> get_text_list([u'a', u'b', u'c'], 'and')
|
||||
u'a, b and c'
|
||||
>>> get_text_list([u'a', u'b'], 'and')
|
||||
u'a and b'
|
||||
>>> get_text_list([u'a'])
|
||||
u'a'
|
||||
>>> get_text_list([])
|
||||
''
|
||||
"""
|
||||
if len(list_) == 0: return ''
|
||||
if len(list_) == 1: return list_[0]
|
||||
return u'%s %s %s' % (u', '.join([str(i) for i in list_][:-1]), last_word, list_[-1])
|
||||
|
||||
def get_list_text(text, last_word='or'):
|
||||
"""
|
||||
>>> get_list_text(u'a, b, c or d')
|
||||
[u'a', u'b', u'c', u'd']
|
||||
>>> get_list_text(u'a, b and c', u'and')
|
||||
[u'a', u'b', u'c']
|
||||
>>> get_list_text(u'a and b', u'and')
|
||||
[u'a', u'b']
|
||||
>>> get_list_text(u'a')
|
||||
[u'a']
|
||||
>>> get_list_text(u'')
|
||||
[]
|
||||
"""
|
||||
list_ = []
|
||||
if text:
|
||||
list_ = text.split(u', ')
|
||||
if list_:
|
||||
i=len(list_)-1
|
||||
last = list_[i].split(last_word)
|
||||
if len(last) == 2:
|
||||
list_[i] = last[0].strip()
|
||||
list_.append(last[1].strip())
|
||||
return list_
|
||||
|
||||
def normalize_newlines(text):
|
||||
return re.sub(r'\r\n|\r|\n', '\n', text)
|
||||
|
||||
def recapitalize(text):
|
||||
"Recapitalizes text, placing caps after end-of-sentence punctuation."
|
||||
#capwords = ()
|
||||
text = text.lower()
|
||||
capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
|
||||
text = capsRE.sub(lambda x: x.group(1).upper(), text)
|
||||
#for capword in capwords:
|
||||
# capwordRE = re.compile(r'\b%s\b' % capword, re.I)
|
||||
# text = capwordRE.sub(capword, text)
|
||||
return text
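# Example (illustrative): recapitalize('this is one. this is two.') == 'This is one. This is two.'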
|
||||
|
||||
def phone2numeric(phone):
|
||||
"Converts a phone number with letters into its numeric equivalent."
|
||||
letters = re.compile(r'[A-PR-Y]', re.I)
|
||||
char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
|
||||
'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
|
||||
'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
|
||||
's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
|
||||
'y': '9', 'x': '9'}.get(m.group(0).lower())
|
||||
return letters.sub(char2number, phone)
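# Example (illustrative): vanity numbers map to their digit equivalents, e.g.
#   phone2numeric('1-800-FLOWERS') == '1-800-3569377'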
|
||||
|
||||
def compress_string(s):
|
||||
import gzip
from io import BytesIO  # cStringIO does not exist in Python 3
|
||||
zbuf = BytesIO()
|
||||
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
|
||||
zfile.write(s)
|
||||
zfile.close()
|
||||
return zbuf.getvalue()
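# Usage sketch (illustrative): the result round-trips with the standard library,
# e.g. gzip.decompress(compress_string(b'some data')) == b'some data'.
# In Python 3 the input has to be bytes, not str.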
|
||||
|
||||
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
|
||||
def smart_split(text):
|
||||
"""
|
||||
Generator that splits a string by spaces, leaving quoted phrases together.
|
||||
Supports both single and double quotes, and supports escaping quotes with
|
||||
backslashes. In the output, strings will keep their initial and trailing
|
||||
quote marks.
|
||||
>>> list(smart_split('This is "a person\\'s" test.'))
|
||||
['This', 'is', '"a person\\'s"', 'test.']
|
||||
"""
|
||||
for bit in smart_split_re.finditer(text):
|
||||
bit = bit.group(0)
|
||||
if bit[0] == '"':
|
||||
yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
|
||||
elif bit[0] == "'":
|
||||
yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
|
||||
else:
|
||||
yield bit
|
||||
|
||||
def words(text):
|
||||
"""
|
||||
returns words in text, removing punctuation
|
||||
"""
|
||||
text = text.split()
|
||||
# escape '-' so it is matched literally rather than forming a character range,
# and return a list (map() is lazy in Python 3)
return [re.sub(r"(([.!?:\-_]|'s)$)", '', x) for x in text]
|
||||
|
||||
def sort_string(string):
|
||||
string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th')
|
||||
|
||||
#pad numbered titles
|
||||
string = re.sub('(\d),(\d{3})', '\\1\\2', string)
|
||||
string = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), string)
|
||||
return unicodedata.normalize('NFKD', string)
|
||||
|
||||
def sorted_strings(strings, key=None):
|
||||
if not key:
|
||||
key = lambda k: sort_string(k)
|
||||
return sorted(strings, key=key)
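# Example (illustrative): because sort_string() zero-pads numbers, numbered
# titles sort naturally, e.g.
#   sorted_strings(['Part 10', 'Part 2', 'Part 1']) == ['Part 1', 'Part 2', 'Part 10']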
|
||||
74
Shared/lib/python3.4/site-packages/ox/torrent/__init__.py
Normal file
74
Shared/lib/python3.4/site-packages/ox/torrent/__init__.py
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2007-2012
|
||||
|
||||
from threading import Event
|
||||
from hashlib import sha1
|
||||
import os
|
||||
|
||||
from .bencode import bencode, bdecode
|
||||
|
||||
__all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size']
|
||||
|
||||
def create_torrent(file, url, params = {}, flag = Event(),
|
||||
progress = lambda x: None, progress_percent = 1):
|
||||
"Creates a torrent for a given file, using url as tracker url"
|
||||
from .makemetafile import make_meta_file  # implicit relative imports are invalid in Python 3
|
||||
return make_meta_file(file, url, params, flag, progress, progress_percent)
|
||||
|
||||
def get_info_hash(torrentFile):
|
||||
"Returns Torrent Info Hash from torrent file"
|
||||
metainfo_file = open(torrentFile, 'rb')
|
||||
metainfo = bdecode(metainfo_file.read())
|
||||
info = metainfo['info']
|
||||
return sha1(bencode(info)).hexdigest()
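# Usage sketch (illustrative, the path is hypothetical):
#   get_info_hash('/path/to/file.torrent') returns the 40-character hex sha1 of
#   the bencoded 'info' dictionary, i.e. the id used by trackers and magnet links.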
|
||||
|
||||
def get_torrent_info(data=None, file=None):
|
||||
if file:
|
||||
if not isinstance(file, bytes):
|
||||
file = file.encode('utf-8')
|
||||
with open(file, 'rb') as f:
|
||||
data = f.read()
|
||||
|
||||
"Returns Torrent Info from torrent file"
|
||||
tinfo = {}
|
||||
metainfo = bdecode(data)
|
||||
info = metainfo['info']
|
||||
piece_length = info['piece length']
|
||||
if 'length' in info:
|
||||
# let's assume we just have one file
|
||||
file_length = info['length']
|
||||
else:
|
||||
# let's assume we have a directory structure
|
||||
file_length = 0
|
||||
for f in info['files']:
|
||||
file_length += f['length']
|
||||
for key in info:
|
||||
if key != 'pieces':
|
||||
tinfo[key] = info[key]
|
||||
for key in metainfo:
|
||||
if key != 'info':
|
||||
tinfo[key] = metainfo[key]
|
||||
tinfo['size'] = file_length
|
||||
tinfo['hash'] = sha1(bencode(info)).hexdigest()
|
||||
tinfo['announce'] = metainfo['announce']
|
||||
if file:
|
||||
tinfo['timestamp'] = os.stat(file).st_ctime
|
||||
return tinfo
|
||||
|
||||
def get_files(data):
|
||||
files = []
|
||||
info = get_torrent_info(data=data)
|
||||
if 'files' in info:
|
||||
for f in info['files']:
|
||||
path = [info['name'], ]
|
||||
path.extend(f['path'])
|
||||
files.append(os.path.join(*path))
|
||||
else:
|
||||
files.append(info['name'])
|
||||
return files
|
||||
|
||||
def get_torrent_size(file):
|
||||
"Returns Size of files in torrent file in bytes"
|
||||
return get_torrent_info(file=file)['size']
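# Usage sketch (illustrative, the filename is hypothetical):
#   get_torrent_size('movie.torrent') sums the 'length' values of all files in
#   the torrent, so for a multi-file torrent it equals the payload size in bytes.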
|
||||
|
||||
320
Shared/lib/python3.4/site-packages/ox/torrent/bencode.py
Normal file
320
Shared/lib/python3.4/site-packages/ox/torrent/bencode.py
Normal file
|
|
@ -0,0 +1,320 @@
|
|||
# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman
|
||||
# see LICENSE.txt for license information
|
||||
|
||||
from types import IntType, LongType, StringType, ListType, TupleType, DictType
|
||||
try:
|
||||
from types import BooleanType
|
||||
except ImportError:
|
||||
BooleanType = None
|
||||
try:
|
||||
from types import UnicodeType
|
||||
except ImportError:
|
||||
UnicodeType = None
|
||||
from cStringIO import StringIO
|
||||
|
||||
def decode_int(x, f):
|
||||
f += 1
|
||||
newf = x.index('e', f)
|
||||
try:
|
||||
n = int(x[f:newf])
|
||||
except:
|
||||
n = long(x[f:newf])
|
||||
if x[f] == '-':
|
||||
if x[f + 1] == '0':
|
||||
raise ValueError
|
||||
elif x[f] == '0' and newf != f+1:
|
||||
raise ValueError
|
||||
return (n, newf+1)
|
||||
|
||||
def decode_string(x, f):
|
||||
colon = x.index(':', f)
|
||||
try:
|
||||
n = int(x[f:colon])
|
||||
except (OverflowError, ValueError):
|
||||
n = long(x[f:colon])
|
||||
if x[f] == '0' and colon != f+1:
|
||||
raise ValueError
|
||||
colon += 1
|
||||
return (x[colon:colon+n], colon+n)
|
||||
|
||||
def decode_unicode(x, f):
|
||||
s, f = decode_string(x, f+1)
|
||||
return (s.decode('UTF-8'),f)
|
||||
|
||||
def decode_list(x, f):
|
||||
r, f = [], f+1
|
||||
while x[f] != 'e':
|
||||
v, f = decode_func[x[f]](x, f)
|
||||
r.append(v)
|
||||
return (r, f + 1)
|
||||
|
||||
def decode_dict(x, f):
|
||||
r, f = {}, f+1
|
||||
lastkey = None
|
||||
while x[f] != 'e':
|
||||
k, f = decode_string(x, f)
|
||||
#why is this needed
|
||||
#if lastkey >= k:
|
||||
# raise ValueError
|
||||
lastkey = k
|
||||
r[k], f = decode_func[x[f]](x, f)
|
||||
return (r, f + 1)
|
||||
|
||||
decode_func = {}
|
||||
decode_func['l'] = decode_list
|
||||
decode_func['d'] = decode_dict
|
||||
decode_func['i'] = decode_int
|
||||
decode_func['0'] = decode_string
|
||||
decode_func['1'] = decode_string
|
||||
decode_func['2'] = decode_string
|
||||
decode_func['3'] = decode_string
|
||||
decode_func['4'] = decode_string
|
||||
decode_func['5'] = decode_string
|
||||
decode_func['6'] = decode_string
|
||||
decode_func['7'] = decode_string
|
||||
decode_func['8'] = decode_string
|
||||
decode_func['9'] = decode_string
|
||||
#decode_func['u'] = decode_unicode
|
||||
|
||||
def bdecode(x, sloppy = 1):
|
||||
try:
|
||||
r, l = decode_func[x[0]](x, 0)
|
||||
# except (IndexError, KeyError):
|
||||
except (IndexError, KeyError, ValueError):
|
||||
raise ValueError, "bad bencoded data"
|
||||
if not sloppy and l != len(x):
|
||||
raise ValueError, "bad bencoded data"
|
||||
return r
|
||||
|
||||
def test_bdecode():
|
||||
try:
|
||||
bdecode('0:0:')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('ie')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('i341foo382e')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
assert bdecode('i4e') == 4L
|
||||
assert bdecode('i0e') == 0L
|
||||
assert bdecode('i123456789e') == 123456789L
|
||||
assert bdecode('i-10e') == -10L
|
||||
try:
|
||||
bdecode('i-0e')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('i123')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('i6easd')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('35208734823ljdahflajhdf')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('2:abfdjslhfld')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
assert bdecode('0:') == ''
|
||||
assert bdecode('3:abc') == 'abc'
|
||||
assert bdecode('10:1234567890') == '1234567890'
|
||||
try:
|
||||
bdecode('02:xy')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('l')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
assert bdecode('le') == []
|
||||
try:
|
||||
bdecode('leanfdldjfh')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
assert bdecode('l0:0:0:e') == ['', '', '']
|
||||
try:
|
||||
bdecode('relwjhrlewjh')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
assert bdecode('li1ei2ei3ee') == [1, 2, 3]
|
||||
assert bdecode('l3:asd2:xye') == ['asd', 'xy']
|
||||
assert bdecode('ll5:Alice3:Bobeli2ei3eee') == [['Alice', 'Bob'], [2, 3]]
|
||||
try:
|
||||
bdecode('d')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('defoobar')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
assert bdecode('de') == {}
|
||||
assert bdecode('d3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': 'blue'}
|
||||
assert bdecode('d8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': 'Alice', 'length': 100000}}
|
||||
try:
|
||||
bdecode('d3:fooe')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('di1e0:e')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('d1:b0:1:a0:e')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('d1:a0:1:a0:e')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('i03e')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('l01:ae')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('9999:x')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('l0:')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('d0:0:')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
bdecode('d0:')
|
||||
assert 0
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
bencached_marker = []
|
||||
|
||||
class Bencached:
|
||||
def __init__(self, s):
|
||||
self.marker = bencached_marker
|
||||
self.bencoded = s
|
||||
|
||||
BencachedType = type(Bencached('')) # insufficient, but good as a filter
|
||||
|
||||
def encode_bencached(x,r):
|
||||
assert x.marker == bencached_marker
|
||||
r.append(x.bencoded)
|
||||
|
||||
def encode_int(x,r):
|
||||
r.extend(('i',str(x),'e'))
|
||||
|
||||
def encode_bool(x,r):
|
||||
encode_int(int(x),r)
|
||||
|
||||
def encode_string(x,r):
|
||||
r.extend((str(len(x)),':',x))
|
||||
|
||||
def encode_unicode(x,r):
|
||||
#r.append('u')
|
||||
encode_string(x.encode('UTF-8'),r)
|
||||
|
||||
def encode_list(x,r):
|
||||
r.append('l')
|
||||
for e in x:
|
||||
encode_func[type(e)](e, r)
|
||||
r.append('e')
|
||||
|
||||
def encode_dict(x,r):
|
||||
r.append('d')
|
||||
ilist = x.items()
|
||||
ilist.sort()
|
||||
for k,v in ilist:
|
||||
r.extend((str(len(k)),':',k))
|
||||
encode_func[type(v)](v, r)
|
||||
r.append('e')
|
||||
|
||||
encode_func = {}
|
||||
encode_func[BencachedType] = encode_bencached
|
||||
encode_func[IntType] = encode_int
|
||||
encode_func[LongType] = encode_int
|
||||
encode_func[StringType] = encode_string
|
||||
encode_func[ListType] = encode_list
|
||||
encode_func[TupleType] = encode_list
|
||||
encode_func[DictType] = encode_dict
|
||||
if BooleanType:
|
||||
encode_func[BooleanType] = encode_bool
|
||||
if UnicodeType:
|
||||
encode_func[UnicodeType] = encode_unicode
|
||||
|
||||
def bencode(x):
|
||||
r = []
|
||||
try:
|
||||
encode_func[type(x)](x, r)
|
||||
except:
|
||||
print "*** error *** could not encode type %s (value: %s)" % (type(x), x)
|
||||
assert 0
|
||||
return ''.join(r)
|
||||
|
||||
def test_bencode():
|
||||
assert bencode(4) == 'i4e'
|
||||
assert bencode(0) == 'i0e'
|
||||
assert bencode(-10) == 'i-10e'
|
||||
assert bencode(12345678901234567890L) == 'i12345678901234567890e'
|
||||
assert bencode('') == '0:'
|
||||
assert bencode('abc') == '3:abc'
|
||||
assert bencode('1234567890') == '10:1234567890'
|
||||
assert bencode([]) == 'le'
|
||||
assert bencode([1, 2, 3]) == 'li1ei2ei3ee'
|
||||
assert bencode([['Alice', 'Bob'], [2, 3]]) == 'll5:Alice3:Bobeli2ei3eee'
|
||||
assert bencode({}) == 'de'
|
||||
assert bencode({'age': 25, 'eyes': 'blue'}) == 'd3:agei25e4:eyes4:bluee'
|
||||
assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == 'd8:spam.mp3d6:author5:Alice6:lengthi100000eee'
|
||||
try:
|
||||
bencode({1: 'foo'})
|
||||
assert 0
|
||||
except AssertionError:
|
||||
pass
|
||||
|
||||
|
||||
try:
|
||||
import psyco
|
||||
psyco.bind(bdecode)
|
||||
psyco.bind(bencode)
|
||||
except ImportError:
|
||||
pass
|
||||
100
Shared/lib/python3.4/site-packages/ox/torrent/btformats.py
Normal file
100
Shared/lib/python3.4/site-packages/ox/torrent/btformats.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
# Written by Bram Cohen
|
||||
# see LICENSE.txt for license information
|
||||
|
||||
from types import StringType, LongType, IntType, ListType, DictType
|
||||
from re import compile
|
||||
|
||||
reg = compile(r'^[^/\\.~][^/\\]*$')
|
||||
|
||||
ints = (LongType, IntType)
|
||||
|
||||
def check_info(info):
|
||||
if type(info) != DictType:
|
||||
raise ValueError, 'bad metainfo - not a dictionary'
|
||||
pieces = info.get('pieces')
|
||||
if type(pieces) != StringType or len(pieces) % 20 != 0:
|
||||
raise ValueError, 'bad metainfo - bad pieces key'
|
||||
piecelength = info.get('piece length')
|
||||
if type(piecelength) not in ints or piecelength <= 0:
|
||||
raise ValueError, 'bad metainfo - illegal piece length'
|
||||
name = info.get('name')
|
||||
if type(name) != StringType:
|
||||
raise ValueError, 'bad metainfo - bad name'
|
||||
if not reg.match(name):
|
||||
raise ValueError, 'name %s disallowed for security reasons' % name
|
||||
if info.has_key('files') == info.has_key('length'):
|
||||
raise ValueError, 'single/multiple file mix'
|
||||
if info.has_key('length'):
|
||||
length = info.get('length')
|
||||
if type(length) not in ints or length < 0:
|
||||
raise ValueError, 'bad metainfo - bad length'
|
||||
else:
|
||||
files = info.get('files')
|
||||
if type(files) != ListType:
|
||||
raise ValueError
|
||||
for f in files:
|
||||
if type(f) != DictType:
|
||||
raise ValueError, 'bad metainfo - bad file value'
|
||||
length = f.get('length')
|
||||
if type(length) not in ints or length < 0:
|
||||
raise ValueError, 'bad metainfo - bad length'
|
||||
path = f.get('path')
|
||||
if type(path) != ListType or path == []:
|
||||
raise ValueError, 'bad metainfo - bad path'
|
||||
for p in path:
|
||||
if type(p) != StringType:
|
||||
raise ValueError, 'bad metainfo - bad path dir'
|
||||
if not reg.match(p):
|
||||
raise ValueError, 'path %s disallowed for security reasons' % p
|
||||
for i in xrange(len(files)):
|
||||
for j in xrange(i):
|
||||
if files[i]['path'] == files[j]['path']:
|
||||
raise ValueError, 'bad metainfo - duplicate path'
|
||||
|
||||
def check_message(message):
|
||||
if type(message) != DictType:
|
||||
raise ValueError
|
||||
check_info(message.get('info'))
|
||||
if type(message.get('announce')) != StringType:
|
||||
raise ValueError
|
||||
|
||||
def check_peers(message):
|
||||
if type(message) != DictType:
|
||||
raise ValueError
|
||||
if message.has_key('failure reason'):
|
||||
if type(message['failure reason']) != StringType:
|
||||
raise ValueError
|
||||
return
|
||||
peers = message.get('peers')
|
||||
if type(peers) == ListType:
|
||||
for p in peers:
|
||||
if type(p) != DictType:
|
||||
raise ValueError
|
||||
if type(p.get('ip')) != StringType:
|
||||
raise ValueError
|
||||
port = p.get('port')
|
||||
if type(port) not in ints or p <= 0:
|
||||
raise ValueError
|
||||
if p.has_key('peer id'):
|
||||
id = p['peer id']
|
||||
if type(id) != StringType or len(id) != 20:
|
||||
raise ValueError
|
||||
elif type(peers) != StringType or len(peers) % 6 != 0:
|
||||
raise ValueError
|
||||
interval = message.get('interval', 1)
|
||||
if type(interval) not in ints or interval <= 0:
|
||||
raise ValueError
|
||||
minint = message.get('min interval', 1)
|
||||
if type(minint) not in ints or minint <= 0:
|
||||
raise ValueError
|
||||
if type(message.get('tracker id', '')) != StringType:
|
||||
raise ValueError
|
||||
npeers = message.get('num peers', 0)
|
||||
if type(npeers) not in ints or npeers < 0:
|
||||
raise ValueError
|
||||
dpeers = message.get('done peers', 0)
|
||||
if type(dpeers) not in ints or dpeers < 0:
|
||||
raise ValueError
|
||||
last = message.get('last', 0)
|
||||
if type(last) not in ints or last < 0:
|
||||
raise ValueError
|
||||
270
Shared/lib/python3.4/site-packages/ox/torrent/makemetafile.py
Normal file
270
Shared/lib/python3.4/site-packages/ox/torrent/makemetafile.py
Normal file
|
|
@ -0,0 +1,270 @@
|
|||
# Written by Bram Cohen
|
||||
# multitracker extensions by John Hoffman
|
||||
# see LICENSE.txt for license information
|
||||
|
||||
from os.path import getsize, split, join, abspath, isdir
|
||||
from os import listdir
|
||||
from hashlib import sha1 as sha
|
||||
from copy import copy
|
||||
from string import strip
|
||||
from bencode import bencode
|
||||
from btformats import check_info
|
||||
from threading import Event
|
||||
from time import time
|
||||
from traceback import print_exc
|
||||
try:
|
||||
from sys import getfilesystemencoding
|
||||
ENCODING = getfilesystemencoding()
|
||||
except:
|
||||
from sys import getdefaultencoding
|
||||
ENCODING = getdefaultencoding()
|
||||
|
||||
defaults = [
|
||||
('announce_list', '',
|
||||
'a list of announce URLs - explained below'),
|
||||
('httpseeds', '',
|
||||
'a list of http seed URLs - explained below'),
|
||||
('piece_size_pow2', 0,
|
||||
"which power of 2 to set the piece size to (0 = automatic)"),
|
||||
('comment', '',
|
||||
"optional human-readable comment to put in .torrent"),
|
||||
('filesystem_encoding', '',
|
||||
"optional specification for filesystem encoding " +
|
||||
"(set automatically in recent Python versions)"),
|
||||
('target', '',
|
||||
"optional target file for the torrent")
|
||||
]
|
||||
|
||||
default_piece_len_exp = 18
|
||||
|
||||
ignore = ['core', 'CVS']
|
||||
|
||||
def print_announcelist_details():
|
||||
print (' announce_list = optional list of redundant/backup tracker URLs, in the format:')
|
||||
print (' url[,url...][|url[,url...]...]')
|
||||
print (' where URLs separated by commas are all tried first')
|
||||
print (' before the next group of URLs separated by the pipe is checked.')
|
||||
print (" If none is given, it is assumed you don't want one in the metafile.")
|
||||
print (' If announce_list is given, clients which support it')
|
||||
print (' will ignore the <announce> value.')
|
||||
print (' Examples:')
|
||||
print (' http://tracker1.com|http://tracker2.com|http://tracker3.com')
|
||||
print (' (tries trackers 1-3 in order)')
|
||||
print (' http://tracker1.com,http://tracker2.com,http://tracker3.com')
|
||||
print (' (tries trackers 1-3 in a randomly selected order)')
|
||||
print (' http://tracker1.com|http://backup1.com,http://backup2.com')
|
||||
print (' (tries tracker 1 first, then tries between the 2 backups randomly)')
|
||||
print ('')
|
||||
print (' httpseeds = optional list of http-seed URLs, in the format:')
|
||||
print (' url[|url...]')
|
||||
|
||||
def make_meta_file(file, url, params = {}, flag = Event(),
|
||||
progress = lambda x: None, progress_percent = 1):
|
||||
if params.has_key('piece_size_pow2'):
|
||||
piece_len_exp = params['piece_size_pow2']
|
||||
else:
|
||||
piece_len_exp = default_piece_len_exp
|
||||
if params.has_key('target') and params['target'] != '':
|
||||
f = params['target']
|
||||
else:
|
||||
a, b = split(file)
|
||||
if b == '':
|
||||
f = a + '.torrent'
|
||||
else:
|
||||
f = join(a, b + '.torrent')
|
||||
|
||||
if piece_len_exp == 0: # automatic
|
||||
size = calcsize(file)
|
||||
if size > 8L*1024*1024*1024: # > 8 gig =
|
||||
piece_len_exp = 21 # 2 meg pieces
|
||||
elif size > 2*1024*1024*1024: # > 2 gig =
|
||||
piece_len_exp = 20 # 1 meg pieces
|
||||
elif size > 512*1024*1024: # > 512M =
|
||||
piece_len_exp = 19 # 512K pieces
|
||||
elif size > 64*1024*1024: # > 64M =
|
||||
piece_len_exp = 18 # 256K pieces
|
||||
elif size > 16*1024*1024: # > 16M =
|
||||
piece_len_exp = 17 # 128K pieces
|
||||
elif size > 4*1024*1024: # > 4M =
|
||||
piece_len_exp = 16 # 64K pieces
|
||||
else: # < 4M =
|
||||
piece_len_exp = 15 # 32K pieces
|
||||
piece_length = 2 ** piece_len_exp
|
||||
|
||||
encoding = None
|
||||
if params.has_key('filesystem_encoding'):
|
||||
encoding = params['filesystem_encoding']
|
||||
if not encoding:
|
||||
encoding = ENCODING
|
||||
if not encoding:
|
||||
encoding = 'ascii'
|
||||
|
||||
info = makeinfo(file, piece_length, encoding, flag, progress, progress_percent)
|
||||
if flag.isSet():
|
||||
return
|
||||
check_info(info)
|
||||
h = open(f, 'wb')
|
||||
data = {'info': info, 'announce': strip(url), 'creation date': long(time())}
|
||||
|
||||
if params.has_key('comment') and params['comment']:
|
||||
data['comment'] = params['comment']
|
||||
|
||||
if params.has_key('real_announce_list'): # shortcut for progs calling in from outside
|
||||
data['announce-list'] = params['real_announce_list']
|
||||
elif params.has_key('announce_list') and params['announce_list']:
|
||||
l = []
|
||||
for tier in params['announce_list'].split('|'):
|
||||
l.append(tier.split(','))
|
||||
data['announce-list'] = l
|
||||
|
||||
if params.has_key('real_httpseeds'): # shortcut for progs calling in from outside
|
||||
data['httpseeds'] = params['real_httpseeds']
|
||||
elif params.has_key('httpseeds') and params['httpseeds']:
|
||||
data['httpseeds'] = params['httpseeds'].split('|')
|
||||
|
||||
if params.has_key('url-list') and params['url-list']:
|
||||
data['url-list'] = params['url-list'].split('|')
|
||||
|
||||
if params.has_key('playtime') and params['playtime']:
|
||||
data['info']['playtime'] = params['playtime']
|
||||
|
||||
h.write(bencode(data))
|
||||
h.close()
|
||||
|
||||
def calcsize(file):
|
||||
if not isdir(file):
|
||||
return getsize(file)
|
||||
total = 0L
|
||||
for s in subfiles(abspath(file)):
|
||||
total += getsize(s[1])
|
||||
return total
|
||||
|
||||
|
||||
def uniconvertl(l, e):
|
||||
r = []
|
||||
try:
|
||||
for s in l:
|
||||
r.append(uniconvert(s, e))
|
||||
except UnicodeError:
|
||||
raise UnicodeError('bad filename: '+join(*l))
|
||||
return r
|
||||
|
||||
def uniconvert(s, e):
|
||||
try:
|
||||
if s.__class__.__name__ != 'unicode':
|
||||
s = unicode(s,e)
|
||||
except UnicodeError:
|
||||
raise UnicodeError('bad filename: '+s)
|
||||
return s.encode('utf-8')
|
||||
|
||||
def makeinfo(file, piece_length, encoding, flag, progress, progress_percent=1):
|
||||
file = abspath(file)
|
||||
if isdir(file):
|
||||
subs = subfiles(file)
|
||||
subs.sort()
|
||||
pieces = []
|
||||
sh = sha()
|
||||
done = 0L
|
||||
fs = []
|
||||
totalsize = 0.0
|
||||
totalhashed = 0L
|
||||
for p, f in subs:
|
||||
totalsize += getsize(f)
|
||||
|
||||
for p, f in subs:
|
||||
pos = 0L
|
||||
size = getsize(f)
|
||||
fs.append({'length': size, 'path': uniconvertl(p, encoding)})
|
||||
h = open(f, 'rb')
|
||||
while pos < size:
|
||||
a = min(size - pos, piece_length - done)
|
||||
sh.update(h.read(a))
|
||||
if flag.isSet():
|
||||
return
|
||||
done += a
|
||||
pos += a
|
||||
totalhashed += a
|
||||
|
||||
if done == piece_length:
|
||||
pieces.append(sh.digest())
|
||||
done = 0
|
||||
sh = sha()
|
||||
if progress_percent:
|
||||
progress(totalhashed / totalsize)
|
||||
else:
|
||||
progress(a)
|
||||
h.close()
|
||||
if done > 0:
|
||||
pieces.append(sh.digest())
|
||||
return {'pieces': ''.join(pieces),
|
||||
'piece length': piece_length, 'files': fs,
|
||||
'name': uniconvert(split(file)[1], encoding) }
|
||||
else:
|
||||
size = getsize(file)
|
||||
pieces = []
|
||||
p = 0L
|
||||
h = open(file, 'rb')
|
||||
while p < size:
|
||||
x = h.read(min(piece_length, size - p))
|
||||
if flag.isSet():
|
||||
return
|
||||
pieces.append(sha(x).digest())
|
||||
p += piece_length
|
||||
if p > size:
|
||||
p = size
|
||||
if progress_percent:
|
||||
progress(float(p) / size)
|
||||
else:
|
||||
progress(min(piece_length, size - p))
|
||||
h.close()
|
||||
return {'pieces': ''.join(pieces),
|
||||
'piece length': piece_length, 'length': size,
|
||||
'name': uniconvert(split(file)[1], encoding) }
|
||||
|
||||
def subfiles(d):
|
||||
r = []
|
||||
stack = [([], d)]
|
||||
while len(stack) > 0:
|
||||
p, n = stack.pop()
|
||||
if isdir(n):
|
||||
for s in listdir(n):
|
||||
if s not in ignore and s[:1] != '.':
|
||||
stack.append((copy(p) + [s], join(n, s)))
|
||||
else:
|
||||
r.append((p, n))
|
||||
return r
|
||||
|
||||
|
||||
def completedir(dir, url, params = {}, flag = Event(),
|
||||
vc = lambda x: None, fc = lambda x: None):
|
||||
files = listdir(dir)
|
||||
files.sort()
|
||||
ext = '.torrent'
|
||||
if params.has_key('target'):
|
||||
target = params['target']
|
||||
else:
|
||||
target = ''
|
||||
|
||||
togen = []
|
||||
for f in files:
|
||||
if f[-len(ext):] != ext and (f + ext) not in files:
|
||||
togen.append(join(dir, f))
|
||||
|
||||
total = 0
|
||||
for i in togen:
|
||||
total += calcsize(i)
|
||||
|
||||
subtotal = [0]
|
||||
def callback(x, subtotal = subtotal, total = total, vc = vc):
|
||||
subtotal[0] += x
|
||||
vc(float(subtotal[0]) / total)
|
||||
for i in togen:
|
||||
fc(i)
|
||||
try:
|
||||
t = split(i)[-1]
|
||||
if t not in ignore and t[0] != '.':
|
||||
if target != '':
|
||||
params['target'] = join(target,t+ext)
|
||||
make_meta_file(i, url, params, flag, progress = callback, progress_percent = 0)
|
||||
except ValueError:
|
||||
print_exc()
|
||||
20
Shared/lib/python3.4/site-packages/ox/utils.py
Normal file
20
Shared/lib/python3.4/site-packages/ox/utils.py
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
|
||||
try:
|
||||
from django.utils import datetime
|
||||
except:
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
import simplejson as json
|
||||
except ImportError:
|
||||
try:
|
||||
import json
|
||||
except ImportError:
|
||||
from django.utils import simplejson as json
|
||||
|
||||
try:
|
||||
import xml.etree.ElementTree as ET
|
||||
except:
|
||||
import elementtree.ElementTree as ET
|
||||
9
Shared/lib/python3.4/site-packages/ox/web/__init__.py
Normal file
9
Shared/lib/python3.4/site-packages/ox/web/__init__.py
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# encoding: utf-8
|
||||
__version__ = '1.0.0'
|
||||
|
||||
from . import imdb
|
||||
from . import wikipedia
|
||||
from . import google
|
||||
from . import piratecinema
|
||||
from . import oxdb
|
||||
20
Shared/lib/python3.4/site-packages/ox/web/abebooks.py
Normal file
20
Shared/lib/python3.4/site-packages/ox/web/abebooks.py
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
from ox.cache import read_url
|
||||
import re
|
||||
import lxml.html
|
||||
|
||||
def get_data(id):
|
||||
info = {}
|
||||
base = 'http://www.abebooks.com'
|
||||
url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
|
||||
data = read_url(url)
|
||||
urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
|
||||
if urls:
|
||||
details = '%s%s' % (base, urls[0])
|
||||
data = read_url(details)
|
||||
doc = lxml.html.document_fromstring(data)
|
||||
for e in doc.xpath("//*[contains(@id, 'biblio')]"):
|
||||
key = e.attrib['id'].replace('biblio-', '')
|
||||
value = e.text_content()
|
||||
if value and key not in ('bookcondition', 'binding'):
|
||||
info[key] = value
|
||||
return info
|
||||
85
Shared/lib/python3.4/site-packages/ox/web/allmovie.py
Normal file
85
Shared/lib/python3.4/site-packages/ox/web/allmovie.py
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from ox import strip_tags, find_re
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def get_data(id):
|
||||
'''
|
||||
>>> get_data('129689')['cast'][1][1]
|
||||
u'Marianne'
|
||||
>>> get_data('129689')['credits'][0][0]
|
||||
u'Jean-Luc Godard'
|
||||
>>> get_data('129689')['posters'][0]
|
||||
u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
|
||||
>>> get_data('129689')['rating']
|
||||
u'4.5'
|
||||
'''
|
||||
if id.startswith('http'):
|
||||
id = get_id(id)
|
||||
data = {
|
||||
"url": get_url(id)
|
||||
}
|
||||
html = read_url(data["url"], unicode=True)
|
||||
data['aka'] = parse_list(html, 'AKA')
|
||||
data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
|
||||
data['countries'] = parse_list(html, 'countries')
|
||||
data['director'] = parse_entry(html, 'directed by')
|
||||
data['genres'] = parse_list(html, 'genres')
|
||||
data['keywords'] = parse_list(html, 'keywords')
|
||||
data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
|
||||
data['produced'] = parse_list(html, 'produced by')
|
||||
data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
|
||||
data['released'] = parse_entry(html, 'released by')
|
||||
data['releasedate'] = parse_list(html, 'release date')
|
||||
data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
|
||||
data['set'] = parse_entry(html, 'set in')
|
||||
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||
data['themes'] = parse_list(html, 'themes')
|
||||
data['types'] = parse_list(html, 'types')
|
||||
data['year'] = find_re(html, '<span class="year">.*?(\d+)')
|
||||
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
|
||||
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
|
||||
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
|
||||
#data['cast'] = parse_table(html)
|
||||
#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
|
||||
#data['credits'] = parse_table(html)
|
||||
html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
|
||||
data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
|
||||
return data
|
||||
|
||||
def get_url(id):
|
||||
return "http://allmovie.com/work/%s" % id
|
||||
|
||||
def parse_entry(html, title):
|
||||
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
|
||||
return strip_tags(html).strip()
|
||||
|
||||
def parse_list(html, title):
|
||||
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
|
||||
r = list(map(strip_tags, re.compile('<li>(.*?)</li>', re.DOTALL).findall(html)))  # list() so the emptiness check below works in Python 3
|
||||
if not r and html:
|
||||
r = [strip_tags(html)]
|
||||
return r
|
||||
|
||||
def parse_table(html):
|
||||
return [
|
||||
[
|
||||
strip_tags(r).strip().replace(' ', '')
|
||||
for r in x.split('<td width="305">-')
|
||||
]
|
||||
for x in find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
|
||||
]
|
||||
|
||||
def parse_text(html, title):
|
||||
return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(get_data('129689'))
|
||||
# print get_data('177524')
|
||||
|
||||
77
Shared/lib/python3.4/site-packages/ox/web/amazon.py
Normal file
77
Shared/lib/python3.4/site-packages/ox/web/amazon.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from six.moves.urllib.parse import quote
|
||||
|
||||
from ox import find_re, strip_tags, decode_html
|
||||
from ox.cache import read_url
|
||||
|
||||
import lxml
|
||||
|
||||
|
||||
def findISBN(title, author):
|
||||
q = '%s %s' % (title, author)
|
||||
url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
|
||||
data = read_url(url, unicode=True)
|
||||
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
|
||||
id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
|
||||
data = get_data(id)
|
||||
if author in data['authors']:
|
||||
return data
|
||||
return {}
|
||||
|
||||
def get_data(id):
|
||||
url = "http://www.amazon.com/title/dp/%s/" % id
|
||||
data = read_url(url, unicode=True)
|
||||
|
||||
|
||||
def find_data(key):
|
||||
return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
|
||||
|
||||
r = {}
|
||||
r['amazon'] = url
|
||||
r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
|
||||
r['authors'] = []
|
||||
doc = lxml.html.document_fromstring(data)
|
||||
for e in doc.xpath("//span[contains(@class, 'author')]"):
|
||||
print(e)
|
||||
for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
|
||||
if 'Author' in secondary.text:
|
||||
author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
|
||||
if author:
|
||||
r['authors'].append(author[0].text.strip())
|
||||
else:
|
||||
r['authors'].append(e.xpath('.//a')[0].text.strip())
|
||||
break
|
||||
elif 'Translator' in secondary.text:
|
||||
r['translator'] = [e.xpath('.//a')[0].text]
|
||||
break
|
||||
r['publisher'] = find_data('Publisher')
|
||||
r['language'] = find_data('Language')
|
||||
r['isbn-10'] = find_data('ISBN-10')
|
||||
r['isbn-13'] = find_data('ISBN-13').replace('-', '')
|
||||
r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
|
||||
|
||||
r['pages'] = find_data('Paperback')
|
||||
if not r['pages']:
|
||||
r['pages'] = find_data('Hardcover')
|
||||
|
||||
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
|
||||
|
||||
for e in doc.xpath('//noscript'):
|
||||
for c in e.getchildren():
|
||||
if c.tag == 'div':
|
||||
r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
|
||||
break
|
||||
|
||||
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
|
||||
if r['cover']:
|
||||
r['cover'] = r['cover'][0].split('._BO2')[0]
|
||||
if not r['cover'].endswith('.jpg'):
|
||||
r['cover'] = r['cover'] + '.jpg'
|
||||
if 'no-image-avail-img' in r['cover']:
|
||||
del r['cover']
|
||||
else:
|
||||
del r['cover']
|
||||
return r
|
||||
|
||||
67
Shared/lib/python3.4/site-packages/ox/web/apple.py
Normal file
67
Shared/lib/python3.4/site-packages/ox/web/apple.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
import json
|
||||
import re
|
||||
|
||||
from ox.cache import read_url
|
||||
|
||||
HEADERS = {
|
||||
'User-Agent': 'iTunes/10.4 (Macintosh; Intel Mac OS X 10.7) AppleWebKit/534.48.3',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-us, en;q=0.50',
|
||||
'X-Apple-Store-Front': '143441-1,12',
|
||||
'X-Apple-Tz': '7200',
|
||||
'Accept-Encoding': 'gzip, deflate'
|
||||
}
|
||||
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
|
||||
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
|
||||
|
||||
def get_movie_data(title, director):
|
||||
if isinstance(title, unicode):
|
||||
title = title.encode('utf-8')
|
||||
if isinstance(director, unicode):
|
||||
director = director.encode('utf-8')
|
||||
data = {}
|
||||
# itunes section (preferred source for link)
|
||||
url = 'http://ax.search.itunes.apple.com/WebObjects/MZSearch.woa/wa/advancedSearch'
|
||||
url += '?media=movie&movieTerm=' + title
|
||||
url += '&actorNames=&directorProducerName=' + director
|
||||
url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'
|
||||
HEADERS['Referer'] = url
|
||||
html = read_url(url, headers=HEADERS, unicode=True)
|
||||
regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'
|
||||
regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'
|
||||
results = re.compile(regexp).findall(html)
|
||||
if results:
|
||||
data['link'] = results[0][0]
|
||||
data['poster'] = results[0][1].replace('140x140', '600x600')
|
||||
html = read_url(data['link'], headers=HEADERS, unicode=True)
|
||||
results = re.compile('video-preview-url="(.*?)"').findall(html)
|
||||
if results:
|
||||
data['trailer'] = results[0]
|
||||
# trailers section (preferred source for poster and trailer)
|
||||
host = 'http://trailers.apple.com'
|
||||
url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title
|
||||
js = json.loads(read_url(url, unicode=True)[16:-4])
|
||||
results = js['results']
|
||||
if results:
|
||||
url = host + results[0]['location']
|
||||
if not 'link' in data:
|
||||
data['link'] = url
|
||||
headers = {
|
||||
'User-Agent': USER_AGENT
|
||||
}
|
||||
html = read_url(url, headers=headers, unicode=True)
|
||||
results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
|
||||
if results:
|
||||
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
|
||||
html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
|
||||
results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
|
||||
if results:
|
||||
data['trailer'] = results[-1]
|
||||
return data
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(get_movie_data('Alphaville', 'Jean-Luc Godard'))
|
||||
print(get_movie_data('Sin City', 'Roberto Rodriguez'))
|
||||
print(get_movie_data('Breathless', 'Jean-Luc Godard'))
|
||||
print(get_movie_data('Capitalism: A Love Story', 'Michael Moore'))
|
||||
print(get_movie_data('Film Socialisme', 'Jean-Luc Godard'))
|
||||
26
Shared/lib/python3.4/site-packages/ox/web/archive.py
Normal file
26
Shared/lib/python3.4/site-packages/ox/web/archive.py
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from .. import cache
|
||||
from ..utils import json
|
||||
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.archive.org/details/%s" % id
|
||||
|
||||
def get_data(id):
|
||||
data = {}
|
||||
url = get_url(id)
|
||||
details = cache.read_url('%s?output=json' % url)
|
||||
details = json.loads(details)
|
||||
for key in ('title', 'description', 'runtime'):
|
||||
data[key] = details['metadata'][key]
|
||||
if isinstance(data[key], list):
|
||||
data[key] = data[key][0]
|
||||
data['url'] = url
|
||||
data['image'] = 'http://archive.org/download/%s/format=thumbnail' % id
|
||||
data['ogg'] = 'http://archive.org/download/%s/format=Ogg+video' % id
|
||||
data['mp4'] = 'http://archive.org/download/%s/format=512Kb+MPEG4' % id
|
||||
return data
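# Usage sketch (illustrative, the identifier is hypothetical):
#   get_data('some_item') returns a dict with 'title', 'description', 'runtime',
#   the detail page 'url' and derived download URLs ('image', 'ogg', 'mp4').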
|
||||
|
||||
71
Shared/lib/python3.4/site-packages/ox/web/arsenalberlin.py
Normal file
71
Shared/lib/python3.4/site-packages/ox/web/arsenalberlin.py
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
from ox import find_re, strip_tags
|
||||
from ox.cache import read_url
|
||||
|
||||
def get_data(id, language='en'):
|
||||
if language == 'de':
|
||||
url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d/lang/de_DE' % id
|
||||
else:
|
||||
url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d' % id
|
||||
html = read_url(url, unicode=True)
|
||||
if 'ID does not exist' in html:
|
||||
return None
|
||||
if 'Willkommen in der Datenbank des Arsenal' in html:
|
||||
return None
|
||||
data = {}
|
||||
data[u'id'] = id
|
||||
data[u'url'] = url
|
||||
m = re.compile('<h1>(.*?)</h1>').findall(html)
|
||||
if m:
|
||||
data[u'title'] = m[0]
|
||||
m = re.compile("<b>Director: </b><a href='.*?'>(.*?)</a>").findall(html)
|
||||
if m:
|
||||
data[u'director'] = m[0]
|
||||
|
||||
m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
|
||||
if m:
|
||||
data[u'image'] = m[0]
|
||||
|
||||
units = re.compile("<div class='unit'>(.*?)</div>", re.DOTALL).findall(html)
|
||||
for x in map(re.compile('<b>(.*?)</b>: (.*)', re.DOTALL).findall, units):
|
||||
if x:
|
||||
#data[x[0][0].lower()] = strip_tags(x[0][1])
|
||||
key = x[0][0].lower()
|
||||
data[key] = x[0][1]
|
||||
if key == "forum catalogue pdf":
|
||||
data[key] = find_re(data[key], '"(http:.*?)"')
|
||||
else:
|
||||
data[key] = strip_tags(data[key])
|
||||
if "running time (minutes)" in data:
|
||||
data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60
|
||||
for key in ('year', 'length in metres', 'forum participation year', 'number of reels'):
|
||||
if key in data and data[key].isdigit():
|
||||
data[key] = int(data[key])
|
||||
return data
|
||||
|
||||
def backup(filename):
|
||||
if os.path.exists(filename):
|
||||
with open(filename) as f:
|
||||
data = json.load(f)
|
||||
else:
|
||||
data = {}
|
||||
start = max(map(int, data)) if data else 1  # max() of an empty sequence would raise ValueError
|
||||
for i in range(start, 11872):
|
||||
info = get_data(i)
|
||||
if info:
|
||||
data[i] = info
|
||||
if len(data) % 10 == 0:
|
||||
print('save', filename, len(data))
|
||||
with open(filename, 'w') as f:
|
||||
json.dump(data, f)
|
||||
else:
|
||||
print('ignore', i)
|
||||
with open(filename, 'w') as f:
|
||||
json.dump(data, f)
|
||||
return data
|
||||
|
||||
33
Shared/lib/python3.4/site-packages/ox/web/auth.py
Normal file
33
Shared/lib/python3.4/site-packages/ox/web/auth.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# GPL 2009
|
||||
import os
|
||||
|
||||
from ox.utils import json
|
||||
|
||||
def get(key):
|
||||
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
|
||||
auth = {}
|
||||
if os.path.exists(user_auth):
|
||||
f = open(user_auth, "r")
|
||||
data = f.read()
|
||||
f.close()
|
||||
auth = json.loads(data)
|
||||
if key in auth:
|
||||
return auth[key]
|
||||
print "please add key %s to json file '%s'" % (key, user_auth)
|
||||
raise Exception("no key %s found" % key)
|
||||
|
||||
def update(key, value):
|
||||
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
|
||||
auth = {}
|
||||
if os.path.exists(user_auth):
|
||||
f = open(user_auth, "r")
|
||||
data = f.read()
|
||||
f.close()
|
||||
auth = json.loads(data)
|
||||
auth[key] = value
|
||||
f = open(user_auth, "w")
|
||||
f.write(json.dumps(auth, indent=2))
|
||||
f.close()
|
||||
|
||||
100
Shared/lib/python3.4/site-packages/ox/web/criterion.py
Normal file
100
Shared/lib/python3.4/site-packages/ox/web/criterion.py
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
# -*- coding: UTF-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
import ox.cache
|
||||
from ox.cache import read_url
|
||||
from ox.html import strip_tags
|
||||
from ox.text import find_re
|
||||
|
||||
from . import imdb  # absolute import of a sibling module fails in Python 3
|
||||
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.criterion.com/films/%s" % id
|
||||
|
||||
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
|
||||
'''
|
||||
>>> get_data('1333').get('imdbId')
|
||||
u'0060304'
|
||||
|
||||
>>> get_data('236')['posters'][0]
|
||||
u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'
|
||||
|
||||
>>> get_data('786')['posters'][0]
|
||||
u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
|
||||
'''
|
||||
data = {
|
||||
"url": get_url(id)
|
||||
}
|
||||
try:
|
||||
html = read_url(data["url"], timeout=timeout, unicode=True)
|
||||
except:
|
||||
html = ox.cache.read_url(data["url"], timeout=timeout)
|
||||
data["number"] = find_re(html, "<li>Spine #(\d+)")
|
||||
|
||||
data["title"] = find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>")
|
||||
data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
|
||||
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
|
||||
results = find_re(html, '<div class="left_column">(.*?)</div>')
|
||||
results = re.compile("<li>(.*?)</li>").findall(results)
|
||||
data["country"] = results[0]
|
||||
data["year"] = results[1]
|
||||
data["synopsis"] = strip_tags(find_re(html, "<div class=\"content_block last\">.*?<p>(.*?)</p>"))
|
||||
|
||||
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
|
||||
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
|
||||
r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
|
||||
if r:
|
||||
result = r[0]
|
||||
result = find_re(result, "<a href=\"(.*?)\"")
|
||||
if not "/boxsets/" in result:
|
||||
data["posters"] = [result]
|
||||
else:
|
||||
html_ = read_url(result, unicode=True)
|
||||
result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
|
||||
result = find_re(result, "src=\"(.*?)\"")
|
||||
if result:
|
||||
data["posters"] = [result.replace("_w100", "")]
|
||||
else:
|
||||
data["posters"] = []
|
||||
data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
|
||||
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
|
||||
if result:
|
||||
data["stills"] = [result]
|
||||
data["trailers"] = []
|
||||
else:
|
||||
data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
|
||||
data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
|
||||
|
||||
if timeout == ox.cache.cache_timeout:
|
||||
timeout = -1
|
||||
if get_imdb:
|
||||
# removed year, as "title (year)" may fail to match
|
||||
data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
|
||||
return data
|
||||
|
||||
def get_ids(page=None):
|
||||
ids = []
|
||||
if page:
|
||||
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
|
||||
html = read_url(url)
|
||||
results = re.compile("films/(\d+)").findall(html)
|
||||
ids += results
|
||||
results = re.compile("boxsets/(.*?)\"").findall(html)
|
||||
for result in results:
|
||||
html = read_url("http://www.criterion.com/boxsets/" + result)
|
||||
results = re.compile("films/(\d+)").findall(html)
|
||||
ids += results
|
||||
return set(ids)
|
||||
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
|
||||
results = re.compile("\&p=(\d+)\&").findall(html)
|
||||
pages = max(map(int, results))
|
||||
for page in range(1, pages):
|
||||
ids += get_ids(page)
|
||||
return sorted(set(ids), key=int)
|
||||
|
||||
if __name__ == '__main__':
|
||||
print get_ids()

21
Shared/lib/python3.4/site-packages/ox/web/dailymotion.py
Normal file
@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves.urllib.parse import unquote
from ox.cache import read_url


def get_video_url(url):
    '''
    >>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
    'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'

    >>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
    'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
    '''
    data = read_url(url)
    video = re.compile('''video", "(.*?)"''').findall(data)
    for v in video:
        v = unquote(v).split('@@')[0]
        return v
    return ''

22
Shared/lib/python3.4/site-packages/ox/web/duckduckgo.py
Normal file
@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re

from six.moves import urllib
import ox
from ox import strip_tags, decode_html
from ox.cache import read_url


def find(query, timeout=ox.cache.cache_timeout):
    if not isinstance(query, bytes):
        query = query.encode('utf-8')
    params = urllib.parse.urlencode({'q': query})
    url = 'http://duckduckgo.com/html/?' + params
    data = read_url(url, timeout=timeout).decode('utf-8')
    results = []
    regex = '<a .*?class="large" href="(.+?)">(.*?)</a>.*?<div class="snippet">(.*?)</div>'
    for r in re.compile(regex, re.DOTALL).findall(data):
        results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
    return results

49
Shared/lib/python3.4/site-packages/ox/web/epguides.py
Normal file
@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time

from ox import strip_tags, find_re
from ox.cache import read_url

import google


def get_show_url(title):
    '''
    Search Epguide Url for Show via Show Title.
    Use Google to search the url, this is also done on Epguide.
    '''
    for (name, url, desc) in google.find('allintitle: site:epguides.com %s' % title, 1):
        if url.startswith('http://epguides.com'):
            if re.search(title, name):
                return url
    return None

def get_show_data(url):
    data = read_url(url, unicode=True)
    r = {}
    r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
    r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
    r['episodes'] = {}
    #1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
    for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
        air_date = episode[3].strip()
        #'22 Sep 04' -> 2004-09-22
        try:
            air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
        except:
            pass
        s = episode[1].split('-')[0].strip()
        e = episode[1].split('-')[-1].strip()
        try:
            r['episodes']['S%02dE%02d' % (int(s), int(e))] = {
                'prod code': episode[2],
                'air date': air_date,
                'url': episode[4],
                'title':episode[5],
            }
        except:
            print("oxweb.epguides failed,", url)
    return r

39
Shared/lib/python3.4/site-packages/ox/web/filmsdivision.py
Normal file
@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import string
import subprocess
import ox
import os

def get_ids():
    result = []
    for i in string.ascii_uppercase:
        url = "http://www.filmsdivision.org/search.php?title=%s" % i
        data = ox.cache.read_url(url)
        links = re.compile('view_video.php\?movId=(.*?)[\'"]', re.DOTALL).findall(data)
        result += links
    return list(set(result))

def get_data(id):
    result = {}
    url = "http://www.filmsdivision.org/view_video.php?movId=%s" % id
    data = ox.cache.read_url(url)
    result['title'] = re.compile('<td.*?class="vdoheadtxt".*?>(.*?)</td>').findall(data)[0]
    result['year'] = re.compile('Release: (\d{4})').findall(data)[0]
    result['duration'] = int(re.compile('Duration: (\d+)mins').findall(data)[0]) * 60
    result['producer'] = re.compile('Producer: (.*?)\t').findall(data)[0].strip()
    if 'Director:' in data:
        result['director'] = re.compile('Director: (.*?)\t').findall(data)[0].strip()
    else:
        result['director'] = "Unknown Director"
    result['url'] = re.compile('value="(.*?.wmv)"').findall(data)[0]
    return result

def download_video(url, filename):
    dirname = os.path.dirname(filename)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    p = subprocess.Popen(['gst-launch', 'mmssrc', 'location=%s' % url, '!', 'filesink', 'location=%s' % filename])
    p.wait()
    return p.returncode == 0

74
Shared/lib/python3.4/site-packages/ox/web/flixter.py
Normal file
@ -0,0 +1,74 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4

import re
from lxml.html import document_fromstring

from ox.cache import read_url
from ox import find_re, strip_tags
from ox.web.imdb import ImdbCombined


def get_data(id, timeout=-1):
    '''
    >>> get_data('the-matrix')['poster']
    'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'

    >>> get_data('0133093')['poster']
    'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'

    >>> get_data('2-or-3-things-i-know-about-her')['poster']
    'http://content6.flixster.com/movie/10/95/43/10954392_gal.jpg'

    >>> get_data('0078875')['rottentomatoes_id']
    'http://www.rottentomatoes.com/m/the-tin-drum/'
    '''
    if len(id) == 7:
        try:
            int(id)
            id = get_id(imdb=id)
        except:
            pass
    data = {
        "url": get_url(id),
    }
    html = read_url(data['url'], timeout=timeout, unicode=True)
    doc = document_fromstring(html)

    props = {
        'og:title': 'title',
        'og:image': 'poster',
        'og:url': 'rottentomatoes_id',
    }
    for meta in doc.head.findall('meta'):
        prop = meta.attrib.get('property', None)
        content = meta.attrib.get('content', '')
        if prop in props and content:
            data[props[prop]] = content

    for p in doc.body.find_class('synopsis'):
        data['synopsis'] = p.text.strip()

    if 'poster' in data and data['poster']:
        data['poster'] = data['poster'].replace('_pro.jpg', '_gal.jpg')
    if not 'title' in data:
        return None
    return data

def get_id(url=None, imdb=None):
    '''
    >>> get_id(imdb='0133093')
    u'the-matrix'

    #>>> get_id(imdb='0060304')
    #u'2-or-3-things-i-know-about-her'
    '''
    if imdb:
        i = ImdbCombined(imdb)
        title = i['title']
        return title.replace(' ', '-').lower().replace("'", '')
    return url.split('/')[-1]

def get_url(id):
    return "http://www.flixster.com/movie/%s"%id

42
Shared/lib/python3.4/site-packages/ox/web/freebase.py
Normal file
@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import json

from ox.cache import read_url
from ox import find_re

class Freebase(dict):
    def __init__(self, id, timeout=-1):
        url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id
        '''
        "http://graph.freebase.com/imdb.title.tt%s" % id
        might also be of interest at some point, right now not much info
        '''
        data = read_url(url, unicode=True)
        try:
            data = json.loads(data)
        except ValueError:
            return
        '''
        for key in data:
            self[key] = data[key]
        '''
        for key in ('id', 'guid', 'name'):
            self[key] = data[key]
        keys = {
            'wikipedia': '/wikipedia/en',
            'netflix': '/authority/netflix/movie',
            'nytimes': '/source/nytimes/movie',
            'metacritic': '/source/metacritic/movie',
        }
        for key in keys:
            links = list(filter(lambda x: x['namespace'] == keys[key], data['ids']))
            if links:
                self[key] = links[0]['uri']

        if 'nytimes' in self:
            self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
            self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')

44
Shared/lib/python3.4/site-packages/ox/web/google.py
Normal file
@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves import urllib

import ox
from ox import strip_tags, decode_html

DEFAULT_MAX_RESULTS = 10
DEFAULT_TIMEOUT = 24*60*60

def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
    return ox.cache.read_url(url, data, headers, timeout, unicode=True)

def quote_plus(s):
    if not isinstance(s, bytes):
        s = s.encode('utf-8')
    return urllib.parse.quote_plus(s)

def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
    """
    Return max_results tuples with title, url, description

    >>> find("The Matrix site:imdb.com", 1)[0][0]
    u'The Matrix (1999) - IMDb'

    >>> find("The Matrix site:imdb.com", 1)[0][1]
    u'http://www.imdb.com/title/tt0133093/'
    """
    results = []
    offset = 0
    while len(results) < max_results:
        url = 'http://google.com/search?q=%s' % quote_plus(query)
        if offset:
            url += '&start=%d' % offset
        data = read_url(url, timeout=timeout)
        data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
        for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
            results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
            if len(results) >= max_results:
                break
        offset += 10
    return results

821
Shared/lib/python3.4/site-packages/ox/web/imdb.py
Normal file
@ -0,0 +1,821 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
import time
|
||||
import unicodedata
|
||||
|
||||
from six.moves import urllib
|
||||
from six import string_types
|
||||
|
||||
|
||||
from .. import find_re, strip_tags, decode_html
|
||||
from .. import cache
|
||||
|
||||
|
||||
from . siteparser import SiteParser
|
||||
from . import duckduckgo
|
||||
from ..utils import datetime
|
||||
from ..geo import normalize_country_name
|
||||
|
||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||
headers = headers.copy()
|
||||
return cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.imdb.com/title/tt%s/" % id
|
||||
|
||||
class Imdb(SiteParser):
|
||||
'''
|
||||
>>> Imdb('0068646')['title']
|
||||
u'The Godfather'
|
||||
|
||||
>>> Imdb('0133093')['title']
|
||||
u'The Matrix'
|
||||
'''
|
||||
regex = {
|
||||
'alternativeTitles': {
|
||||
'page': 'releaseinfo',
|
||||
're': [
|
||||
'name="akas".*?<table.*?>(.*?)</table>',
|
||||
"td>(.*?)</td>.*?<td>(.*?)</td>"
|
||||
],
|
||||
'type': 'list'
|
||||
|
||||
},
|
||||
'aspectratio': {
|
||||
'page': 'combined',
|
||||
're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
|
||||
'type': 'float',
|
||||
},
|
||||
'budget': {
|
||||
'page': 'business',
|
||||
're': [
|
||||
'<h5>Budget</h5>\s*?\$(.*?)<br',
|
||||
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'cast': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
|
||||
lambda ll: [strip_tags(l) for l in ll]
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'cinematographer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Cinematography by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'connections': {
|
||||
'page': 'trivia?tab=mc',
|
||||
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
|
||||
'type': 'list'
|
||||
},
|
||||
'country': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<div class="info"><h5>Country:</h5>.*?<div class="info">',
|
||||
#'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
|
||||
'<a.*?>(.*?)</a>',
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'creator': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Creator.?:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'director': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('<b>Series Crew</b>')[0],
|
||||
'Directed by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'_director': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'editor': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Film Editing by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'composer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Original Music by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'episodeTitle': {
|
||||
'page': 'combined',
|
||||
're': '<div id="tn15title">.*?<em>(.*?)</em>',
|
||||
'type': 'string'
|
||||
},
|
||||
'filmingLocations': {
|
||||
'page': 'locations',
|
||||
're': [
|
||||
'<a href="/search/title\?locations=.*?".*?>(.*?)</a>',
|
||||
lambda data: data.strip(),
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'genre': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Genre:</h5>(.*?)<hr',
|
||||
'<a href="/Sections/Genres/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'gross': {
|
||||
'page': 'business',
|
||||
're': [
|
||||
'<h5>Gross</h5>\s*?\$(.*?)<br',
|
||||
lambda data: find_re(data.replace(',', ''), '\d+')
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'keyword': {
|
||||
'page': 'keywords',
|
||||
're': '<a href="/keyword/.*?>(.*?)</a>',
|
||||
'type': 'list'
|
||||
},
|
||||
'language': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<div class="info"><h5>Language:</h5>.*?<div class="info">',
|
||||
#'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
|
||||
'<a.*?>(.*?)</a>',
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'summary': {
|
||||
'page': 'plotsummary',
|
||||
're': '<p class="plotSummary">(.*?)<\/p>',
|
||||
'type': 'string'
|
||||
},
|
||||
'posterId': {
|
||||
'page': 'combined',
|
||||
're': '/primary-photo/media/rm(.*?)/tt',
|
||||
'type': 'string'
|
||||
},
|
||||
'posterIds': {
|
||||
'page': 'posters',
|
||||
're': '/unknown-thumbnail/media/rm(.*?)/tt',
|
||||
'type': 'list'
|
||||
},
|
||||
'producer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Produced by</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'productionCompany': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'Production Companies</b><ul>(.*?)</ul>',
|
||||
'<a href="/company/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'rating': {
|
||||
'page': 'combined',
|
||||
're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
|
||||
'type': 'float'
|
||||
},
|
||||
'releasedate': {
|
||||
'page': 'releaseinfo',
|
||||
're': [
|
||||
'<td class="release_date">(.*?)</td>',
|
||||
strip_tags,
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'reviews': {
|
||||
'page': 'externalreviews',
|
||||
're': [
|
||||
'<ol>(.*?)</ol>',
|
||||
'<li><a href="(http.*?)".*?>(.*?)</a></li>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'runtime': {
|
||||
'page': 'combined',
|
||||
're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
|
||||
'type': 'string'
|
||||
},
|
||||
'color': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Color:</h5><div class="info-content">(.*?)</div>',
|
||||
'<a.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'sound': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
|
||||
'<a.*?>(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'season': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'\(Season (\d+), Episode \d+\)',
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'episode': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
|
||||
'\(Season \d+, Episode (\d+)\)',
|
||||
],
|
||||
'type': 'int'
|
||||
},
|
||||
'series': {
|
||||
'page': 'combined',
|
||||
're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
|
||||
'type': 'string'
|
||||
},
|
||||
'isSeries': {
|
||||
'page': 'combined',
|
||||
're': '<span class="tv-extra">(TV series|TV mini-series) ',
|
||||
'type': 'string'
|
||||
},
|
||||
'title': {
|
||||
'page': 'combined',
|
||||
're': '<h1>(.*?) <span>',
|
||||
'type': 'string'
|
||||
},
|
||||
'trivia': {
|
||||
'page': 'trivia',
|
||||
're': [
|
||||
'<div class="sodatext">(.*?)<(br|/div)',
|
||||
lambda data: data[0]
|
||||
],
|
||||
'type': 'list',
|
||||
},
|
||||
'votes': {
|
||||
'page': 'combined',
|
||||
're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
|
||||
'type': 'string'
|
||||
},
|
||||
'writer': {
|
||||
'page': 'combined',
|
||||
're': [
|
||||
lambda data: data.split('Series Crew')[0],
|
||||
'Writing credits</a>(.*?)</table>',
|
||||
'<a href="/name/.*?/">(.*?)</a>'
|
||||
],
|
||||
'type': 'list'
|
||||
},
|
||||
'year': {
|
||||
'page': 'combined',
|
||||
're': '="og:title" content="[^"]*?\((\d{4}).*?"',
|
||||
'type': 'int'
|
||||
}
|
||||
}
|
||||
|
||||
def read_url(self, url, timeout):
|
||||
if not url in self._cache:
|
||||
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
|
||||
return self._cache[url]
|
||||
|
||||
def __init__(self, id, timeout=-1):
|
||||
#use akas.imdb.com to always get original title:
|
||||
#http://www.imdb.com/help/show_leaf?titlelanguagedisplay
|
||||
self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
|
||||
super(Imdb, self).__init__(timeout)
|
||||
|
||||
url = self.baseUrl + 'combined'
|
||||
page = self.read_url(url, timeout=-1)
|
||||
if '<title>IMDb: Page not found</title>' in page \
|
||||
or 'The requested URL was not found on our server.' in page:
|
||||
return
|
||||
if "<p>We're sorry, something went wrong.</p>" in page:
|
||||
time.sleep(1)
|
||||
super(Imdb, self).__init__(0)
|
||||
|
||||
if 'alternativeTitles' in self:
|
||||
if len(self['alternativeTitles']) == 2 and \
|
||||
isinstance(self['alternativeTitles'][0], string_types):
|
||||
self['alternativeTitles'] = [self['alternativeTitles']]
|
||||
|
||||
#normalize country names
|
||||
if 'country' in self:
|
||||
self['country'] = [normalize_country_name(c) or c for c in self['country']]
|
||||
|
||||
if 'sound' in self:
|
||||
self['sound'] = list(set(self['sound']))
|
||||
|
||||
types = {}
|
||||
stop_words = [
|
||||
'alternative spelling',
|
||||
'alternative title',
|
||||
'alternative transliteration',
|
||||
'closing credits title',
|
||||
'complete title',
|
||||
'IMAX version',
|
||||
'informal short title',
|
||||
'International (Spanish title)',
|
||||
'Japan (imdb display title)',
|
||||
'longer version',
|
||||
'new title',
|
||||
'original subtitled version',
|
||||
'pre-release title',
|
||||
'promotional abbreviation',
|
||||
'recut version',
|
||||
'reissue title',
|
||||
'restored version',
|
||||
'script title',
|
||||
'short title',
|
||||
'(subtitle)',
|
||||
'TV title',
|
||||
'working title',
|
||||
'World-wide (Spanish title)',
|
||||
]
|
||||
#ignore english japanese titles
|
||||
#for movies that are not only from japan
|
||||
if ['Japan'] != self.get('country', []):
|
||||
stop_words += [
|
||||
'Japan (English title)'
|
||||
]
|
||||
for t in self.get('alternativeTitles', []):
|
||||
for type in t[0].split('/'):
|
||||
type = type.strip()
|
||||
stop_word = False
|
||||
for key in stop_words:
|
||||
if key in type:
|
||||
stop_word = True
|
||||
break
|
||||
if not stop_word:
|
||||
if not type in types:
|
||||
types[type] = []
|
||||
types[type].append(t[1])
|
||||
titles = {}
|
||||
for type in types:
|
||||
for title in types[type]:
|
||||
if not title in titles:
|
||||
titles[title] = []
|
||||
titles[title].append(type)
|
||||
def select_title(type):
|
||||
title = types[type][0]
|
||||
count = 0
|
||||
if len(types[type]) > 1:
|
||||
for t in types[type]:
|
||||
if len(titles[t]) > count:
|
||||
count = len(titles[t])
|
||||
title = t
|
||||
return title
|
||||
|
||||
#FIXME: does work in python2.6, possible to import from __future__?
|
||||
#types = {type: select_title(type) for type in types}
|
||||
_types = {}
|
||||
for type in types:
|
||||
_types[type] = select_title(type)
|
||||
types = _types
|
||||
|
||||
regexps = [
|
||||
"^.+ \(imdb display title\) \(English title\)$",
|
||||
"^USA \(imdb display title\)$",
|
||||
"^International \(English title\)$",
|
||||
"^International \(English title\)$",
|
||||
"^UK \(imdb display title\)$",
|
||||
"^International \(.+\) \(English title\)$",
|
||||
"^World-wide \(English title\)$",
|
||||
]
|
||||
if 'Hong Kong' in self.get('country', []):
|
||||
regexps += [
|
||||
"Hong Kong \(English title\)"
|
||||
]
|
||||
english_countries = (
|
||||
'USA', 'UK', 'United States', 'United Kingdom',
|
||||
'Australia', 'New Zealand'
|
||||
)
|
||||
if not list(filter(lambda c: c in english_countries, self.get('country', []))):
|
||||
regexps += [
|
||||
"^[^(]+ \(English title\)$",
|
||||
"^.+ \(.+\) \(English title\)$",
|
||||
"^USA$",
|
||||
"^UK$",
|
||||
"^USA \(.+\)$",
|
||||
"^UK \(.+\)$",
|
||||
"^Australia \(.+\)$",
|
||||
"World-wide \(English title\)",
|
||||
"\(literal English title\)",
|
||||
"^International \(.+ title\)$",
|
||||
"^International \(.+\) \(.+ title\)$",
|
||||
]
|
||||
for regexp in regexps:
|
||||
for type in types:
|
||||
if re.compile(regexp).findall(type):
|
||||
#print types[type], type
|
||||
self['internationalTitle'] = types[type]
|
||||
break
|
||||
if 'internationalTitle' in self:
|
||||
break
|
||||
|
||||
def cleanup_title(title):
|
||||
if title.startswith('"') and title.endswith('"'):
|
||||
title = title[1:-1]
|
||||
if title.startswith("'") and title.endswith("'"):
|
||||
title = title[1:-1]
|
||||
title = re.sub('\(\#[.\d]+\)', '', title)
|
||||
return title.strip()
|
||||
|
||||
for t in ('title', 'internationalTitle'):
|
||||
if t in self:
|
||||
self[t] = cleanup_title(self[t])
|
||||
|
||||
if 'internationalTitle' in self and \
|
||||
self.get('title', '').lower() == self['internationalTitle'].lower():
|
||||
del self['internationalTitle']
|
||||
|
||||
if 'alternativeTitles' in self:
|
||||
alt = {}
|
||||
for t in self['alternativeTitles']:
|
||||
title = cleanup_title(t[1])
|
||||
if title not in (self.get('title'), self.get('internationalTitle')):
|
||||
if title not in alt:
|
||||
alt[title] = []
|
||||
for c in t[0].split('/'):
|
||||
if not '(working title)' in c:
|
||||
c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
|
||||
if c:
|
||||
alt[title].append(c)
|
||||
self['alternativeTitles'] = []
|
||||
for t in sorted(alt, key=lambda a: sorted(alt[a])):
|
||||
if alt[t]:
|
||||
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
|
||||
self['alternativeTitles'].append((t, countries))
|
||||
if not self['alternativeTitles']:
|
||||
del self['alternativeTitles']
|
||||
|
||||
if 'internationalTitle' in self:
|
||||
self['originalTitle'] = self['title']
|
||||
self['title'] = self.pop('internationalTitle')
|
||||
|
||||
if 'runtime' in self and self['runtime']:
|
||||
if 'min' in self['runtime']: base=60
|
||||
else: base=1
|
||||
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
|
||||
if 'runtime' in self and not self['runtime']:
|
||||
del self['runtime']
|
||||
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
|
||||
|
||||
if 'cast' in self:
|
||||
if isinstance(self['cast'][0], string_types):
|
||||
self['cast'] = [self['cast']]
|
||||
self['actor'] = [c[0] for c in self['cast']]
|
||||
def cleanup_character(c):
|
||||
c = c.replace('(uncredited)', '').strip()
|
||||
return c
|
||||
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
|
||||
for x in self['cast']]
|
||||
|
||||
if 'connections' in self:
|
||||
cc={}
|
||||
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
|
||||
self['connections'] = [self['connections']]
|
||||
for rel, data, _ in self['connections']:
|
||||
if isinstance(rel, bytes):
|
||||
rel = rel.decode('utf-8')
|
||||
#cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
|
||||
def get_conn(c):
|
||||
r = {
|
||||
'id': c[0],
|
||||
'title': cleanup_title(c[1]),
|
||||
}
|
||||
description = c[2].split('<br />')
|
||||
if len(description) == 2 and description[-1].strip() != '-':
|
||||
r['description'] = description[-1].strip()
|
||||
return r
|
||||
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
|
||||
|
||||
|
||||
self['connections'] = cc
|
||||
|
||||
for key in ('country', 'genre'):
|
||||
if key in self:
|
||||
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
|
||||
#0092999
|
||||
if '_director' in self:
|
||||
if 'series' in self or 'isSeries' in self:
|
||||
self['creator'] = self.pop('_director')
|
||||
else:
|
||||
del self['_director']
|
||||
if 'isSeries' in self:
|
||||
del self['isSeries']
|
||||
self['isSeries'] = True
|
||||
if 'episodeTitle' in self:
|
||||
self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle'])
|
||||
|
||||
if 'series' in self:
|
||||
series = Imdb(self['series'], timeout=timeout)
|
||||
self['seriesTitle'] = series['title']
|
||||
if 'episodeTitle' in self:
|
||||
self['seriesTitle'] = series['title']
|
||||
if 'season' in self and 'episode' in self:
|
||||
self['title'] = "%s (S%02dE%02d) %s" % (
|
||||
self['seriesTitle'], self['season'], self['episode'], self['episodeTitle'])
|
||||
else:
|
||||
self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle'])
|
||||
self['season'] = 1
|
||||
self['title'] = self['title'].strip()
|
||||
if 'director' in self:
|
||||
self['episodeDirector'] = self['director']
|
||||
|
||||
if not 'creator' in series and 'director' in series:
|
||||
series['creator'] = series['director']
|
||||
if len(series['creator']) > 10:
|
||||
series['creator'] = series['director'][:1]
|
||||
|
||||
for key in ['creator', 'country']:
|
||||
if key in series:
|
||||
self[key] = series[key]
|
||||
|
||||
if 'year' in series:
|
||||
self['seriesYear'] = series['year']
|
||||
if not 'year' in self:
|
||||
self['year'] = series['year']
|
||||
|
||||
if 'year' in self:
|
||||
self['episodeYear'] = self['year']
|
||||
if 'creator' in self:
|
||||
self['seriesDirector'] = self['creator']
|
||||
if 'originalTitle' in self:
|
||||
del self['originalTitle']
|
||||
else:
|
||||
for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'):
|
||||
if key in self:
|
||||
del self[key]
|
||||
if 'creator' in self:
|
||||
if 'director' in self:
|
||||
self['episodeDirector'] = self['director']
|
||||
self['director'] = self['creator']
|
||||
|
||||
#make lists unique but keep order
|
||||
for key in ('director', 'language'):
|
||||
if key in self:
|
||||
self[key] = [x for i,x in enumerate(self[key])
|
||||
if x not in self[key][i+1:]]
|
||||
|
||||
for key in ('actor', 'writer', 'producer', 'editor', 'composer'):
|
||||
if key in self:
|
||||
if isinstance(self[key][0], list):
|
||||
self[key] = [i[0] for i in self[key] if i]
|
||||
self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
|
||||
|
||||
|
||||
if 'budget' in self and 'gross' in self:
|
||||
self['profit'] = self['gross'] - self['budget']
|
||||
|
||||
if 'releasedate' in self:
|
||||
def parse_date(d):
|
||||
try:
|
||||
d = datetime.strptime(d, '%d %B %Y')
|
||||
except:
|
||||
try:
|
||||
d = datetime.strptime(d, '%B %Y')
|
||||
except:
|
||||
return 'x'
|
||||
return '%d-%02d-%02d' % (d.year, d.month, d.day)
|
||||
self['releasedate'] = min([
|
||||
parse_date(d) for d in self['releasedate']
|
||||
])
|
||||
if self['releasedate'] == 'x':
|
||||
del self['releasedate']
|
||||
if 'summary' in self:
|
||||
if isinstance(self['summary'], list):
|
||||
self['summary'] = self['summary'][0]
|
||||
self['summary'] = self['summary'].split('</p')[0].strip()
|
||||
|
||||
class ImdbCombined(Imdb):
|
||||
def __init__(self, id, timeout=-1):
|
||||
_regex = {}
|
||||
for key in self.regex:
|
||||
if self.regex[key]['page'] in ('combined', 'releaseinfo'):
|
||||
_regex[key] = self.regex[key]
|
||||
self.regex = _regex
|
||||
super(ImdbCombined, self).__init__(id, timeout)
|
||||
|
||||
def get_movie_by_title(title, timeout=-1):
|
||||
'''
|
||||
This only works for exact title matches from the data dump
|
||||
Usually in the format
|
||||
Title (Year)
|
||||
"Series Title" (Year) {(#Season.Episode)}
|
||||
"Series Title" (Year) {Episode Title (#Season.Episode)}
|
||||
|
||||
If there is more than one film with that title for the year
|
||||
Title (Year/I)
|
||||
|
||||
>>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')
|
||||
u'1602860'
|
||||
|
||||
>>> get_movie_by_title(u'The Matrix (1999)')
|
||||
u'0133093'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt (1951)')
|
||||
u'0043748'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt (1897/I)')
|
||||
u'0214882'
|
||||
|
||||
>>> get_movie_by_title(u'Little Egypt')
|
||||
None
|
||||
|
||||
>>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
|
||||
u'0866567'
|
||||
'''
|
||||
params = {'s':'tt','q': title}
|
||||
if not isinstance(title, bytes):
|
||||
try:
|
||||
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
|
||||
except:
|
||||
params['q'] = params['q'].encode('utf-8')
|
||||
params = urllib.parse.urlencode(params)
|
||||
url = "http://akas.imdb.com/find?" + params
|
||||
data = read_url(url, timeout=timeout, unicode=True)
|
||||
#if search results in redirect, get id of current page
|
||||
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
|
||||
results = re.compile(r).findall(data)
|
||||
if results:
|
||||
return results[0]
|
||||
return None
|
||||
|
||||
def get_movie_id(title, director='', year='', timeout=-1):
|
||||
'''
|
||||
>>> get_movie_id('The Matrix')
|
||||
u'0133093'
|
||||
|
||||
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
|
||||
u'0060304'
|
||||
|
||||
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
|
||||
u'0060304'
|
||||
|
||||
>>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
|
||||
u'0179214'
|
||||
|
||||
>>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
|
||||
u'0179214'
|
||||
'''
|
||||
imdbId = {
|
||||
(u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
|
||||
(u'Wings', u'Larisa Shepitko'): '0061196',
|
||||
(u'The Ascent', u'Larisa Shepitko'): '0075404',
|
||||
(u'Fanny and Alexander', u'Ingmar Bergman'): '0083922',
|
||||
(u'Torment', u'Alf Sj\xf6berg'): '0036914',
|
||||
(u'Crisis', u'Ingmar Bergman'): '0038675',
|
||||
(u'To Joy', u'Ingmar Bergman'): '0043048',
|
||||
(u'Humain, trop humain', u'Louis Malle'): '0071635',
|
||||
(u'Place de la R\xe9publique', u'Louis Malle'): '0071999',
|
||||
(u'God\u2019s Country', u'Louis Malle'): '0091125',
|
||||
(u'Flunky, Work Hard', u'Mikio Naruse'): '0022036',
|
||||
(u'The Courtesans of Bombay', u'Richard Robbins') : '0163591',
|
||||
(u'Je tu il elle', u'Chantal Akerman') : '0071690',
|
||||
(u'Hotel Monterey', u'Chantal Akerman') : '0068725',
|
||||
(u'No Blood Relation', u'Mikio Naruse') : '023261',
|
||||
(u'Apart from You', u'Mikio Naruse') : '0024214',
|
||||
(u'Every-Night Dreams', u'Mikio Naruse') : '0024793',
|
||||
(u'Street Without End', u'Mikio Naruse') : '0025338',
|
||||
(u'Sisters of the Gion', u'Kenji Mizoguchi') : '0027672',
|
||||
(u'Osaka Elegy', u'Kenji Mizoguchi') : '0028021',
|
||||
(u'Blaise Pascal', u'Roberto Rossellini') : '0066839',
|
||||
(u'Japanese Girls at the Harbor', u'Hiroshi Shimizu') : '0160535',
|
||||
(u'The Private Life of Don Juan', u'Alexander Korda') : '0025681',
|
||||
(u'Last Holiday', u'Henry Cass') : '0042665',
|
||||
(u'A Colt Is My Passport', u'Takashi Nomura') : '0330536',
|
||||
(u'Androcles and the Lion', u'Chester Erskine') : '0044355',
|
||||
(u'Major Barbara', u'Gabriel Pascal') : '0033868',
|
||||
(u'Come On Children', u'Allan King') : '0269104',
|
||||
|
||||
(u'Jimi Plays Monterey & Shake! Otis at Monterey', u'D. A. Pennebaker and Chris Hegedus') : '',
|
||||
(u'Martha Graham: Dance on Film', u'Nathan Kroll') : '',
|
||||
(u'Carmen', u'Carlos Saura'): '0085297',
|
||||
(u'The Story of a Cheat', u'Sacha Guitry'): '0028201',
|
||||
(u'Weekend', 'Andrew Haigh'): '1714210',
|
||||
}.get((title, director), None)
|
||||
if imdbId:
|
||||
return imdbId
|
||||
params = {'s':'tt','q': title}
|
||||
if director:
|
||||
params['q'] = u'"%s" %s' % (title, director)
|
||||
if year:
|
||||
params['q'] = u'"%s (%s)" %s' % (title, year, director)
|
||||
google_query = "site:imdb.com %s" % params['q']
|
||||
if not isinstance(params['q'], bytes):
|
||||
try:
|
||||
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
|
||||
except:
|
||||
params['q'] = params['q'].encode('utf-8')
|
||||
params = urllib.parse.urlencode(params)
|
||||
url = "http://akas.imdb.com/find?" + params
|
||||
#print url
|
||||
|
||||
data = read_url(url, timeout=timeout, unicode=True)
|
||||
#if search results in redirect, get id of current page
|
||||
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
|
||||
results = re.compile(r).findall(data)
|
||||
if results:
|
||||
return results[0]
|
||||
#otherwise get first result
|
||||
r = '<td valign="top">.*?<a href="/title/tt(\d{7})/"'
|
||||
results = re.compile(r).findall(data)
|
||||
if results:
|
||||
return results[0]
|
||||
|
||||
#print (title, director), ": '',"
|
||||
#print google_query
|
||||
#results = google.find(google_query, timeout=timeout)
|
||||
results = duckduckgo.find(google_query, timeout=timeout)
|
||||
if results:
|
||||
for r in results[:2]:
|
||||
imdbId = find_re(r[1], 'title/tt(\d{7})')
|
||||
if imdbId:
|
||||
return imdbId
|
||||
#or nothing
|
||||
return ''
|
||||
|
||||
def get_movie_poster(imdbId):
|
||||
'''
|
||||
>>> get_movie_poster('0133093')
|
||||
'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
|
||||
|
||||
>>> get_movie_poster('0994352')
|
||||
'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
|
||||
'''
|
||||
info = ImdbCombined(imdbId)
|
||||
if 'posterId' in info:
|
||||
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
|
||||
data = read_url(url).decode('utf-8', 'ignore')
|
||||
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
|
||||
return poster
|
||||
elif 'series' in info:
|
||||
return get_movie_poster(info['series'])
|
||||
return ''
|
||||
|
||||
def get_episodes(imdbId, season=None):
|
||||
episodes = {}
|
||||
url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
|
||||
if season:
|
||||
url += '?season=%d' % season
|
||||
data = cache.read_url(url)
|
||||
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
|
||||
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
|
||||
else:
|
||||
data = cache.read_url(url)
|
||||
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
|
||||
if match:
|
||||
for season in range(1, int(match[0]) + 1):
|
||||
episodes.update(get_episodes(imdbId, season))
|
||||
return episodes
|
||||
|
||||
def max_votes():
|
||||
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
|
||||
data = cache.read_url(url)
|
||||
votes = max([int(v.replace(',', ''))
|
||||
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
|
||||
return votes
|
||||
|
||||
def guess(title, director='', timeout=-1):
|
||||
return get_movie_id(title, director, timeout=timeout)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import json
|
||||
print(json.dumps(Imdb('0306414'), indent=2))
|
||||
#print json.dumps(Imdb('0133093'), indent=2)
|
||||
|
||||
300
Shared/lib/python3.4/site-packages/ox/web/impawards.py
Normal file
|
|
@ -0,0 +1,300 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# encoding: utf-8
|
||||
import re
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox.html import strip_tags
|
||||
from ox.text import find_re
|
||||
|
||||
|
||||
def get_data(id):
|
||||
'''
|
||||
>>> get_data('1991/silence_of_the_lambs')['imdbId']
|
||||
u'0102926'
|
||||
|
||||
>>> get_data('1991/silence_of_the_lambs')['posters'][0]
|
||||
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
|
||||
|
||||
>>> get_data('1991/silence_of_the_lambs')['url']
|
||||
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
|
||||
'''
|
||||
data = {
|
||||
'url': get_url(id)
|
||||
}
|
||||
html = read_url(data['url'], unicode=True)
|
||||
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
|
||||
if not data['imdbId']:
|
||||
data['imdbId'] = _id_map.get(id, '')
|
||||
data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
|
||||
data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
|
||||
data['posters'] = []
|
||||
poster = find_re(html, '<img src="(posters.*?)"')
|
||||
if poster:
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
|
||||
data['posters'].append(poster)
|
||||
results = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
result = result.replace('_xlg.html', '.html')
|
||||
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
|
||||
html = read_url(url, unicode=True)
|
||||
result = find_re(html, '<a href = (\w*?_xlg.html)')
|
||||
if result:
|
||||
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
|
||||
html = read_url(url, unicode=True)
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
|
||||
else:
|
||||
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
|
||||
data['posters'].append(poster)
|
||||
|
||||
return data
|
||||
|
||||
def get_id(url):
|
||||
split = url.split('/')
|
||||
year = split[3]
|
||||
split = split[4][:-5].split('_')
|
||||
if split[-1] == 'xlg':
|
||||
split.pop()
|
||||
if find_re(split[-1], 'ver\d+$'):
|
||||
split.pop()
|
||||
id = '%s/%s' % (year, '_'.join(split))
|
||||
return id
|
||||
|
||||
def get_ids(page=None):
|
||||
ids = []
|
||||
if page:
|
||||
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
|
||||
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
url = 'http://impawards.com/%s' % result
|
||||
ids.append(get_id(url))
|
||||
return set(ids)
|
||||
#get all
|
||||
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
|
||||
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
|
||||
for page in range(pages, 0, -1):
|
||||
for id in get_ids(page):
|
||||
if not id in ids:
|
||||
ids.append(id)
|
||||
return ids
|
||||
|
||||
def get_url(id):
|
||||
url = u"http://www.impawards.com/%s.html" % id
|
||||
html = read_url(url, unicode=True)
|
||||
if find_re(html, "No Movie Posters on This Page"):
|
||||
url = u"http://www.impawards.com/%s_ver1.html" % id
|
||||
return url
|
||||
|
||||
_id_map = {
|
||||
'1933/forty_second_street': '0024034',
|
||||
'1933/tarzan_the_fearless': '0024645',
|
||||
'1935/informer': '0026529',
|
||||
'1935/thirty_nine_steps': '0026529',
|
||||
'1935/top_hat': '0027125',
|
||||
'1938/charlie_chaplin_cavalcade': '0284687',
|
||||
'1943/falcon_and_the_co-eds': '035855',
|
||||
'1969/angel_angel_down_we_go': '0065602',
|
||||
'1970/crimson_altar': '0062833',
|
||||
'1975/man_who_would_be_king_ver1': '0073341',
|
||||
'1975/picnic_at_hanging_rock_ver1': '0073540',
|
||||
'1979/electric_horseman_ver1': '0079100',
|
||||
'1980/caligula_ver1': '0080491',
|
||||
'1980/hollywood_knights_ver1': '0080881',
|
||||
'1981/history_of_the_world_part_i': '0082517',
|
||||
'1981/sea_wolves': '0081470',
|
||||
'1983/krull_ver1': '0085811',
|
||||
'1985/warriors_of_the_wind': '0087544',
|
||||
'1989/friday_the_thirteenth_part_viii_ver1': '0097388',
|
||||
'1989/high_hopes': '0095302',
|
||||
'1989/millenium': '0097883',
|
||||
'1989/story_of_women': '0096336',
|
||||
'1990/edward_scissorhands_ver1': '0099487',
|
||||
'1991/freddys_dead_ver1': '0101917',
|
||||
'1993/robocop_three_ver1': '0107978',
|
||||
'1993/waynes_world_two_ver1': '0108525',
|
||||
'1994/above_the_rim_ver1': '0109035',
|
||||
'1994/helas_pour_moi': '0107175',
|
||||
'1994/house_of_the_spirits_ver1': '0107151',
|
||||
'1994/i_dont_want_to_talk_about_it': '0106678',
|
||||
'1994/in_custody': '0107199',
|
||||
'1994/ladybird_ladybird': '0110296',
|
||||
'1994/leon_the_pig_farmer': '0104710',
|
||||
'1994/love_after_love': '0103710',
|
||||
'1994/l_six_two_seven': '0104658',
|
||||
'1994/martin_lawrence_you_so_crazy_ver1': '0111804',
|
||||
'1994/savage_nights': '0105032',
|
||||
'1994/sex_drugs_and_democracy': '0111135',
|
||||
'1995/bye_bye_love': '0112606',
|
||||
'1995/cold_comfort_farm': '0112701',
|
||||
'1995/gumby_the_movie': '0113234',
|
||||
'1995/les_miserables': '0113828',
|
||||
'1995/mystery_of_rampo': '0110943',
|
||||
'1995/pharaohs_army': '0114122',
|
||||
'1995/pure_formality': '0110917',
|
||||
'1995/quick_and_the_dead_ver1': '0114214',
|
||||
'1995/reflections_in_the_dark': '0110956',
|
||||
'1995/safe_ver1': '0114323',
|
||||
'1995/search_and_destroy': '0114371',
|
||||
'1995/secret_of_roan_inish_ver1': '0111112',
|
||||
'1995/underneath': '0114788',
|
||||
'1996/ghost_in_the_shell': '0113568',
|
||||
'1996/hate': '0113247',
|
||||
'1996/horseman_on_the_roof': '0113362',
|
||||
'1996/kids_in_the_hall_brain_candy': '0116768',
|
||||
'1996/maybe_maybe_not': '0109255',
|
||||
'1996/prisoner_of_the_mountains': '0116754',
|
||||
'1997/fifth_element_ver1': '0119116',
|
||||
'1997/fools_rush_in_ver1': '0119141',
|
||||
'1997/gi_jane_ver1': '0119173',
|
||||
'1997/happy_together_ver1': '0118845',
|
||||
'1997/lilies': '0116882',
|
||||
'1997/mouth_to_mouth': '0112546',
|
||||
'1997/mr_nice_guy': '0117786',
|
||||
'1997/nenette_and_boni': '0117221',
|
||||
'1997/paperback_romance': '0110405',
|
||||
'1997/second_jungle_book': '0120087',
|
||||
'1997/single_girl': '0113057',
|
||||
'1997/super_speedway': '0120245',
|
||||
'1997/temptress_moon': '0116295',
|
||||
'1998/alarmist': '0119534',
|
||||
'1998/barneys_great_adventure_the_movie': '0120598',
|
||||
'1998/bulworth_ver1': '0118798',
|
||||
'1998/celebration': '0154420',
|
||||
'1998/east_palace_west_palace': '0119007',
|
||||
'1998/hurricane_streets': '0119338',
|
||||
'1998/i_married_a_strange_person': '0119346',
|
||||
'1998/inheritors': '0141824',
|
||||
'1998/killing_time': '0140312',
|
||||
'1998/live_flesh': '0118819',
|
||||
'1998/music_from_another_room': '0119734',
|
||||
'1998/post_coitum_ver1': '0119923',
|
||||
'1998/steam_the_turkish_bath': '0119248',
|
||||
'1998/velocity_of_gary': '0120878',
|
||||
'1999/after_life': '0165078',
|
||||
'1999/emperor_and_the_assassin': '0162866',
|
||||
'1999/fantasia_two_thousand': '0120910',
|
||||
'1999/get_bruce': '0184510',
|
||||
'1999/god_said_ha': '0119207',
|
||||
'1999/jawbreaker': '0155776',
|
||||
'1999/jeanne_and_the_perfect_guy': '0123923',
|
||||
'1999/king_and_i': '0160429',
|
||||
'1999/lovers_of_the_arctic_circle': '0133363',
|
||||
'1999/plunkett_and_macleane': '0134033',
|
||||
'1999/pokemon_the_first_movie': '0190641',
|
||||
'1999/school_of_flesh': '0157208',
|
||||
'1999/splendor': '0127296',
|
||||
'1999/stranger_in_the_kingdom': '0126680',
|
||||
'1999/train_of_life': '0170705',
|
||||
'1999/twice_upon_a_yesterday': '0138590',
|
||||
'1999/whiteboys': '0178988',
|
||||
'1999/wildfire': '0194544',
|
||||
'1999/windhorse': '0169388',
|
||||
'2000/claim': '0218378',
|
||||
'2000/color_of_paradise': '0191043',
|
||||
'2000/criminal_lovers': '0205735',
|
||||
'2000/everlasting_piece': '0218182',
|
||||
'2000/girl_on_the_bridge_ver1': '0144201',
|
||||
'2000/godzilla_two_thousand': '0188640',
|
||||
'2000/goya_in_bordeaux': '0210717',
|
||||
'2000/mad_about_mambo': '0156757',
|
||||
'2000/picking_up_the_pieces': '0192455',
|
||||
'2000/pokemon_the_movie_2000': '0257001',
|
||||
'2000/seven_days_to_live': '0221928',
|
||||
'2000/south_of_heaven_west_of_hell': '0179473',
|
||||
'2000/suzhou_river': '0234837',
|
||||
'2000/time_for_drunken_horses': '0259072',
|
||||
'2000/venus_beauty_institute': '0174330',
|
||||
'2001/circle': '0368646',
|
||||
'2001/devils_backbone': '0256009',
|
||||
'2001/kill_me_later': '0243595',
|
||||
'2001/king_is_dancing': '0244173',
|
||||
'2001/learning_curve': '0219126',
|
||||
'2001/marco_polo__return_to_xanadu_ver1': '0296074',
|
||||
'2001/me_you_them': '0244504',
|
||||
'2001/our_lady_of_the_assassins': '0250809',
|
||||
'2001/pinero': '0261066',
|
||||
'2001/pokemon_three_the_movie_ver1': '0266860',
|
||||
'2001/scratch': '0143861',
|
||||
'2001/vampire_hunter_d_bloodlust_ver1': '0216651',
|
||||
'2002/el_bosque_animado': '0310790',
|
||||
'2002/fifty_first_state': '0227984',
|
||||
'2002/les_destinees': '0216689',
|
||||
'2002/sons_room': '0208990',
|
||||
'2003/open_hearts': '0315543',
|
||||
'2003/tulse_luper_suitcases': '0307596',
|
||||
'2003/valentin': '0296915',
|
||||
'2004/if_only_ver1': '0332136',
|
||||
'2004/wondrous_oblivion': '0334725',
|
||||
'2005/wu_ji': '0417976',
|
||||
'2006/golden_door': '0465188',
|
||||
'2006/kin': '1091189',
|
||||
'2007/revenge_of_the_nerds': '0088000',
|
||||
'2008/bad_batch': '1605644',
|
||||
'2008/mercedes': '1368083',
|
||||
'2008/spirit': '0831887',
|
||||
'2009/dead_air': '0993841',
|
||||
'2009/edge_of_love': '0819714',
|
||||
'2009/fuel': '1072437',
|
||||
'2009/fuel': '1072437',
|
||||
'2009/one_good_man': '1239357',
|
||||
'2009/st_trinians': '1210106',
|
||||
'2009/surveillance': '0409345',
|
||||
'2009/taken': '0936501',
|
||||
'2009/vaml': '1610453',
|
||||
'2010/adopting_haiti': '1764164',
|
||||
'2010/afterlife': '0838247',
|
||||
'2010/agora': '1186830',
|
||||
'2010/athlete': '1356996',
|
||||
'2010/beneath_the_blue': '1222698',
|
||||
'2010/bitch_slap': '1212974',
|
||||
'2010/black_waters_of_echos_pond': '0960066',
|
||||
'2010/case_thirty_nine': '0795351',
|
||||
'2010/finite_and_infinite_games': '1772268',
|
||||
'2010/hole': '1085779',
|
||||
'2010/jolene': '0867334',
|
||||
'2010/lake_mungo': '0816556',
|
||||
'2010/last_day_of_summer': '1242544',
|
||||
'2010/leaves_of_grass': '1151359',
|
||||
'2010/life_of_lemon': '1466057',
|
||||
'2010/man_in_the_maze': '1721692',
|
||||
'2010/mr_immortality_the_life_and_times_of_twista': '1711017',
|
||||
'2010/paper_man': '0437405',
|
||||
'2010/perfect_game': '0473102',
|
||||
'2010/red_baron': '0365675',
|
||||
'2010/satin': '0433397',
|
||||
'2010/shutter_island': '1130884',
|
||||
'2010/strange_powers': '1534075',
|
||||
'2010/suicidegirls_must_die': '1584733',
|
||||
'2010/veronika_decides_to_die': '1068678',
|
||||
'2010/witchblade': '0494292',
|
||||
'2010/youth_in_revolt': '0403702',
|
||||
'2011/beastly': '1152398',
|
||||
'2011/burning_palms': '1283887',
|
||||
'2011/cabin_in_the_woods': '1259521',
|
||||
'2011/conan': '0816462',
|
||||
'2011/courageous': '1630036',
|
||||
'2011/cruces_divided_two': '1698645',
|
||||
'2011/green_with_envy': '1204342',
|
||||
'2011/happythankyoumoreplease': '1481572',
|
||||
'2011/homework': '1645080',
|
||||
'2011/i_got_next': '1915570',
|
||||
'2011/lebanon_pa': '1290082',
|
||||
'2011/money_pet': '1965198',
|
||||
'2011/my_suicide': '0492896',
|
||||
'2011/priest': '0822847',
|
||||
'2011/prowl': '1559033',
|
||||
'2011/red_sonja': '0800175',
|
||||
'2011/season_of_the_witch': '0479997',
|
||||
'2011/stay_cool': '1235807',
|
||||
'2011/sympathy_for_delicious': '1270277',
|
||||
'2011/trust': '1529572',
|
||||
'2011/undefeated': '1961604',
|
||||
'2011/vanishing_on_seventh_street': '1452628',
|
||||
'2011/where_is_robert_fisher': '2042712',
|
||||
'2011/yellowbrickroad': '1398428',
|
||||
'2012/haywire': '1506999',
|
||||
'2012/last_call_at_the_oasis': '2043900',
|
||||
}
|
||||
|
||||
if __name__ == '__main__':
|
||||
ids = get_ids()
|
||||
print(sorted(ids), len(ids))
|
||||
187
Shared/lib/python3.4/site-packages/ox/web/itunes.py
Normal file
|
|
@ -0,0 +1,187 @@
|
|||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
# encoding: utf-8
|
||||
import re
|
||||
from six.moves import urllib
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox.html import decode_html, strip_tags
|
||||
from ox.text import find_re
|
||||
from ox.text import find_string
|
||||
|
||||
|
||||
# to sniff itunes traffic, use something like
|
||||
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
|
||||
|
||||
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=music&songTerm=&genreIndex=1&flavor=0&mediaType=2&composerTerm=&allArtistNames=Arcadia&ringtone=0&searchButton=submit
|
||||
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=movie&movieTerm=The%20Matrix&descriptionTerm=&ratingIndex=1&mediaType=3&directorProducerName=Andy%20Wachowski&flavor=0&releaseYearTerm=1999&closedCaption=0&actorTerm=&searchButton=submit
|
||||
|
||||
ITUNES_HEADERS = {
|
||||
'X-Apple-Tz': '0',
|
||||
'X-Apple-Storefront': '143441-1',
|
||||
'User-Agent': 'iTunes/7.6.2 (Macintosh; U; Intel Mac OS X 10.5.2)',
|
||||
'Accept-Language': 'en-us, en;q=0.50',
|
||||
'Accept-Encoding': 'gzip',
|
||||
'Connection': 'close',
|
||||
}
|
||||
|
||||
def compose_url(request, parameters):
|
||||
if request == 'advancedSearch':
|
||||
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
|
||||
if parameters['media'] == 'music':
|
||||
url += urllib.parse.urlencode({
|
||||
'albumTerm': parameters['title'],
|
||||
'allArtistNames': parameters['artist'],
|
||||
'composerTerm': '',
|
||||
'flavor': 0,
|
||||
'genreIndex': 1,
|
||||
'media': 'music',
|
||||
'mediaType': 2,
|
||||
'ringtone': 0,
|
||||
'searchButton': 'submit',
|
||||
'songTerm': ''
|
||||
})
|
||||
elif parameters['media'] == 'movie':
|
||||
url += urllib.parse.urlencode({
|
||||
'actorTerm': '',
|
||||
'closedCaption': 0,
|
||||
'descriptionTerm': '',
|
||||
'directorProducerName': parameters['director'],
|
||||
'flavor': 0,
|
||||
'media': 'movie',
|
||||
'mediaType': 3,
|
||||
'movieTerm': parameters['title'],
|
||||
'ratingIndex': 1,
|
||||
'releaseYearTerm': '',
|
||||
'searchButton': 'submit'
|
||||
})
|
||||
elif request == 'viewAlbum':
|
||||
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
|
||||
elif request == 'viewMovie':
|
||||
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
|
||||
return url
|
||||
|
||||
def parse_xml_dict(xml):
|
||||
values = {}
|
||||
strings = xml.split('<key>')
|
||||
for string in strings:
|
||||
if string.find('</key>') != -1:
|
||||
key = find_re(string, '(.*?)</key>')
|
||||
type = find_re(string, '</key><(.*?)>')
|
||||
if type == 'true/':
|
||||
value = True
|
||||
else:
|
||||
value = find_re(string, '<%s>(.*?)</%s>' % (type, type))
|
||||
if type == 'integer':
|
||||
value = int(value)
|
||||
elif type == 'string':
|
||||
value = decode_html(value)
|
||||
values[key] = value
|
||||
return values
|
||||
|
||||
def parse_cast(xml, title):
|
||||
list = []
|
||||
try:
|
||||
strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
|
||||
strings.pop()
|
||||
for string in strings:
|
||||
list.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
return list
|
||||
except:
|
||||
return list
|
||||
|
||||
def parse_movies(xml, title):
|
||||
list = []
|
||||
try:
|
||||
strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
|
||||
strings.pop()
|
||||
for string in strings:
|
||||
list.append({
|
||||
'id': find_re(string, 'viewMovie\?id=(.*?)&'),
|
||||
'title': find_re(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
|
||||
})
|
||||
return list
|
||||
except:
|
||||
return list
|
||||
|
||||
class ItunesAlbum:
|
||||
def __init__(self, id = '', title = '', artist = ''):
|
||||
self.id = id
|
||||
self.title = title
|
||||
self.artist = artist
|
||||
if not id:
|
||||
self.id = self.get_id()
|
||||
|
||||
def get_id(self):
|
||||
url = compose_url('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
|
||||
xml = read_url(url, headers = ITUNES_HEADERS)
|
||||
id = find_re(xml, 'viewAlbum\?id=(.*?)&')
|
||||
return id
|
||||
|
||||
def get_data(self):
|
||||
data = {'id': self.id}
|
||||
url = compose_url('viewAlbum', {'id': self.id})
|
||||
xml = read_url(url, None, ITUNES_HEADERS)
|
||||
data['albumName'] = find_re(xml, '<B>(.*?)</B>')
|
||||
data['artistName'] = find_re(xml, '<b>(.*?)</b>')
|
||||
data['coverUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
|
||||
data['genre'] = find_re(xml, 'Genre:(.*?)<')
|
||||
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
|
||||
data['review'] = strip_tags(find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
data['tracks'] = []
|
||||
strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
|
||||
for string in strings:
|
||||
data['tracks'].append(parse_xml_dict(string))
|
||||
data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
|
||||
return data
|
||||
|
||||
class ItunesMovie:
|
||||
def __init__(self, id = '', title = '', director = ''):
|
||||
self.id = id
|
||||
self.title = title
|
||||
self.director = director
|
||||
if not id:
|
||||
self.id = self.get_id()
|
||||
|
||||
def get_id(self):
|
||||
url = compose_url('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
|
||||
xml = read_url(url, headers = ITUNES_HEADERS)
|
||||
id = find_re(xml, 'viewMovie\?id=(.*?)&')
|
||||
return id
|
||||
|
||||
def get_data(self):
|
||||
data = {'id': self.id}
|
||||
url = compose_url('viewMovie', {'id': self.id})
|
||||
xml = read_url(url, None, ITUNES_HEADERS)
|
||||
f = open('/Users/rolux/Desktop/iTunesData.xml', 'w')
|
||||
f.write(xml)
|
||||
f.close()
|
||||
data['actors'] = parse_cast(xml, 'actors')
|
||||
string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
|
||||
data['averageRating'] = string.count('rating_star_000033.png') + string.count('½') * 0.5
|
||||
data['directors'] = parse_cast(xml, 'directors')
|
||||
data['format'] = find_re(xml, 'Format:(.*?)<')
|
||||
data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
|
||||
data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
|
||||
data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
|
||||
data['producers'] = parse_cast(xml, 'producers')
|
||||
data['rated'] = find_re(xml, 'Rated(.*?)<')
|
||||
data['relatedMovies'] = parse_movies(xml, 'related movies')
|
||||
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
|
||||
data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
|
||||
data['screenwriters'] = parse_cast(xml, 'screenwriters')
|
||||
data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
|
||||
data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
|
||||
return data
|
||||
|
||||
if __name__ == '__main__':
    from ox.utils import json
    data = ItunesAlbum(title='So Red the Rose', artist='Arcadia').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))
    data = ItunesMovie(title='The Matrix', director='Wachowski').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))
    for v in data['relatedMovies']:
        data = ItunesMovie(id=v['id']).get_data()
        print(json.dumps(data, sort_keys=True, indent=4))
    data = ItunesMovie(id='272960052').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))
42
Shared/lib/python3.4/site-packages/ox/web/lookupbyisbn.py
Normal file
@ -0,0 +1,42 @@
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
import re
|
||||
|
||||
base = 'http://www.lookupbyisbn.com'
|
||||
|
||||
def get_data(isbn):
|
||||
r = {}
|
||||
url = '%s/Search/Book/%s/1' % (base, isbn)
|
||||
|
||||
data = read_url(url).decode('utf-8')
|
||||
m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
|
||||
if m:
|
||||
ids = m[0].split('/')
|
||||
r['isbn'] = ids[-2]
|
||||
r['asin'] = ids[-3]
|
||||
url = '%s%s' % (base, m[0])
|
||||
data = read_url(url).decode('utf-8')
|
||||
r["title"] = find_re(data, "<h2>(.*?)</h2>")
|
||||
keys = {
|
||||
'author': 'Author(s)',
|
||||
'publisher': 'Publisher',
|
||||
'date': 'Publication date',
|
||||
'edition': 'Edition',
|
||||
'binding': 'Binding',
|
||||
'volume': 'Volume(s)',
|
||||
'pages': 'Pages',
|
||||
}
|
||||
for key in keys:
|
||||
r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>'% re.escape(keys[key]))
|
||||
if r[key] == '--':
|
||||
r[key] = ''
|
||||
if key == 'pages' and r[key]:
|
||||
r[key] = int(r[key])
|
||||
desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
|
||||
desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
|
||||
r['description'] = strip_tags(desc).strip()
|
||||
if r['description'] == u'Description of this item is not available at this time.':
|
||||
r['description'] = ''
|
||||
r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
|
||||
return r
|
||||
|
||||
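A short usage sketch for the module above; the ISBN is only an illustrative value, and the call assumes the ox package is importable and the site is reachable:

from ox.web.lookupbyisbn import get_data

info = get_data('9780156031448')  # hypothetical ISBN, for illustration only
print(info.get('title'), 'by', info.get('author'))
print(info.get('publisher'), info.get('date'), 'pages:', info.get('pages'))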
21
Shared/lib/python3.4/site-packages/ox/web/lyricsfly.py
Normal file
@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox.cache import read_url
from ox.html import decode_html
from ox.text import find_re


def get_lyrics(title, artist):
    html = read_url('http://lyricsfly.com/api/')
    key = find_re(html, '<font color=green><b>(.*?)</b></font>')
    url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
    xml = read_url(url)
    lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
    lyrics = lyrics.replace('\n', '').replace('\r', '')
    lyrics = lyrics.replace('[br]', '\n').strip()
    lyrics = lyrics.replace('\n\n\n', '\n\n')
    lyrics = decode_html(lyrics.replace('&amp;', '&'))
    return lyrics

if __name__ == '__main__':
    print(get_lyrics('Election Day', 'Arcadia'))
63
Shared/lib/python3.4/site-packages/ox/web/metacritic.py
Normal file
@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from six.moves.urllib.parse import quote
|
||||
from lxml.html import document_fromstring
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
|
||||
def get_url(id=None, imdb=None):
|
||||
if imdb:
|
||||
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
|
||||
data = read_url(url)
|
||||
metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
|
||||
return metacritic_url or None
|
||||
return 'http://www.metacritic.com/movie/%s' % id
|
||||
|
||||
def get_id(url):
|
||||
return url.split('/')[-1]
|
||||
|
||||
def get_show_url(title):
|
||||
title = quote(title)
|
||||
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
|
||||
data = read_url(url)
|
||||
return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
|
||||
|
||||
def get_data(url):
|
||||
data = read_url(url, unicode=True)
|
||||
doc = document_fromstring(data)
|
||||
score = [s for s in doc.xpath('//span[@class="score_value"]')
         if s.attrib.get('property') == 'v:average']
if score:
    score = int(score[0].text)
else:
    score = -1
|
||||
authors = [a.text
|
||||
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')]
|
||||
sources = [d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')]
|
||||
reviews = [d.text
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')]
|
||||
scores = [int(d.text.strip())
|
||||
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')]
|
||||
urls = [a.attrib['href']
|
||||
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')]
|
||||
|
||||
metacritics = []
|
||||
for i in range(len(authors)):
|
||||
metacritics.append({
|
||||
'critic': authors[i],
|
||||
'url': urls[i],
|
||||
'source': sources[i],
|
||||
'quote': strip_tags(reviews[i]).strip(),
|
||||
'score': scores[i],
|
||||
})
|
||||
|
||||
return {
|
||||
'critics': metacritics,
|
||||
'id': get_id(url),
|
||||
'score': score,
|
||||
'url': url,
|
||||
}
|
||||
|
||||
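The two helpers above are meant to chain: resolve a Metacritic URL from an IMDb id, then scrape the critic reviews. A minimal sketch (the IMDb id is illustrative and network access is assumed):

from ox.web import metacritic

url = metacritic.get_url(imdb='0133093')  # illustrative IMDb id
if url:
    data = metacritic.get_data(url)
    print(data['score'], 'metascore,', len(data['critics']), 'critic quotes')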
121
Shared/lib/python3.4/site-packages/ox/web/mininova.py
Normal file
@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from datetime import datetime
|
||||
import re
|
||||
import socket
|
||||
from six.moves.urllib.parse import quote
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, int_value, normalize_newlines
|
||||
from ox.normalize import normalize_imdbid
|
||||
import ox
|
||||
|
||||
from .torrent import Torrent
|
||||
|
||||
|
||||
def _parse_results_page(data, max_results=10):
|
||||
results=[]
|
||||
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
|
||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||
torrentDate = row[0]
|
||||
torrentExtra = row[1]
|
||||
torrentId = row[2]
|
||||
torrentTitle = decode_html(row[3]).strip()
|
||||
torrentLink = "http://www.mininova.org/tor/" + torrentId
|
||||
privateTracker = 'priv.gif' in torrentExtra
|
||||
if not privateTracker:
|
||||
results.append((torrentTitle, torrentLink, ''))
|
||||
return results
|
||||
|
||||
def find_movie(query=None, imdb=None, max_results=10):
|
||||
'''search for torrents on mininova
|
||||
'''
|
||||
if imdb:
|
||||
url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
|
||||
else:
|
||||
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
|
||||
data = read_url(url, unicode=True)
|
||||
return _parse_results_page(data, max_results)
|
||||
|
||||
def get_id(mininovaId):
|
||||
mininovaId = str(mininovaId)
|
||||
d = find_re(mininovaId, "/(\d+)")
|
||||
if d:
|
||||
return d
|
||||
mininovaId = mininovaId.split('/')
|
||||
if len(mininovaId) == 1:
|
||||
return mininovaId[0]
|
||||
else:
|
||||
return mininovaId[-1]
|
||||
|
||||
def exists(mininovaId):
|
||||
mininovaId = get_id(mininovaId)
|
||||
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
|
||||
if not data or 'Torrent not found...' in data:
|
||||
return False
|
||||
if 'tracker</a> of this torrent requires registration.' in data:
|
||||
return False
|
||||
return True
|
||||
|
||||
def get_data(mininovaId):
|
||||
_key_map = {
|
||||
'by': u'uploader',
|
||||
}
|
||||
mininovaId = get_id(mininovaId)
|
||||
torrent = dict()
|
||||
torrent[u'id'] = mininovaId
|
||||
torrent[u'domain'] = 'mininova.org'
|
||||
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
|
||||
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
|
||||
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
|
||||
|
||||
data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
|
||||
if '<h1>Torrent not found...</h1>' in data:
|
||||
return None
|
||||
|
||||
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
|
||||
key = d[0].lower().strip()
|
||||
key = _key_map.get(key, key)
|
||||
value = decode_html(strip_tags(d[1].strip()))
|
||||
torrent[key] = value
|
||||
|
||||
torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
|
||||
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
|
||||
torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
|
||||
if torrent['description']:
|
||||
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
|
||||
t = read_url(torrent[u'torrent_link'])
|
||||
torrent[u'torrent_info'] = get_torrent_info(t)
|
||||
return torrent
|
||||
|
||||
class Mininova(Torrent):
|
||||
'''
|
||||
>>> Mininova('123')
|
||||
{}
|
||||
>>> Mininova('1072195')['infohash']
|
||||
'72dfa59d2338e4a48c78cec9de25964cddb64104'
|
||||
'''
|
||||
def __init__(self, mininovaId):
|
||||
self.data = get_data(mininovaId)
|
||||
if not self.data:
|
||||
return
|
||||
Torrent.__init__(self)
|
||||
ratio = self.data['share ratio'].split(',')
|
||||
self['seeder'] = -1
|
||||
self['leecher'] = -1
|
||||
if len(ratio) == 2:
|
||||
val = int_value(ratio[0].replace(',','').strip())
|
||||
if val:
|
||||
self['seeder'] = int(val)
|
||||
val = int_value(ratio[1].replace(',','').strip())
|
||||
if val:
|
||||
self['leecher'] = int(val)
|
||||
val = int_value(self.data['downloads'].replace(',','').strip())
|
||||
if val:
|
||||
self['downloaded'] = int(val)
|
||||
else:
|
||||
self['downloaded'] = -1
|
||||
published = self.data['added on']
|
||||
published = published.split(' +')[0]
|
||||
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")
|
||||
|
||||
44
Shared/lib/python3.4/site-packages/ox/web/movieposterdb.py
Normal file
@ -0,0 +1,44 @@
# -*- coding: UTF-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
|
||||
import re
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re
|
||||
|
||||
def get_data(id):
|
||||
'''
|
||||
>>> get_data('0060304')['posters'][0]
|
||||
u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
|
||||
>>> get_data('0123456')['posters']
|
||||
[]
|
||||
'''
|
||||
data = {
|
||||
"url": get_url(id)
|
||||
}
|
||||
data["posters"] = get_posters(data["url"])
|
||||
return data
|
||||
|
||||
def get_id(url):
|
||||
return url.split("/")[-2]
|
||||
|
||||
def get_posters(url, group=True, timeout=-1):
|
||||
posters = []
|
||||
html = read_url(url, timeout=timeout, unicode=True)
|
||||
if url in html:
|
||||
if group:
|
||||
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
posters += get_posters(result, False)
|
||||
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
|
||||
for result in results:
|
||||
html = read_url(result, timeout=timeout, unicode=True)
|
||||
posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
|
||||
return posters
|
||||
|
||||
def get_url(id):
|
||||
return "http://www.movieposterdb.com/movie/%s/" % id
|
||||
|
||||
if __name__ == '__main__':
    print(get_data('0060304'))
    print(get_data('0133093'))
41
Shared/lib/python3.4/site-packages/ox/web/opensubtitles.py
Normal file
@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
import feedparser
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
from ox.iso import langCode2To3, langTo3Code
|
||||
|
||||
def find_subtitles(imdb, parts = 1, language = "eng"):
|
||||
if len(language) == 2:
|
||||
language = langCode2To3(language)
|
||||
elif len(language) != 3:
|
||||
language = langTo3Code(language)
|
||||
url = "http://www.opensubtitles.org/en/search/"
|
||||
if language:
|
||||
url += "sublanguageid-%s/" % language
|
||||
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
|
||||
data = read_url(url, unicode=True)
|
||||
if "title>opensubtitles.com - search results</title" in data:
|
||||
fd = feedparser.parse(data)
|
||||
opensubtitleId = None
|
||||
if fd.entries:
|
||||
link = fd.entries[0]['links'][0]['href']
|
||||
opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
|
||||
if opensubtitleId:
|
||||
opensubtitleId = opensubtitleId[0]
|
||||
else:
|
||||
opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
|
||||
return opensubtitleId
|
||||
|
||||
def download_subtitle(opensubtitle_id):
|
||||
srts = {}
|
||||
data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
|
||||
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
|
||||
for f in re.compile(reg_exp, re.DOTALL).findall(data):
|
||||
name = strip_tags(f[1]).split('\n')[0]
|
||||
url = "http://www.opensubtitles.com%s" % f[0]
|
||||
srts[name] = read_url(url, unicode=True)
|
||||
return srts
|
||||
|
||||
10
Shared/lib/python3.4/site-packages/ox/web/oxdb.py
Normal file
@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import ox.cache

def get_poster_url(id):
    url = "http://0xdb.org/%s/poster.0xdb.jpg" % id
    if ox.cache.exists(url):
        return url
    return ''
19
Shared/lib/python3.4/site-packages/ox/web/piratecinema.py
Normal file
@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function

import re
from ox.net import read_url

def get_poster_url(id):
    url = 'http://piratecinema.org/posters/'
    html = read_url(url, unicode=True)
    results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
    for result in results:
        if result[1] == id:
            return url + result[0]
    return ''

if __name__ == '__main__':
    print(get_poster_url('0749451'))
54
Shared/lib/python3.4/site-packages/ox/web/rottentomatoes.py
Normal file
@ -0,0 +1,54 @@
# -*- coding: UTF-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_re, strip_tags
|
||||
|
||||
|
||||
def get_url(id=None, imdb=None):
|
||||
#this would also work but does not cache:
|
||||
'''
|
||||
from urllib2 import urlopen
|
||||
u = urlopen(url)
|
||||
return u.url
|
||||
'''
|
||||
if imdb:
|
||||
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
|
||||
data = read_url(url)
|
||||
if "movie_title" in data:
|
||||
movies = re.compile('(/m/.*?/)').findall(data)
|
||||
if movies:
|
||||
return "http://www.rottentomatoes.com" + movies[0]
|
||||
return None
|
||||
|
||||
def get_og(data, key):
|
||||
return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
|
||||
|
||||
def get_data(url):
|
||||
data = read_url(url, unicode=True)
|
||||
r = {}
|
||||
r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
|
||||
if '(' in r['title']:
|
||||
r['year'] = find_re(r['title'], '\((\d*?)\)')
|
||||
r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
|
||||
r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
|
||||
r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace(' ', ' ').replace(' ', ' ')
|
||||
if not r['summary']:
|
||||
r['summary'] = get_og(data, 'description')
|
||||
|
||||
meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
|
||||
meter = [m for m in meter if m[1].isdigit()]
if meter:
    r['tomatometer'] = meter[0][1]
|
||||
r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
|
||||
r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
|
||||
r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
|
||||
poster = get_og(data, 'image')
|
||||
if poster and not 'poster_default.gif' in poster:
|
||||
r['posters'] = [poster]
|
||||
for key in list(r.keys()):
    if not r[key]:
        del r[key]
|
||||
return r
|
||||
|
||||
76
Shared/lib/python3.4/site-packages/ox/web/siteparser.py
Normal file
@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from six import string_types
|
||||
|
||||
from ..cache import read_url
|
||||
from .. import decode_html
|
||||
from ..utils import datetime
|
||||
|
||||
|
||||
def cleanup(key, data, data_type):
|
||||
if data:
|
||||
if isinstance(data[0], string_types):
|
||||
#FIXME: some types need strip_tags
|
||||
#data = [strip_tags(decode_html(p)).strip() for p in data]
|
||||
data = [decode_html(p).strip() for p in data]
|
||||
elif isinstance(data[0], list) or isinstance(data[0], tuple):
|
||||
data = [cleanup(key, p, data_type) for p in data]
|
||||
while len(data) == 1 and not isinstance(data, string_types):
|
||||
data = data[0]
|
||||
if data_type == 'list' and isinstance(data, string_types):
|
||||
data = [data, ]
|
||||
elif data_type != 'list':
|
||||
data = ''
|
||||
return data
|
||||
|
||||
class SiteParser(dict):
|
||||
baseUrl = ''
|
||||
regex = {}
|
||||
|
||||
def get_url(self, page):
|
||||
return "%s%s" % (self.baseUrl, page)
|
||||
|
||||
def read_url(self, url, timeout):
|
||||
if not url in self._cache:
|
||||
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
|
||||
return self._cache[url]
|
||||
|
||||
def __init__(self, timeout=-1):
|
||||
self._cache = {}
|
||||
for key in self.regex:
|
||||
url = self.get_url(self.regex[key]['page'])
|
||||
data = self.read_url(url, timeout)
|
||||
if isinstance(self.regex[key]['re'], string_types):
|
||||
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
|
||||
data = cleanup(key, data, self.regex[key]['type'])
|
||||
elif callable(self.regex[key]['re']):
|
||||
data = self.regex[key]['re'](data)
|
||||
else:
|
||||
for r in self.regex[key]['re']:
|
||||
if callable(r):
|
||||
f = r
|
||||
else:
|
||||
f = re.compile(r, re.DOTALL).findall
|
||||
if isinstance(data, string_types):
|
||||
data = f(data)
|
||||
else:
|
||||
data = [f(d) for d in data]
|
||||
data = cleanup(key, data, self.regex[key]['type'])
|
||||
def apply_f(f, data):
|
||||
if data and isinstance(data[0], list):
|
||||
data = [f(d) for d in data]
|
||||
else:
|
||||
data = f(data)
|
||||
return data
|
||||
if self.regex[key]['type'] == 'float' and data:
|
||||
data = apply_f(float, data)
|
||||
elif self.regex[key]['type'] == 'int' and data:
|
||||
data = apply_f(int, data)
|
||||
elif self.regex[key]['type'] == 'date':
|
||||
parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
|
||||
data = apply_f(parse_date, data)
|
||||
if data:
|
||||
self[key] = data
|
||||
|
||||
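SiteParser is a declarative base class: each key in the class-level regex dict names a page (appended to baseUrl), a pattern ('re' may be a string, a callable, or a list of either) and a result type ('string', 'list', 'int', 'float', 'date'), and __init__ fetches the pages and fills the dict. A minimal illustrative subclass, not one that exists in this package (example.com and the pattern are placeholders):

from ox.web.siteparser import SiteParser

class ExampleParser(SiteParser):
    baseUrl = 'http://www.example.com/'
    regex = {
        'title': {
            'page': 'index.html',
            're': '<title>(.*?)</title>',
            'type': 'string',
        },
    }

# parser = ExampleParser()  # fetches baseUrl + 'index.html' and sets parser['title']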
287
Shared/lib/python3.4/site-packages/ox/web/spiegel.py
Normal file
@ -0,0 +1,287 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from datetime import datetime
|
||||
import re
|
||||
import time
|
||||
|
||||
import ox.cache
|
||||
from ox.html import decode_html, strip_tags
|
||||
import ox.net
|
||||
|
||||
|
||||
def get_news(year, month, day):
|
||||
sections = [
|
||||
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
|
||||
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
|
||||
]
|
||||
dt = datetime(year, month, day)
|
||||
day = int(dt.strftime('%j'))
|
||||
date = dt.strftime('%d.%m.%Y')
|
||||
news = []
|
||||
for section in sections:
|
||||
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
|
||||
if date == time.strftime('%d.%m.%Y', time.localtime()):
|
||||
html = ox.net.read_url(url)
|
||||
else:
|
||||
html = ox.cache.read_url(url)
|
||||
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
|
||||
dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
|
||||
try:
|
||||
description = format_string(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
|
||||
except:
|
||||
description = ''
|
||||
try:
|
||||
imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
|
||||
except:
|
||||
imageUrl = ''
|
||||
try:
|
||||
title = format_string(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
|
||||
except:
|
||||
title = ''
|
||||
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
|
||||
new = {}
|
||||
if len(dateString) == 10:
|
||||
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
|
||||
else:
|
||||
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
|
||||
# fix decode_html
|
||||
# new['description'] = format_string(decode_html(description))
|
||||
new['description'] = format_string(description)
|
||||
new['imageUrl'] = imageUrl
|
||||
new['section'] = format_section(section)
|
||||
new['title'] = format_string(title)
|
||||
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(format_string(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
|
||||
if new['title1'][-1:] == ':':
|
||||
new['title1'] = new['title1'][0:-1]
|
||||
new['title2'] = new['title'][len(new['title1']) + 2:]
|
||||
new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
|
||||
if new['url'][:1] == '/':
|
||||
new['url'] = 'http://www.spiegel.de' + new['url']
|
||||
news.append(new)
|
||||
# print '%s, %s' % (new['section'], dateString)
|
||||
'''
|
||||
elif dateString[:10] == date and not description:
|
||||
print dateString + ' - no description'
|
||||
elif dateString[:10] == date and not imageUrl:
|
||||
print dateString + ' - no image'
|
||||
'''
|
||||
return news
|
||||
|
||||
def split_title(title):
|
||||
title1 = re.compile('(.*?): ').findall(title)[0]
|
||||
title2 = re.compile(': (.*?)$').findall(title)[0]
|
||||
return [title1, title2]
|
||||
|
||||
def format_string(string):
|
||||
string = string.replace('<span class="spOptiBreak"> </span>', '')
|
||||
string = string.replace('\n', ' ').replace(' ', ' ').strip()
|
||||
string = string.replace('&', '&').replace(''', '\'').replace('"', '"')
|
||||
return string
|
||||
|
||||
def format_section(string):
|
||||
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
|
||||
|
||||
def format_subsection(string):
|
||||
# SPIEGEL, SPIEGEL special
|
||||
subsection = {
|
||||
'abi': 'Abi - und dann?',
|
||||
'formel1': 'Formel 1',
|
||||
'jobundberuf': 'Job & Beruf',
|
||||
'leben': 'Leben U21',
|
||||
'mensch': 'Mensch & Technik',
|
||||
'sonst': '',
|
||||
'staedte': u'St\xc3dte',
|
||||
'ussports': 'US-Sports',
|
||||
'wunderbar': 'wunderBAR'
|
||||
}
|
||||
if string in subsection:
|
||||
return subsection[string].replace(u'\xc3', 'ae')
|
||||
return string[:1].upper() + string[1:]
|
||||
|
||||
def get_issue(year, week):
|
||||
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
|
||||
if not ox.net.exists(coverUrl):
|
||||
return None
|
||||
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
|
||||
contents = []
|
||||
data = ox.cache.read_url(url)
|
||||
items = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
|
||||
for item in items:
|
||||
item = item[1]
|
||||
page = int(re.compile('&SE=(.*?)"').findall(item)[0])
|
||||
title = strip_tags(item).strip()
|
||||
contents.append({'title': title, 'page': page})
|
||||
pageUrl = {}
|
||||
pages = page + 2
|
||||
for page in range(1, pages + 10):
|
||||
url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
|
||||
if ox.cache.exists(url):
|
||||
pageUrl[page] = url
|
||||
else:
|
||||
pageUrl[page] = ''
|
||||
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
|
||||
|
||||
|
||||
def archive_issues():
|
||||
'''
|
||||
this is just an example of an archiving application
|
||||
'''
|
||||
p = {}
|
||||
import os
|
||||
from ox.utils import json
|
||||
import time
|
||||
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
|
||||
localtime = time.localtime()
|
||||
year = int(time.strftime('%Y', localtime))
|
||||
week = int(time.strftime('%W', localtime))
|
||||
for y in range(year, 1993, -1):
|
||||
if y == year:
|
||||
wMax = week + 1
|
||||
else:
|
||||
wMax = 53
|
||||
for w in range(wMax, 0, -1):
|
||||
print('get_issue(%d, %d)' % (y, w))
|
||||
issue = get_issue(y, w)
|
||||
if issue:
|
||||
dirname = '%s/%d/%02d' % (archivePath, y, w)
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
|
||||
if not os.path.exists(filename):
|
||||
data = json.dumps(issue, ensure_ascii = False)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
|
||||
if not os.path.exists(filename):
|
||||
data = []
|
||||
for item in issue['contents']:
|
||||
data.append('%3d %s' % (item['page'], item['title']))
|
||||
data = '\n'.join(data)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
|
||||
if not os.path.exists(filename):
|
||||
data = ox.cache.read_url(issue['coverUrl'])
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
for page in issue['pageUrl']:
|
||||
url = issue['pageUrl'][page]
|
||||
if url:
|
||||
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
|
||||
if not os.path.exists(filename):
|
||||
data = ox.cache.read_url(url)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
if not p:
|
||||
p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
|
||||
else:
|
||||
p['num'] += 1
|
||||
p['sum'] += issue['pages']
|
||||
if issue['pages'] < p['min']:
|
||||
p['min'] = issue['pages']
|
||||
if issue['pages'] > p['max']:
|
||||
p['max'] = issue['pages']
|
||||
print(p['min'], p['sum'] // p['num'], p['max'])
|
||||
|
||||
|
||||
def archive_news():
|
||||
'''
|
||||
this is just an example of an archiving application
|
||||
'''
|
||||
import os
|
||||
from ox.utils import json
|
||||
import time
|
||||
|
||||
count = {}
|
||||
colon = []
|
||||
|
||||
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
|
||||
days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
|
||||
localtime = time.localtime()
|
||||
year = int(time.strftime('%Y', localtime))
|
||||
month = int(time.strftime('%m', localtime))
|
||||
day = int(time.strftime('%d', localtime)) - 1
|
||||
for y in range(year, 1999, -1):
|
||||
if y == year:
|
||||
mMax = month
|
||||
else:
|
||||
mMax = 12
|
||||
for m in range(mMax, 0, -1):
|
||||
if y == year and m == month:
|
||||
dMax = day
|
||||
elif m == 2 and y % 4 == 0 and y % 400 != 0:
|
||||
dMax = days[m] + 1
|
||||
else:
|
||||
dMax = days[m]
|
||||
for d in range(dMax, 0, -1):
|
||||
print('get_news(%d, %d, %d)' % (y, m, d))
|
||||
news = get_news(y, m, d)
|
||||
for new in news:
|
||||
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
|
||||
if not os.path.exists(dirname):
|
||||
os.makedirs(dirname)
|
||||
if new['url'][-5:] == '.html':
|
||||
filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
|
||||
else:
|
||||
filename = dirname + '/' + new['url'] + '.json'
|
||||
if not os.path.exists(filename) or True:
|
||||
data = json.dumps(new, ensure_ascii = False)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
filename = filename[:-5] + '.txt'
|
||||
if not os.path.exists(filename) or True:
|
||||
data = split_title(new['title'])
|
||||
data.append(new['description'])
|
||||
data = '\n'.join(data)
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
|
||||
if not os.path.exists(filename):
|
||||
data = ox.cache.read_url(new['imageUrl'])
|
||||
f = open(filename, 'w')
|
||||
f.write(data)
|
||||
f.close()
|
||||
|
||||
strings = new['url'].split('/')
|
||||
string = strings[3]
|
||||
if len(strings) == 6:
|
||||
string += '/' + strings[4]
|
||||
if string not in count:
|
||||
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
|
||||
else:
|
||||
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
|
||||
strings = split_title(new['title'])
|
||||
if strings[0] != new['title1'] or strings[1] != new['title2']:
|
||||
colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
|
||||
for key in sorted(count):
|
||||
print('%6d %-24s %s' % (count[key]['count'], key, count[key]['string']))
|
||||
for value in colon:
|
||||
print(value)
|
||||
|
||||
if __name__ == '__main__':
|
||||
# spiegel = Spiegel(2008, 8)
|
||||
# print spiegel.getContents()
|
||||
# news = News(2001, 9, 10)
|
||||
# output(news.getNews())
|
||||
'''
|
||||
x = []
|
||||
for d in range(10, 30):
|
||||
print '2/%d' % d
|
||||
news = getNews(2008, 2, d)
|
||||
for new in news:
|
||||
strings = new['url'].split('/')
|
||||
string = format_section(strings[3])
|
||||
if len(strings) == 6:
|
||||
string += '/' + format_subsection(strings[4])
|
||||
if not string in x:
|
||||
x.append(string)
|
||||
print x
|
||||
'''
|
||||
# archive_issues()
|
||||
archive_news()
|
||||
117
Shared/lib/python3.4/site-packages/ox/web/thepiratebay.py
Normal file
@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from datetime import datetime
|
||||
import re
|
||||
import socket
|
||||
from six.moves.urllib.parse import quote, urlencode
from six.moves.urllib.error import URLError
|
||||
|
||||
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normalize_newlines
|
||||
from ox.normalize import normalize_imdbid
|
||||
import ox
|
||||
|
||||
from .torrent import Torrent
|
||||
|
||||
cache_timeout = 24*60*60 # cache search only for 24 hours
|
||||
|
||||
season_episode = re.compile("S..E..", re.IGNORECASE)
|
||||
|
||||
|
||||
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
|
||||
headers = headers.copy()
|
||||
headers['Cookie'] = 'language=en_EN'
|
||||
return cache.read_url(url, data, headers, timeout, unicode=unicode)
|
||||
|
||||
def find_movies(query=None, imdb=None, max_results=10):
|
||||
if imdb:
|
||||
query = "tt" + normalize_imdbid(imdb)
|
||||
results = []
|
||||
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
|
||||
page_count = 1
|
||||
while next and page_count < 4:
|
||||
page_count += 1
|
||||
url = next[0]
|
||||
if not url.startswith('http'):
|
||||
if not url.startswith('/'):
|
||||
url = "/" + url
|
||||
url = "http://thepiratebay.org" + url
|
||||
data = read_url(url, timeout=cache_timeout, unicode=True)
|
||||
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
|
||||
for row in re.compile(regexp, re.DOTALL).findall(data):
|
||||
torrentType = row[0]
|
||||
torrentLink = "http://thepiratebay.org" + row[1]
|
||||
torrentTitle = decode_html(row[2])
|
||||
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
|
||||
if torrentType in ['201']:
|
||||
results.append((torrentTitle, torrentLink, ''))
|
||||
if len(results) >= max_results:
|
||||
return results
|
||||
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
|
||||
return results
|
||||
|
||||
def get_id(piratebayId):
|
||||
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
|
||||
piratebayId = piratebayId.split('org/')[1]
|
||||
d = find_re(piratebayId, "tor/(\d+)")
|
||||
if d:
|
||||
piratebayId = d
|
||||
d = find_re(piratebayId, "torrent/(\d+)")
|
||||
if d:
|
||||
piratebayId = d
|
||||
return piratebayId
|
||||
|
||||
def exists(piratebayId):
|
||||
piratebayId = get_id(piratebayId)
|
||||
return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
|
||||
|
||||
def get_data(piratebayId):
|
||||
_key_map = {
|
||||
'spoken language(s)': u'language',
|
||||
'texted language(s)': u'subtitle language',
|
||||
'by': u'uploader',
|
||||
'leechers': 'leecher',
|
||||
'seeders': 'seeder',
|
||||
}
|
||||
piratebayId = get_id(piratebayId)
|
||||
torrent = dict()
|
||||
torrent[u'id'] = piratebayId
|
||||
torrent[u'domain'] = 'thepiratebay.org'
|
||||
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
|
||||
|
||||
data = read_url(torrent['comment_link'], unicode=True)
|
||||
torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
|
||||
if not torrent[u'title']:
|
||||
return None
|
||||
torrent[u'title'] = decode_html(torrent[u'title']).strip()
|
||||
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
|
||||
title = quote(torrent['title'].encode('utf-8'))
|
||||
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
|
||||
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
|
||||
key = d[0].lower().strip()
|
||||
key = _key_map.get(key, key)
|
||||
value = decode_html(strip_tags(d[1].strip()))
|
||||
torrent[key] = value
|
||||
torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
|
||||
if torrent[u'description']:
|
||||
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
|
||||
t = read_url(torrent[u'torrent_link'])
|
||||
torrent[u'torrent_info'] = get_torrent_info(t)
|
||||
return torrent
|
||||
|
||||
class Thepiratebay(Torrent):
|
||||
'''
|
||||
>>> Thepiratebay('123')
|
||||
{}
|
||||
|
||||
>>> Thepiratebay('3951349')['infohash']
|
||||
'4e84415d36ed7b54066160c05a0b0f061898d12b'
|
||||
'''
|
||||
def __init__(self, piratebayId):
|
||||
self.data = get_data(piratebayId)
|
||||
if not self.data:
|
||||
return
|
||||
Torrent.__init__(self)
|
||||
published = self.data['uploaded']
|
||||
published = published.replace(' GMT', '').split(' +')[0]
|
||||
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")
|
||||
|
||||
37
Shared/lib/python3.4/site-packages/ox/web/torrent.py
Normal file
@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from ox import int_value
|
||||
|
||||
|
||||
class Torrent(dict):
|
||||
'''
|
||||
>>> Torrent()
|
||||
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
|
||||
'''
|
||||
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
|
||||
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
|
||||
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
|
||||
_dict_keys = ('torrent_info', )
|
||||
_list_keys = ()
|
||||
data = {'torrent_info': {}}
|
||||
|
||||
def __init__(self):
|
||||
for key in self._string_keys:
|
||||
self[key] = self.data.get(key, u'')
|
||||
for key in self._dict_keys:
|
||||
self[key] = self.data.get(key, {})
|
||||
for key in self._list_keys:
|
||||
self[key] = self.data.get(key, [])
|
||||
for key in self._int_keys:
|
||||
value = self.data.get(key, -1)
|
||||
if not isinstance(value, int):
|
||||
value = int(int_value(value))
|
||||
self[key] = value
|
||||
self['infohash'] = self.data['torrent_info'].get('hash', '')
|
||||
self['size'] = self.data['torrent_info'].get('size', -1)
|
||||
self['announce'] = self.data['torrent_info'].get('announce', '')
|
||||
if 'files' in self.data['torrent_info']:
|
||||
self['files'] = len(self.data['torrent_info']['files'])
|
||||
else:
|
||||
self['files'] = 1
|
||||
|
||||
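Torrent is a dict with typed defaults; the Mininova and Thepiratebay classes above fill self.data from a scraped page before calling Torrent.__init__(). A minimal sketch with hand-made data (the hash, size and announce URL are placeholders):

from ox.web.torrent import Torrent

class ExampleTorrent(Torrent):
    def __init__(self):
        self.data = {
            'id': '123',
            'title': 'Example',
            'torrent_info': {'hash': '0' * 40, 'size': 700 * 2**20,
                             'announce': 'udp://tracker.example/announce'},
        }
        Torrent.__init__(self)

t = ExampleTorrent()
print(t['title'], t['size'], t['files'], t['infohash'])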
32
Shared/lib/python3.4/site-packages/ox/web/tv.py
Normal file
@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
import time
|
||||
|
||||
from ox import strip_tags, find_re
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def get_episode_data(url):
|
||||
'''
parses information on tv.com episode pages
returns dict with title, show, description, score
example:
get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
'''
|
||||
data = read_url(url, unicode=True)
|
||||
r = {}
|
||||
r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
|
||||
r['show'] = find_re(data, '<h1>(.*?)</h1>')
|
||||
r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
|
||||
#episode score
|
||||
r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
|
||||
|
||||
match = re.compile('Episode Number: (\d*?) Season Num: (\d*?) First Aired: (.*?)  ').findall(data)
|
||||
if match:
|
||||
r['season'] = int(match[0][1])
|
||||
r['episode'] = int(match[0][0])
|
||||
#'Wednesday September 29, 2004' -> 2004-09-29
|
||||
r['air date'] = time.strftime('%Y-%m-%d', time.strptime(match[0][2], '%A %B %d, %Y'))
|
||||
return r
|
||||
|
||||
35
Shared/lib/python3.4/site-packages/ox/web/twitter.py
Normal file
@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from datetime import datetime
|
||||
from six.moves.urllib.parse import quote
|
||||
|
||||
import lxml.html
|
||||
import ox
|
||||
from ox.cache import read_url
|
||||
|
||||
def find(query=None, user=None, timeout=60):
|
||||
if user:
|
||||
url = 'https://twitter.com/' + quote(user)
|
||||
else:
|
||||
url = 'https://twitter.com/search/' + quote(query)
|
||||
data = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
|
||||
doc = lxml.html.document_fromstring(data)
|
||||
tweets = []
|
||||
for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
|
||||
t = lxml.html.tostring(e, encoding='unicode')
|
||||
text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
|
||||
html = lxml.html.tostring(text, encoding='unicode').strip()
|
||||
text = ox.decode_html(ox.strip_tags(html)).strip()
|
||||
user = re.compile('data-name="(.*?)"').findall(t)[0]
|
||||
user = ox.decode_html(ox.strip_tags(user)).strip()
|
||||
tweets.append({
|
||||
'id': re.compile('data-tweet-id="(\d+)"').findall(t)[0],
|
||||
'user-id': re.compile('data-user-id="(\d+)"').findall(t)[0],
|
||||
'name': re.compile('data-screen-name="(.*?)"').findall(t)[0],
|
||||
'time': datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])),
|
||||
'user': user,
|
||||
'text': text,
|
||||
'html': html,
|
||||
})
|
||||
return tweets
|
||||
99
Shared/lib/python3.4/site-packages/ox/web/ubu.py
Normal file
@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
|
||||
from ox import find_re, strip_tags, decode_html
|
||||
from ox.cache import read_url
|
||||
|
||||
|
||||
def get_id(url):
|
||||
return url.replace('http://www.ubu.com/', '').split('.html')[0]
|
||||
|
||||
def get_url(id):
|
||||
return 'http://www.ubu.com/%s.html' % id
|
||||
|
||||
def get_data(url):
|
||||
if not url.startswith('http:'):
|
||||
url = get_url(url)
|
||||
data = read_url(url, unicode=True)
|
||||
m = {
|
||||
'id': get_id(url),
|
||||
'url': url,
|
||||
'type': re.compile('ubu.com/(.*?)/').findall(url)[0]
|
||||
}
|
||||
for videourl, title in re.compile('<a href="(http://ubumexico.centro.org.mx/.*?)">(.*?)</a>').findall(data):
|
||||
if videourl.endswith('.srt'):
|
||||
m['srt'] = videourl
|
||||
elif not 'video' in m:
|
||||
m['video'] = videourl
|
||||
m['video'] = m['video'].replace('/video/ ', '/video/').replace(' ', '%20')
|
||||
if m['video'] == 'http://ubumexico.centro.org.mx/video/':
|
||||
del m['video']
|
||||
m['title'] = strip_tags(decode_html(title)).strip()
|
||||
if not 'url' in m:
|
||||
print(url, 'missing')
|
||||
if 'title' in m:
|
||||
m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])
|
||||
|
||||
match = re.compile("flashvars','file=(.*?.flv)'").findall(data)
|
||||
if match:
|
||||
m['flv'] = match[0]
|
||||
m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20')
|
||||
|
||||
y = re.compile('\((\d{4})\)').findall(data)
|
||||
if y:
|
||||
m['year'] = int(y[0])
|
||||
d = re.compile('Director: (.+)').findall(data)
|
||||
if d:
|
||||
m['director'] = strip_tags(decode_html(d[0])).strip()
|
||||
|
||||
a = re.compile('<a href="(.*?)">Back to (.*?)</a>', re.DOTALL).findall(data)
|
||||
if a:
|
||||
m['artist'] = strip_tags(decode_html(a[0][1])).strip()
|
||||
else:
|
||||
a = re.compile('<a href="(.*?)">(.*?) in UbuWeb Film').findall(data)
|
||||
if a:
|
||||
m['artist'] = strip_tags(decode_html(a[0][1])).strip()
|
||||
else:
|
||||
a = re.compile('<b>(.*?)\(b\..*?\d{4}\)').findall(data)
|
||||
if a:
|
||||
m['artist'] = strip_tags(decode_html(a[0])).strip()
|
||||
elif m['id'] == 'film/lawder_color':
|
||||
m['artist'] = 'Standish Lawder'
|
||||
if 'artist' in m:
|
||||
m['artist'] = m['artist'].replace('in UbuWeb Film', '')
|
||||
m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()
|
||||
if m['id'] == 'film/coulibeuf':
|
||||
m['title'] = 'Balkan Baroque'
|
||||
m['year'] = 1999
|
||||
return m
|
||||
|
||||
def get_films():
|
||||
ids = get_ids()
|
||||
films = []
|
||||
for id in ids:
|
||||
info = get_data(id)
|
||||
if info['type'] == 'film' and ('flv' in info or 'video' in info):
|
||||
films.append(info)
|
||||
return films
|
||||
|
||||
def get_ids():
|
||||
data = read_url('http://www.ubu.com/film/')
|
||||
ids = []
|
||||
author_urls = []
|
||||
for url, author in re.compile('<a href="(\./.*?)">(.*?)</a>').findall(data):
|
||||
url = 'http://www.ubu.com/film' + url[1:]
|
||||
data = read_url(url)
|
||||
author_urls.append(url)
|
||||
for u, title in re.compile('<a href="(.*?)">(.*?)</a>').findall(data):
|
||||
if not u.startswith('http'):
|
||||
if u == '../../sound/burroughs.html':
|
||||
u = 'http://www.ubu.com/sound/burroughs.html'
|
||||
elif u.startswith('../'):
|
||||
u = 'http://www.ubu.com/' + u[3:]
|
||||
else:
|
||||
u = 'http://www.ubu.com/film/' + u
|
||||
if u not in author_urls and u.endswith('.html'):
|
||||
ids.append(u)
|
||||
ids = [get_id(url) for url in list(set(ids))]
|
||||
return ids
|
||||
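A short usage sketch for the UbuWeb scraper above; the id is one of the values handled in get_data(), and network access is assumed:

from ox.web import ubu

data = ubu.get_data('film/lawder_color')
print(data.get('artist'), '-', data.get('title'), data.get('year'))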
27
Shared/lib/python3.4/site-packages/ox/web/vimeo.py
Normal file
@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
import re
|
||||
from io import BytesIO
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
from ox.cache import read_url
|
||||
from ox import find_string, find_re
|
||||
|
||||
|
||||
def get_data(id):
|
||||
url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
|
||||
xml = read_url(url)
|
||||
tree = ET.parse(BytesIO(xml))  # read_url() returns bytes here
|
||||
request_signature = tree.find('request_signature').text
|
||||
request_signature_expires = tree.find('request_signature_expires').text
|
||||
|
||||
data = {}
|
||||
video_url = "http://www.vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=" % \
|
||||
(id, request_signature, request_signature_expires)
|
||||
data['video_sd'] = video_url + 'sd'
|
||||
data['video_hd'] = video_url + 'hd'
|
||||
video = tree.find('video')
|
||||
for key in ('caption', 'width', 'height', 'duration', 'thumbnail'):
|
||||
data[key] = video.find(key).text
|
||||
return data
|
||||
|
||||
156
Shared/lib/python3.4/site-packages/ox/web/wikipedia.py
Normal file
@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from __future__ import print_function
|
||||
|
||||
import re
|
||||
|
||||
from six.moves import urllib
|
||||
|
||||
from ox.utils import json
|
||||
from ox.cache import read_url
|
||||
from ox import find_re
|
||||
|
||||
|
||||
def get_id(url):
|
||||
return url.split("/")[-1]
|
||||
|
||||
def get_url(id=None, imdb=None, allmovie=None):
|
||||
if imdb:
|
||||
query = '"%s"'% imdb
|
||||
result = find(query)
|
||||
if result:
|
||||
url = result[0][1]
|
||||
data = get_movie_data(url)
|
||||
if 'imdb_id' in data:
|
||||
return url
|
||||
return ""
|
||||
if allmovie:
|
||||
query = '"amg_id = 1:%s"'% allmovie
|
||||
result = find(query)
|
||||
if result:
|
||||
url = result[0][1]
|
||||
return url
|
||||
return ''
|
||||
return "http://en.wikipedia.org/wiki/%s" % id
|
||||
|
||||
def get_movie_id(title, director='', year=''):
|
||||
query = '"%s" film %s %s' % (title, director, year)
|
||||
result = find(query, 1)
|
||||
if result:
|
||||
return result[0][1]
|
||||
return ''
|
||||
|
||||
def get_wiki_data(wikipedia_url):
|
||||
url = wikipedia_url.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
|
||||
url = "%s&action=raw" % url
|
||||
data = read_url(url).decode('utf-8')
|
||||
return data
|
||||
|
||||
def get_movie_data(wikipedia_url):
|
||||
if not wikipedia_url.startswith('http'):
|
||||
wikipedia_url = get_url(wikipedia_url)
|
||||
data = get_wiki_data(wikipedia_url)
|
||||
filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
|
||||
filmbox = {}
|
||||
_box = filmbox_data.strip().split('|')
|
||||
for row in _box:
|
||||
d = row.split('=')
|
||||
if len(d) == 2:
|
||||
_key = d[0].strip()
|
||||
if _key:
|
||||
key = _key
|
||||
if key[0] == '|':
|
||||
key = key[1:]
|
||||
key = key.strip()
|
||||
value = d[1].strip()
|
||||
value = value.replace('<!-- see WP:ALT -->', '')
|
||||
if '<br>' in value:
|
||||
value = value.split('<br>')
|
||||
if value:
|
||||
if key in filmbox:
|
||||
if isinstance(value, list) and isinstance(filmbox[key], str):
|
||||
filmbox[key] = [filmbox[key]] + value
|
||||
else:
|
||||
filmbox[key] += value
|
||||
if isinstance(filmbox[key], list):
|
||||
filmbox[key] = [k for k in filmbox[key] if k]
|
||||
else:
|
||||
filmbox[key] = value
|
||||
if not filmbox_data:
|
||||
return filmbox
|
||||
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
|
||||
del filmbox['amg_id']
|
||||
if 'Allmovie movie' in data:
|
||||
filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
|
||||
elif 'Allmovie title' in data:
|
||||
filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
|
||||
|
||||
if 'Official website' in data:
|
||||
filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
|
||||
|
||||
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
|
||||
if r:
|
||||
filmbox['imdb_id'] = r[0]
|
||||
else:
|
||||
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
|
||||
if r:
|
||||
filmbox['imdb_id'] = r[0]
|
||||
|
||||
r = re.compile('{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
|
||||
if r:
|
||||
filmbox['archiveorg_id'] = r[0]
|
||||
|
||||
r = re.compile('{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
|
||||
if r:
|
||||
filmbox['mojo_id'] = r[0].replace('id=', '')
|
||||
|
||||
r = re.compile('{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
|
||||
if r:
|
||||
filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
|
||||
if 'google video' in data:
|
||||
filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
|
||||
if 'DEFAULTSORT' in data:
|
||||
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
|
||||
return filmbox
|
||||
|
||||
def get_image_url(name):
|
||||
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
|
||||
data = read_url(url)
|
||||
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
|
||||
if not url:
|
||||
url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
|
||||
if url:
|
||||
url = 'http:' + url
|
||||
return url
|
||||
|
||||
def get_poster_url(wikipedia_url):
|
||||
if not wikipedia_url.startswith('http'): wikipedia_url = get_url(wikipedia_url)
|
||||
data = get_movie_data(wikipedia_url)
|
||||
if 'image' in data:
|
||||
return get_image_url(data['image'])
|
||||
return ''
|
||||
|
||||
def get_movie_poster(wikipedia_url):
|
||||
# deprecated, use get_poster_url()
|
||||
return get_poster_url(wikipedia_url)
|
||||
|
||||
def get_allmovie_id(wikipedia_url):
|
||||
data = get_movie_data(wikipedia_url)
|
||||
return data.get('amg_id', '')
|
||||
|
||||
def find(query, max_results=10):
|
||||
query = {'action': 'query', 'list':'search', 'format': 'json',
|
||||
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
|
||||
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
|
||||
data = read_url(url)
|
||||
if not data:
|
||||
data = read_url(url, timeout=0)
|
||||
result = json.loads(data.decode('utf-8'))
|
||||
results = []
|
||||
if result and 'query' in result:
|
||||
for r in result['query']['search']:
|
||||
title = r['title']
|
||||
url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
|
||||
results.append((title, url, ''))
|
||||
return results
|
||||
|
||||
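A usage sketch tying the helpers above together (the IMDb id is illustrative; the Wikipedia API and the raw article are fetched over the network):

from ox.web import wikipedia

url = wikipedia.get_url(imdb='0133093')  # illustrative IMDb id
if url:
    data = wikipedia.get_movie_data(url)
    print(data.get('name'), data.get('director'), data.get('imdb_id'))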
217
Shared/lib/python3.4/site-packages/ox/web/youtube.py
Normal file
@ -0,0 +1,217 @@
# -*- coding: utf-8 -*-
|
||||
# vi:si:et:sw=4:sts=4:ts=4
|
||||
from six.moves.urllib.parse import quote, unquote_plus
from six.moves.urllib import request as urllib2
from six.moves import http_cookiejar as cookielib
|
||||
import re
|
||||
from xml.dom.minidom import parseString
|
||||
import json
|
||||
|
||||
import feedparser
|
||||
import ox
|
||||
from ox.cache import read_url, cache_timeout
|
||||
|
||||
|
||||
def get_id(url):
|
||||
match = re.compile('v=(.+?)($|&)').findall(url)
|
||||
if match:
|
||||
return match[0][0]
|
||||
|
||||
def get_url(id):
|
||||
return 'http://www.youtube.com/watch?v=%s' % id
|
||||
|
||||
def video_url(youtubeId, format='mp4', timeout=cache_timeout):
|
||||
"""
|
||||
youtubeId - id of video
|
||||
format - video format, options: webm, 1080p, 720p, mp4, high
|
||||
"""
|
||||
fmt = None
|
||||
if format == '4k':
|
||||
fmt=38
|
||||
elif format == '1080p':
|
||||
fmt=37
|
||||
elif format == '720p':
|
||||
fmt=22
|
||||
elif format == 'mp4':
|
||||
fmt=18
|
||||
elif format == 'high':
|
||||
fmt=35
|
||||
elif format == 'webm':
|
||||
streams = videos(youtubeId, 'webm')
|
||||
return streams[max(streams.keys())]['url']
|
||||
|
||||
streams = videos(youtubeId)
|
||||
if str(fmt) in streams:
|
||||
return streams[str(fmt)]['url']
|
||||
|
||||
def get_video_info(id):
|
||||
eurl = get_url(id)
|
||||
data = read_url(eurl)
|
||||
t = re.compile('\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
|
||||
if t:
|
||||
t = t[0]
|
||||
else:
|
||||
raise IOError
|
||||
url = "http://www.youtube.com/get_video_info?&video_id=%s&el=$el&ps=default&eurl=%s&hl=en_US&t=%s" % (id, quote(eurl), quote(t))
|
||||
data = read_url(url)
|
||||
info = {}
|
||||
for part in data.split('&'):
|
||||
key, value = part.split('=')
|
||||
info[key] = unquote_plus(value).replace('+', ' ')
|
||||
return info
|
||||
|
||||
def find(query, max_results=10, offset=1, orderBy='relevance'):
|
||||
query = quote(query)
|
||||
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
|
||||
data = read_url(url)
|
||||
fd = feedparser.parse(data)
|
||||
videos = []
|
||||
for item in fd.entries:
|
||||
id = item['id'].split('/')[-1]
|
||||
title = item['title']
|
||||
description = item['description']
|
||||
videos.append((title, id, description))
|
||||
if len(videos) >= max_results:
|
||||
return videos
|
||||
return videos
|
||||
|
||||
def info(id, timeout=cache_timeout):
    info = {}
    if id.startswith('http'):
        id = get_id(id)
    if not id:
        return info
    url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    info['id'] = id
    info['url'] = get_url(id)
    info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
    info['description'] = xml.getElementsByTagName('media:description')[0].firstChild.data
    info['date'] = xml.getElementsByTagName('published')[0].firstChild.data.split('T')[0]
    info['author'] = "http://www.youtube.com/user/%s" % xml.getElementsByTagName('name')[0].firstChild.data

    info['categories'] = []
    for cat in xml.getElementsByTagName('media:category'):
        info['categories'].append(cat.firstChild.data)

    k = xml.getElementsByTagName('media:keywords')[0].firstChild
    if k:
        info['keywords'] = k.data.split(', ')
    data = read_url(info['url'], timeout=timeout)
    match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
    if match:
        info['license'] = match[0].strip()
        info['license'] = re.sub('<.+?>', '', info['license']).strip()

    url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
    data = read_url(url, timeout=timeout)
    xml = parseString(data)
    languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
    if languages:
        info['subtitles'] = {}
        for language in languages:
            url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind" % (id, language)
            data = read_url(url, timeout=timeout)
            xml = parseString(data)
            subs = []
            for t in xml.getElementsByTagName('text'):
                start = float(t.getAttribute('start'))
                duration = t.getAttribute('dur')
                if not duration:
                    duration = '2'
                end = start + float(duration)
                if t.firstChild:
                    text = t.firstChild.data
                    subs.append({
                        'in': start,
                        'out': end,
                        'value': ox.decode_html(text),
                    })
            info['subtitles'][language] = subs
    return info

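# Parses url_encoded_fmt_stream_map into a dict keyed by itag. Illustrative result
# for videos(id, 'mp4'): {'18': {'url': ..., 'type': 'video/mp4', ...}}; with
# format='' all stream types are kept.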
def videos(id, format=''):
    stream_type = {
        'flv': 'video/x-flv',
        'webm': 'video/webm',
        'mp4': 'video/mp4'
    }.get(format)
    info = get_video_info(id)
    stream_map = info['url_encoded_fmt_stream_map']
    streams = {}
    for x in stream_map.split(','):
        stream = {}
        #for s in x.split('\\u0026'):
        for s in x.split('&'):
            key, value = s.split('=')
            value = unquote_plus(value)
            stream[key] = value
        if 'url' in stream and 'sig' in stream:
            stream['url'] = '%s&signature=%s' % (stream['url'], stream['sig'])
        if not stream_type or stream['type'].startswith(stream_type):
            streams[stream['itag']] = stream
    return streams

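# Scrapes a playlist page for watch links. Illustrative result:
# [{'title': ..., 'url': 'http://www.youtube.com/watch?v=...'}, ...], deduplicated
# via set(), so the original page order is not preserved.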
def playlist(url):
    data = read_url(url)
    items = []
    for i in list(set(re.compile(r'<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
        items.append({
            'title': i[1],
            'url': 'http://www.youtube.com' + i[0].split('&')[0]
        })
    return items

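# Downloads the highest-numbered webm itag found on the watch page to `filename`,
# streaming it in 4096-byte chunks; returns the filename, or None when the video
# has no webm stream.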
def download_webm(id, filename):
    stream_type = 'video/webm'
    url = "http://www.youtube.com/watch?v=%s" % id
    cj = cookielib.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [
        ('User-Agent',
         'Mozilla/5.0 (X11; Linux i686; rv:2.0) Gecko/20100101 Firefox/4.0'),
        ('Accept-Language', 'en-us, en;q=0.50')
    ]
    u = opener.open(url)
    data = u.read().decode('utf-8', 'replace')
    u.close()
    match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data)
    streams = {}
    for x in match[0].split(','):
        stream = {}
        for s in x.split('\\u0026'):
            key, value = s.split('=')
            value = unquote_plus(value)
            stream[key] = value
        if stream['type'].startswith(stream_type):
            streams[stream['itag']] = stream
    if streams:
        s = max(streams.keys())
        url = streams[s]['url']
        if 'sig' in streams[s]:
            url += '&signature=' + streams[s]['sig']
    else:
        return None

    # download video and save to file (the response is binary, so write bytes)
    u = opener.open(url)
    f = open(filename, 'wb')
    data = True
    while data:
        data = u.read(4096)
        f.write(data)
    f.close()
    u.close()
    return filename

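# Extracts the ytplayer.config object embedded in the watch page and returns it as
# a dict (None if the marker is not found). Illustrative use: the 'args' entry of
# the returned dict carries most of the per-video player parameters.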
def get_config(id):
    if id.startswith('http'):
        url = id
    else:
        url = get_url(id)
    data = read_url(url)
    match = re.compile('ytplayer.config = (.*?);<').findall(data)
    if match:
        config = json.loads(match[0])
        return config