Switch to python3

j 2014-09-30 18:15:32 +02:00
commit 9ba4b6a91a
5286 changed files with 677347 additions and 576888 deletions

View file

@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2011
__version__ = '2.1.1'
from . import cache
from . import js
from . import jsonc
from . import net
from . import srt
from . import utils
from .api import *
from .file import *
from .form import *
from .format import *
from .geo import *
from .html import *
#image depends on PIL, which is not easy enough to install on OS X
try:
from .image import *
except:
pass
from .location import *
from .movie import *
from .normalize import *
from .oembed import *
from .text import *
#currently broken in python3
try:
from .torrent import *
except:
pass
from .fixunicode import *

View file

@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2011
from __future__ import with_statement
from six.moves import http_cookiejar as cookielib
import gzip
from six import StringIO
from six.moves import urllib
from types import MethodType
from . import __version__
from .utils import json
from .form import MultiPartForm
__all__ = ['getAPI', 'API']
def getAPI(url, cj=None):
return API(url, cj)
class API(object):
__version__ = __version__
__name__ = 'ox'
DEBUG = False
debuglevel = 0
def __init__(self, url, cj=None):
if cj:
self._cj = cj
else:
self._cj = cookielib.CookieJar()
self._opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self._cj),
urllib.request.HTTPHandler(debuglevel=self.debuglevel))
self._opener.addheaders = [
('User-Agent', '%s/%s' % (self.__name__, self.__version__))
]
self.url = url
r = self._request('api', {'docs': True})
self._properties = r['data']['actions']
self._actions = r['data']['actions'].keys()
for a in r['data']['actions']:
self._add_action(a)
def _add_method(self, method, name):
if name is None:
name = method.func_name
setattr(self, name, MethodType(method, self, type(self)))
def _add_action(self, action):
def method(self, *args, **kw):
if not kw:
if args:
kw = args[0]
else:
kw = None
return self._request(action, kw)
if 'doc' in self._properties[action]:
method.__doc__ = self._properties[action]['doc']
method.func_name = str(action)
self._add_method(method, action)
def _json_request(self, url, form):
result = {}
try:
body = str(form)
request = urllib.request.Request(str(url))
request.add_header('Content-type', form.get_content_type())
request.add_header('Content-Length', str(len(body)))
request.add_header('Accept-Encoding', 'gzip, deflate')
request.add_data(body)
f = self._opener.open(request)
result = f.read()
if f.headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO(result)).read()
result = result.decode('utf-8')
return json.loads(result)
except urllib.error.HTTPError as e:
if self.DEBUG:
import webbrowser
if e.code >= 500:
with open('/tmp/error.html', 'w') as f:
f.write(e.read())
webbrowser.open_new_tab('/tmp/error.html')
result = e.read()
try:
result = result.decode('utf-8')
result = json.loads(result)
except:
result = {'status':{}}
result['status']['code'] = e.code
result['status']['text'] = str(e)
return result
except:
if self.DEBUG:
import webbrowser
import traceback
traceback.print_exc()
if result:
with open('/tmp/error.html', 'w') as f:
f.write(str(result))
webbrowser.open_new_tab('/tmp/error.html')
raise
def _request(self, action, data=None):
form = MultiPartForm()
form.add_field('action', action)
if data:
form.add_field('data', json.dumps(data))
return self._json_request(self.url, form)
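
A minimal usage sketch for the client above; the endpoint URL and the find action are placeholders, since the available methods are generated at runtime from whatever the server's api action reports:

from ox.api import API

api = API('https://example.com/api/')  # placeholder endpoint
print(api._actions)                    # action names reported by the server
# every reported action becomes a method that takes a single dict of parameters
result = api.find({'query': {'conditions': [], 'operator': '&'}, 'range': [0, 10]})
print(result['status'])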

View file

@@ -0,0 +1,333 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2011
from __future__ import with_statement, print_function
import gzip
import zlib
import hashlib
import os
from six import BytesIO
import time
from six.moves import urllib
import sqlite3
from .utils import json
from .file import makedirs
from . import net
from .net import DEFAULT_HEADERS, detect_encoding
cache_timeout = 30*24*60*60 # default is 30 days
COMPRESS_TYPES = (
'text/html',
'text/plain',
'text/xml',
'application/xhtml+xml',
'application/x-javascript',
'application/javascript',
'application/ecmascript',
'application/rss+xml'
)
def status(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
'''
>>> status('http://google.com')
200
>>> status('http://google.com/mysearch')
404
'''
headers = get_headers(url, data, headers)
return int(headers['status'])
def exists(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
'''
>>> exists('http://google.com')
True
>>> exists('http://google.com/mysearch')
False
'''
s = status(url, data, headers, timeout)
if s >= 200 and s < 400:
return True
return False
def get_headers(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout):
url_headers = store.get(url, data, headers, timeout, "headers")
if not url_headers:
url_headers = net.get_headers(url, data, headers)
store.set(url, data, -1, url_headers)
return url_headers
class InvalidResult(Exception):
"""Base class for exceptions in this module."""
def __init__(self, result, headers):
self.result = result
self.headers = headers
def _fix_unicode_url(url):
if not isinstance(url, bytes):
url = url.encode('utf-8')
return url
def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, valid=None, unicode=False):
'''
url - url to load
data - optional post data
headers - headers to send with the request
timeout - use the cached copy if it is not older than the given number of seconds; -1 means always use the cache if an entry exists
valid - optional function that is passed (result, headers) and returns whether the result is ok; if it returns a false value, InvalidResult is raised and has to be handled by the caller
'''
if net.DEBUG:
print('ox.cache.read_url', url)
#FIXME: send last-modified / etag from cache and only update if needed
#url = _fix_unicode_url(url)
result = store.get(url, data, headers, timeout)
url_headers = {}
if not result:
try:
url_headers, result = net.read_url(url, data, headers, return_headers=True)
except urllib.error.HTTPError as e:
e.headers['Status'] = "%s" % e.code
for key in e.headers:
url_headers[key.lower()] = e.headers[key]
result = e.read()
if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
if not valid or valid(result, url_headers):
store.set(url, post_data=data, data=result, headers=url_headers)
else:
raise InvalidResult(result, url_headers)
if unicode:
ctype = url_headers.get('content-type', '').lower()
if 'charset' in ctype:
encoding = ctype.split('charset=')[-1]
else:
encoding = detect_encoding(result)
if not encoding:
encoding = 'latin-1'
result = result.decode(encoding)
return result
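
A short sketch of how timeout and valid are meant to be combined; the URL is a placeholder:

from ox.cache import read_url, InvalidResult

def not_empty(result, headers):
    # only accept (and cache) responses that contain data
    return len(result) > 0

try:
    # reuse the cached copy if it is younger than one hour, otherwise fetch and cache again
    data = read_url('http://example.com/feed.xml', timeout=60*60, valid=not_empty, unicode=True)
except InvalidResult as e:
    print('rejected response, headers:', e.headers)
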
def save_url(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.makedirs(dirname)
data = read_url(url)
f = open(filename, 'w')
f.write(data)
f.close()
def cache_path():
return os.environ.get('oxCACHE', os.path.expanduser('~/.ox/cache'))
class Cache:
def __init__(self):
pass
def get(self, url, data, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
'''
if value == 'data', return the cached data for url, or None if it is not in the cache
if value == 'headers', return the cached headers for url
'''
pass
def set(self, url, post_data, data, headers):
pass
class SQLiteCache(Cache):
def __init__(self):
path = cache_path()
if not os.path.exists(path):
os.makedirs(path)
self.db = os.path.join(path, "cache.sqlite")
self.create()
def connect(self):
self.conn = sqlite3.connect(self.db, timeout=10)
return self.conn
def create(self):
conn = self.connect()
c = conn.cursor()
# Create table and indexes
c.execute('''CREATE TABLE IF NOT EXISTS cache (url_hash varchar(42) unique, domain text, url text,
post_data text, headers text, created int, data blob, only_headers int)''')
c.execute('''CREATE INDEX IF NOT EXISTS cache_domain ON cache (domain)''')
c.execute('''CREATE INDEX IF NOT EXISTS cache_url ON cache (url)''')
c.execute('''CREATE INDEX IF NOT EXISTS cache_url_hash ON cache (url_hash)''')
c.execute('''CREATE TABLE IF NOT EXISTS setting (key varchar(1024) unique, value text)''')
if int(self.get_setting(c, 'version', 0)) < 1:
self.set_setting(c, 'version', 1)
c.execute('''ALTER TABLE cache ADD compressed INT DEFAULT 0''')
conn.commit()
def get_setting(self, c, key, default=None):
c.execute('SELECT value FROM setting WHERE key = ?', (key, ))
for row in c:
return row[0]
return default
def set_setting(self, c, key, value):
c.execute(u'INSERT OR REPLACE INTO setting values (?, ?)', (key, str(value)))
def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
r = None
if timeout == 0:
return r
if data:
url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
conn = self.connect()
c = conn.cursor()
sql = 'SELECT %s, compressed FROM cache WHERE url_hash=?' % value
if timeout > 0:
now = time.mktime(time.localtime())
t = (url_hash, now-timeout)
sql += ' AND created > ?'
else:
t = (url_hash, )
if value != "headers":
sql += ' AND only_headers != 1 '
c.execute(sql, t)
for row in c:
r = row[0]
if value == 'headers':
r = json.loads(r)
elif value == 'data':
if row[1] == 1:
r = zlib.decompress(r)
else:
r = str(r)
break
c.close()
conn.close()
return r
def set(self, url, post_data, data, headers):
if post_data:
url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
conn = self.connect()
c = conn.cursor()
# Insert a row of data
if not post_data: post_data=""
only_headers = 0
if data == -1:
only_headers = 1
data = ""
created = time.mktime(time.localtime())
content_type = headers.get('content-type', '').split(';')[0].strip()
if content_type in COMPRESS_TYPES:
compressed = 1
data = zlib.compress(data)
else:
compressed = 0
data = sqlite3.Binary(data)
#fixme: this looks wrong
try:
_headers = json.dumps(headers)
except:
for h in headers:
headers[h] = headers[h].decode(detect_encoding(headers[h]))
_headers = json.dumps(headers)
t = (url_hash, domain, url, post_data, _headers, created,
data, only_headers, compressed)
c.execute(u"""INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?, ?, ?, ?)""", t)
# Save (commit) the changes and clean up
conn.commit()
c.close()
conn.close()
class FileCache(Cache):
def __init__(self):
f, self.root = cache_path().split(':')
def files(self, domain, h):
prefix = os.path.join(self.root, domain, h[:2], h[2:4], h[4:6], h[6:8])
i = os.path.join(prefix, '%s.json'%h)
f = os.path.join(prefix, '%s.dat'%h)
return prefix, i, f
def get(self, url, data={}, headers=DEFAULT_HEADERS, timeout=-1, value="data"):
r = None
if timeout == 0:
return r
if data:
url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
prefix, i, f = self.files(domain, url_hash)
if os.path.exists(i):
with open(i) as _i:
try:
info = json.load(_i)
except:
return r
now = time.mktime(time.localtime())
expired = now-timeout
if value != 'headers' and info['only_headers']:
return None
if timeout < 0 or info['created'] > expired:
if value == 'headers':
r = info['headers']
else:
with open(f) as data:
r = data.read()
if info['compressed']:
r = zlib.decompress(r)
return r
def set(self, url, post_data, data, headers):
if post_data:
url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
prefix, i, f = self.files(domain, url_hash)
makedirs(prefix)
created = time.mktime(time.localtime())
content_type = headers.get('content-type', '').split(';')[0].strip()
info = {
'compressed': content_type in COMPRESS_TYPES,
'only_headers': data == -1,
'created': created,
'headers': headers,
'url': url,
}
if post_data:
info['post_data'] = post_data
if not info['only_headers']:
if info['compressed']:
data = zlib.compress(data)
with open(f, 'w') as _f:
_f.write(data)
with open(i, 'w') as _i:
json.dump(info, _i)
if cache_path().startswith('fs:'):
store = FileCache()
else:
store = SQLiteCache()
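
The backend is picked once, at import time, from the oxCACHE environment variable: the default is the SQLite database under ~/.ox/cache, and a value of the form fs:/some/path selects the file-based cache instead. A sketch, with an example path:

import os
os.environ['oxCACHE'] = 'fs:' + os.path.expanduser('~/.ox/cache')  # must be set before ox.cache is imported
import ox.cache
print(type(ox.cache.store).__name__)  # FileCache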

View file

@@ -0,0 +1 @@
from actions import actions

View file

@@ -0,0 +1,143 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division, with_statement
import inspect
import sys
from django.conf import settings
from ..shortcuts import render_to_json_response, json_response
from ...utils import json
def autodiscover():
#register api actions from all installed apps
from django.utils.importlib import import_module
from django.utils.module_loading import module_has_submodule
for app in settings.INSTALLED_APPS:
if app != 'api':
mod = import_module(app)
try:
import_module('%s.views'%app)
except:
if module_has_submodule(mod, 'views'):
raise
def trim(docstring):
if not docstring:
return ''
# Convert tabs to spaces (following the normal Python rules)
# and split into a list of lines:
lines = docstring.expandtabs().splitlines()
# Determine minimum indentation (first line doesn't count):
indent = sys.maxint
for line in lines[1:]:
stripped = line.lstrip()
if stripped:
indent = min(indent, len(line) - len(stripped))
# Remove indentation (first line is special):
trimmed = [lines[0].strip()]
if indent < sys.maxint:
for line in lines[1:]:
trimmed.append(line[indent:].rstrip())
# Strip off trailing and leading blank lines:
while trimmed and not trimmed[-1]:
trimmed.pop()
while trimmed and not trimmed[0]:
trimmed.pop(0)
# Return a single string:
return '\n'.join(trimmed)
class ApiActions(dict):
properties = {}
versions = {}
def __init__(self):
def api(request):
'''
returns list of all known api actions
param data {
docs: bool
}
if docs is true, action properties contain docstrings
return {
status: {'code': int, 'text': string},
data: {
actions: {
'api': {
cache: true,
doc: 'recursion'
},
'hello': {
cache: true,
..
}
...
}
}
}
'''
data = json.loads(request.POST.get('data', '{}'))
docs = data.get('docs', False)
code = data.get('code', False)
version = getattr(request, 'version', None)
if version:
_actions = self.versions.get(version, {}).keys()
_actions = list(set(_actions + self.keys()))
else:
_actions = self.keys()
_actions.sort()
actions = {}
for a in _actions:
actions[a] = self.properties[a]
if docs:
actions[a]['doc'] = self.doc(a, version)
if code:
actions[a]['code'] = self.code(a, version)
response = json_response({'actions': actions})
return render_to_json_response(response)
self.register(api)
def doc(self, name, version=None):
if version:
f = self.versions[version].get(name, self.get(name))
else:
f = self[name]
return trim(f.__doc__)
def code(self, name, version=None):
if version:
f = self.versions[version].get(name, self.get(name))
else:
f = self[name]
if name != 'api' and hasattr(f, 'func_closure') and f.func_closure:
fc = filter(lambda c: hasattr(c.cell_contents, '__call__'), f.func_closure)
f = fc[len(fc)-1].cell_contents
info = f.func_code.co_filename[len(settings.PROJECT_ROOT)+1:]
info = u'%s:%s' % (info, f.func_code.co_firstlineno)
return info, trim(inspect.getsource(f))
def register(self, method, action=None, cache=True, version=None):
if not action:
action = method.func_name
if version:
if not version in self.versions:
self.versions[version] = {}
self.versions[version][action] = method
else:
self[action] = method
self.properties[action] = {'cache': cache}
def unregister(self, action):
if action in self:
del self[action]
actions = ApiActions()
def error(request):
'''
this action is used to test api error codes; it should return a 503 error
'''
success = error_is_success
return render_to_json_response({})
actions.register(error)
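
autodiscover() imports the views module of every installed app, so an app exposes an action simply by registering a function there. A sketch of what such a views.py could look like, assuming the package layout ox.django.api; the app, action name and payload are made up:

# myapp/views.py (hypothetical app)
from ox.django.api import actions
from ox.django.shortcuts import render_to_json_response, json_response

def hello(request):
    '''
    returns a greeting
    '''
    return render_to_json_response(json_response({'text': 'hello'}))
actions.register(hello, cache=False)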

View file

@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from django.conf.urls import patterns
import views
import actions
actions.autodiscover()
urlpatterns = patterns("",
(r'^$', views.api),
)

View file

@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division, with_statement
from django.shortcuts import render_to_response
from django.template import RequestContext
from django.conf import settings
from ..shortcuts import render_to_json_response, json_response
from actions import actions
def api(request):
if request.META['REQUEST_METHOD'] == "OPTIONS":
response = render_to_json_response({'status': {'code': 200,
'text': 'use POST'}})
response['Access-Control-Allow-Origin'] = '*'
return response
if not 'action' in request.POST:
methods = actions.keys()
api = []
for f in sorted(methods):
api.append({'name': f,
'doc': actions.doc(f).replace('\n', '<br>\n')})
context = RequestContext(request, {
'api': api,
'settings': settings,
'sitename': settings.SITENAME
})
return render_to_response('api.html', context)
action = request.POST['action']
version = getattr(request, 'version', None)
if version:
f = actions.versions.get(version, {}).get(action, actions.get(action))
else:
f = actions.get(action)
if f:
response = f(request)
else:
response = render_to_json_response(json_response(status=400,
text='Unknown action %s' % action))
response['Access-Control-Allow-Origin'] = '*'
return response

View file

@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
try:
from django.contrib.auth.decorators import wraps
except:
from django.utils.functional import wraps
from shortcuts import render_to_json_response
def login_required_json(function=None):
"""
Decorator for views that checks that the user is logged in;
returns a JSON error if not.
"""
def _wrapped_view(request, *args, **kwargs):
if request.user.is_authenticated():
return function(request, *args, **kwargs)
return render_to_json_response({'status': {'code': 401, 'text': 'login required'}})
return wraps(function)(_wrapped_view)
def admin_required_json(function=None):
"""
Decorator for views that checks that the user is logged in and has admin level;
returns a JSON error if not.
"""
def _wrapped_view(request, *args, **kwargs):
if request.user.is_authenticated() and request.user.get_profile().get_level() == 'admin':
return function(request, *args, **kwargs)
return render_to_json_response({'status': {'code': 403, 'text': 'permission denied'}})
return wraps(function)(_wrapped_view)
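
A sketch of wrapping an API action with one of these decorators; the action body is illustrative only:

from ox.django.decorators import login_required_json
from ox.django.shortcuts import render_to_json_response, json_response

@login_required_json
def find_private(request):
    # only reached for authenticated users; anonymous requests get a 401 JSON response
    return render_to_json_response(json_response({'items': []}))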

View file

@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import time
import datetime
from django.db import models
from django.utils import datetime_safe
from ox.utils import json
def to_json(python_object):
if isinstance(python_object, datetime.datetime):
if python_object.year < 1900:
tt = python_object.timetuple()
value = '%d-%02d-%02dT%02d:%02d:%02dZ' % tuple(list(tt)[:6])
else:
value = python_object.strftime('%Y-%m-%dT%H:%M:%SZ')
return {'__class__': 'datetime.datetime',
'__value__': value}
if isinstance(python_object, datetime_safe.datetime):
return {'__class__': 'datetime.datetime',
'__value__': python_object.strftime('%Y-%m-%dT%H:%M:%SZ')}
if isinstance(python_object, time.struct_time):
return {'__class__': 'time.asctime',
'__value__': time.asctime(python_object)}
try:
if isinstance(python_object, bytes):
return {'__class__': 'bytes',
'__value__': list(python_object)}
except:
pass
raise TypeError(repr(python_object) + ' is not JSON serializable')
def from_json(json_object):
if '__class__' in json_object:
if json_object['__class__'] == 'bytes':
return bytes(json_object['__value__'])
if json_object['__class__'] == 'datetime_safe.datetime' \
or json_object['__class__'] == 'datetime.datetime':
return datetime_safe.datetime.strptime(json_object['__value__'], '%Y-%m-%dT%H:%M:%SZ')
if json_object['__class__'] == 'time.asctime':
return time.strptime(json_object['__value__'])
return json_object
class DictField(models.TextField):
"""DictField is a textfield that contains JSON-serialized dictionaries."""
# Used so to_python() is called
__metaclass__ = models.SubfieldBase
def to_python(self, value):
"""Convert our string value to python after we load it from the DB"""
if value == None:
return value
if isinstance(value, dict):
return value
try:
value = json.loads(value, object_hook=from_json)
except: #this is required to load fixtures
value = eval(value)
assert isinstance(value, dict)
return value
def get_db_prep_save(self, value, connection):
"""Convert our JSON object to a string before we save"""
if value == None:
return value
if isinstance(value, basestring):
value = eval(value)
assert isinstance(value, dict)
value = json.dumps(value, default=to_json)
return super(DictField, self).get_db_prep_save(value, connection=connection)
class TupleField(models.TextField):
"""TupleField is a textfield that contains JSON-serialized tuples."""
# Used so to_python() is called
__metaclass__ = models.SubfieldBase
def to_python(self, value):
"""Convert our string value to JSON after we load it from the DB"""
if isinstance(value, tuple):
return value
try:
value = json.loads(value, object_hook=from_json)
except: #this is required to load fixtures
value = eval(value)
assert isinstance(value, list)
return tuple(value)
def get_db_prep_save(self, value, connection):
"""Convert our JSON object to a string before we save"""
if isinstance(value, basestring):
value = eval(value)
if isinstance(value, list):
value = tuple(value)
assert isinstance(value, tuple)
value = json.dumps(value, default=to_json)
return super(TupleField, self).get_db_prep_save(value, connection=connection)
try:
from south.modelsinspector import add_introspection_rules
add_introspection_rules([], ["^ox.django\.fields\.DictField"])
add_introspection_rules([], ["^ox.django\.fields\.TupleField"])
except:
pass
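
A sketch of the two fields in a model definition; the model itself is made up:

from django.db import models
from ox.django.fields import DictField, TupleField

class Item(models.Model):            # hypothetical model
    data = DictField(default={})     # stored as JSON text, loaded back as a dict
    extent = TupleField(default=())  # stored as JSON text, loaded back as a tuple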

View file

@@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import os
import mimetypes
from datetime import datetime, timedelta
from urllib import quote
from django.http import HttpResponse, Http404
from django.conf import settings
def HttpFileResponse(path, content_type=None, filename=None):
if not os.path.exists(path):
raise Http404
if not content_type:
content_type = mimetypes.guess_type(path)[0]
if not content_type:
content_type = 'application/octet-stream'
if getattr(settings, 'XACCELREDIRECT', False):
response = HttpResponse()
response['Content-Length'] = os.stat(path).st_size
for PREFIX in ('STATIC', 'MEDIA'):
root = getattr(settings, PREFIX+'_ROOT', '')
url = getattr(settings, PREFIX+'_URL', '')
if root and path.startswith(root):
path = url + path[len(root)+1:]
if isinstance(path, unicode):
path = path.encode('utf-8')
response['X-Accel-Redirect'] = path
if content_type:
response['Content-Type'] = content_type
elif getattr(settings, 'XSENDFILE', False):
response = HttpResponse()
if isinstance(path, unicode):
path = path.encode('utf-8')
response['X-Sendfile'] = path
if content_type:
response['Content-Type'] = content_type
response['Content-Length'] = os.stat(path).st_size
else:
response = HttpResponse(open(path), content_type=content_type)
if filename:
if isinstance(filename, unicode):
filename = filename.encode('utf-8')
response['Content-Disposition'] = "attachment; filename*=UTF-8''%s" % quote(filename)
response['Expires'] = datetime.strftime(datetime.utcnow() + timedelta(days=1), "%a, %d-%b-%Y %H:%M:%S GMT")
def allow_access():
for key in ('X-Accel-Redirect', 'X-Sendfile'):
if key in response:
del response[key]
response['Access-Control-Allow-Origin'] = '*'
response.allow_access = allow_access
return response
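
A sketch of a download view built on HttpFileResponse; the path layout is a placeholder:

import os
from django.conf import settings
from ox.django.http import HttpFileResponse

def download(request, name):
    path = os.path.join(settings.MEDIA_ROOT, 'downloads', name)
    # served via X-Accel-Redirect / X-Sendfile if enabled in settings, streamed directly otherwise
    return HttpFileResponse(path, filename=name)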

View file

@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from shortcuts import HttpErrorJson, render_to_json_response
class ExceptionMiddleware(object):
def process_exception(self, request, exception):
if isinstance(exception, HttpErrorJson):
return render_to_json_response(exception.response)
return None
class ChromeFrameMiddleware(object):
def process_response(self, request, response):
response['X-UA-Compatible'] = 'chrome=1'
return response

View file

@@ -0,0 +1,113 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import os
import sys
import time
import signal
import threading
import atexit
import Queue
_interval = 1.0
_times = {}
_files = []
_running = False
_queue = Queue.Queue()
_lock = threading.Lock()
def _restart(path):
_queue.put(True)
prefix = 'monitor (pid=%d):' % os.getpid()
print >> sys.stderr, '%s Change detected to \'%s\'.' % (prefix, path)
print >> sys.stderr, '%s Triggering process restart.' % prefix
os.kill(os.getpid(), signal.SIGINT)
def _modified(path):
try:
# If path doesn't denote a file and we were previously
# tracking it, then it has been removed or the file type
# has changed, so force a restart. If we were not previously
# tracking the file, we can ignore it, as it is probably a
# pseudo reference such as a file extracted from a
# collection of modules contained in a zip file.
if not os.path.isfile(path):
return path in _times
# Check for when file last modified.
mtime = os.stat(path).st_mtime
if path not in _times:
_times[path] = mtime
# Force a restart when the modification time has changed, even
# if the new time is older, as that could indicate that an older
# file has been restored.
if mtime != _times[path]:
return True
except:
# If any exception occurred, the file has likely been
# removed just before stat(), so force a restart.
return True
return False
def _monitor():
while 1:
# Check modification times on all files in sys.modules.
for module in sys.modules.values():
if not hasattr(module, '__file__'):
continue
path = getattr(module, '__file__')
if not path:
continue
if os.path.splitext(path)[1] in ['.pyc', '.pyo', '.pyd']:
path = path[:-1]
if _modified(path):
return _restart(path)
# Check modification times on files which have
# specifically been registered for monitoring.
for path in _files:
if _modified(path):
return _restart(path)
# Go to sleep for specified interval.
try:
return _queue.get(timeout=_interval)
except:
pass
_thread = threading.Thread(target=_monitor)
_thread.setDaemon(True)
def _exiting():
try:
_queue.put(True)
except:
pass
_thread.join()
atexit.register(_exiting)
def track(path):
if not path in _files:
_files.append(path)
def start(interval=1.0):
global _interval
if interval < _interval:
_interval = interval
global _running
_lock.acquire()
if not _running:
_running = True
_thread.start()
_lock.release()
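
A sketch of wiring the monitor into a long-running process, assuming the module is importable as ox.django.monitor; the extra path is an example:

from ox.django import monitor

monitor.track('/srv/app/local_settings.py')  # watch a file that is not in sys.modules
monitor.start(interval=1.0)                  # poll every second; a change triggers SIGINT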

View file

@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from django.db.models.sql import Query
from django.db.models.sql.compiler import SQLCompiler
from django.db import connections
import django.db.models.query
'''
models.py:
-----------------------------------
from ox.django.query import QuerySet
class Manager(models.Manager):
def get_query_set(self):
return QuerySet(self.model)
class Model(models.Model):
...
objects = Manager()
'''
class SQLCompiler(SQLCompiler):
def get_ordering(self):
result, group_by = super(SQLCompiler, self).get_ordering()
if self.query.nulls_last and len(result):
if self.connection.vendor == 'sqlite':
_result = []
for r in result:
if r.endswith(' DESC'):
_r = r[:-len(' DESC')]
elif r.endswith(' ASC'):
_r = r[:-len(' ASC')]
_result.append(_r + ' IS NULL')
_result.append(r)
result = _result
else:
result = map(lambda e: e + ' NULLS LAST', result)
return result, group_by
class Query(Query):
nulls_last = False
def clone(self, *args, **kwargs):
obj = super(Query, self).clone(*args, **kwargs)
obj.nulls_last = self.nulls_last
return obj
def get_compiler(self, using=None, connection=None):
if using is None and connection is None:
raise ValueError("Need either using or connection")
if using:
connection = connections[using]
# Check that the compiler will be able to execute the query
for alias, aggregate in self.aggregate_select.items():
connection.ops.check_aggregate_support(aggregate)
return SQLCompiler(self, connection, using)
class QuerySet(django.db.models.query.QuerySet):
def __init__(self, model=None, query=None, using=None, **kwargs):
super(QuerySet, self).__init__(model=model, query=query, using=None, **kwargs)
self.query = query or Query(self.model)
def order_by(self, *args, **kwargs):
nulls_last = kwargs.pop('nulls_last', False)
obj = super(QuerySet, self).order_by(*args, **kwargs)
obj.query.nulls_last = nulls_last
return obj
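
With the Manager from the docstring above in place, the extra keyword can be used like this; Item and the field name are placeholders:

from myapp.models import Item  # hypothetical model whose manager returns this QuerySet

# rows with a NULL duration sort after all non-NULL values
items = Item.objects.all().order_by('duration', nulls_last=True)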

View file

@@ -0,0 +1,51 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import datetime
from django.utils import datetime_safe
from django.http import HttpResponse, Http404
try:
import simplejson as json
except ImportError:
from django.utils import simplejson as json
from django.conf import settings
class HttpErrorJson(Http404):
def __init__(self, response):
self.response = response
def json_response(data=None, status=200, text='ok'):
if not data:
data = {}
return {'status': {'code': status, 'text': text}, 'data': data}
def _to_json(python_object):
if isinstance(python_object, datetime.datetime):
if python_object.year < 1900:
tt = python_object.timetuple()
return '%d-%02d-%02dT%02d:%02d:%02dZ' % tuple(list(tt)[:6])
return python_object.strftime('%Y-%m-%dT%H:%M:%SZ')
if isinstance(python_object, datetime_safe.datetime):
return python_object.strftime('%Y-%m-%dT%H:%M:%SZ')
raise TypeError(u'%s %s is not JSON serializable' % (repr(python_object), type(python_object)))
def render_to_json_response(dictionary, content_type="text/json", status=200):
indent=None
if settings.DEBUG:
content_type = "text/javascript"
indent = 2
if getattr(settings, 'JSON_DEBUG', False):
print json.dumps(dictionary, indent=2, default=_to_json, ensure_ascii=False).encode('utf-8')
return HttpResponse(json.dumps(dictionary, indent=indent, default=_to_json,
ensure_ascii=False).encode('utf-8'), content_type=content_type, status=status)
def get_object_or_404_json(klass, *args, **kwargs):
from django.shortcuts import _get_queryset
queryset = _get_queryset(klass)
try:
return queryset.get(*args, **kwargs)
except queryset.model.DoesNotExist:
response = {'status': {'code': 404,
'text': '%s not found' % queryset.model._meta.object_name}}
raise HttpErrorJson(response)
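
A sketch of a view built from these helpers; get_object_or_404_json raises HttpErrorJson, which the ExceptionMiddleware shown earlier turns into a JSON error response:

from ox.django.shortcuts import json_response, render_to_json_response

def status(request):
    # responds with {"status": {"code": 200, "text": "ok"}, "data": {}}
    return render_to_json_response(json_response(text='ok'))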

View file

@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from django.utils.datetime_safe import datetime
from django.http import HttpResponse,Http404
from django.core.servers.basehttp import FileWrapper
from django.conf import settings
import mimetypes
import os
def basic_sendfile(fname,download_name=None):
if not os.path.exists(fname):
raise Http404
wrapper = FileWrapper(open(fname,"r"))
content_type = mimetypes.guess_type(fname)[0]
response = HttpResponse(wrapper, content_type=content_type)
response['Content-Length'] = os.path.getsize(fname)
if download_name:
response['Content-Disposition'] = "attachment; filename=%s"%download_name
return response
def x_sendfile(fname,download_name=None):
if not os.path.exists(fname):
raise Http404
content_type = mimetypes.guess_type(fname)[0]
response = HttpResponse('', content_type=content_type)
response['Content-Length'] = os.path.getsize(fname)
response['X-Sendfile'] = fname
if download_name:
response['Content-Disposition'] = "attachment; filename=%s"%download_name
return response
try:
__sendfile = getattr(settings,'SENDFILE',False) == 'x_sendfile'
except:
__sendfile = False
if __sendfile:
sendfile = x_sendfile
else:
sendfile = basic_sendfile

View file

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import cookielib
import urllib2
from StringIO import StringIO
from celery.utils import get_full_cls_name
from celery.backends import default_backend
from django.http import HttpResponse
from django.conf import settings
from shortcuts import json_response
import ox
def task_status(request, task_id):
response = json_response(status=200, text='ok')
status = default_backend.get_status(task_id)
res = default_backend.get_result(task_id)
response['data'] = {
'id': task_id,
'status': status,
'result': res
}
if status in default_backend.EXCEPTION_STATES:
traceback = default_backend.get_traceback(task_id)
response['data'].update({'result': str(res.args[0]),
'exc': get_full_cls_name(res.__class__),
'traceback': traceback})
return response
class SessionCookieJar(cookielib.LWPCookieJar):
def save(self):
return "#LWP-Cookies-2.0\n" + self.as_lwp_str()
def load(self, data, ignore_discard=True, ignore_expires=True):
f = StringIO(data)
self._really_load(f, 'memory', ignore_discard, ignore_expires)
def api_proxy(request):
'''
settings.OXAPI_URL =...
from ox.django.views import api_proxy
urlpatterns = patterns('',
url(r'^api/$', api_proxy)
'''
url = settings.OXAPI_URL
cj = SessionCookieJar()
if 'cj' in request.session:
cj.load(request.session['cj'])
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
opener.addheaders = [
('User-Agent', request.META.get('HTTP_USER_AGENT'))
]
form = ox.MultiPartForm()
for key in request.POST:
form.add_field(key, request.POST[key])
r = urllib2.Request(url)
body = str(form)
r.add_header('Content-type', form.get_content_type())
r.add_header('Content-length', len(body))
r.add_data(body)
f = opener.open(r)
response = HttpResponse(f.read())
request.session['cj'] = cj.save()
return response

View file

@@ -0,0 +1,9 @@
import django.newforms as forms
from string import Template
from django.utils.safestring import mark_safe
class FirefoggWidget(forms.FileInput):
def render(self, name, value, attrs=None):
tpl = Template(u"""<h1>This should be a Firefogg widget for $name, current value: $value</h1>""")
return mark_safe(tpl.substitute(name=name, value=value))

View file

@@ -0,0 +1,341 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import division, with_statement, print_function
import os
import hashlib
import re
import shutil
import struct
import subprocess
import sqlite3
from .utils import json
__all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs']
EXTENSIONS = {
'audio': [
'aac', 'aif', 'aiff',
'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma'
],
'image': [
'bmp', 'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp'
],
'subtitle': [
'idx', 'srt', 'sub'
],
'video': [
'3gp',
'avi', 'divx', 'dv', 'flv', 'm2t', 'm4v', 'mkv', 'mov', 'mp4',
'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'vob', 'webm', 'wmv',
'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD
'mxf', 'ts'
],
}
def cmd(program):
local = os.path.expanduser('~/.ox/bin/%s' % program)
if os.path.exists(local):
program = local
return program
def _get_file_cache():
import ox.cache
path = ox.cache.cache_path()
if path.startswith('fs:'):
path = path[3:]
return os.path.join(path, 'files.sqlite')
def cache(filename, type='oshash'):
conn = sqlite3.connect(_get_file_cache(), timeout=10)
conn.text_factory = str
conn.row_factory = sqlite3.Row
if not cache.init:
c = conn.cursor()
c.execute('CREATE TABLE IF NOT EXISTS cache (path varchar(1024) unique, oshash varchar(16), sha1 varchar(42), size int, mtime int, info text)')
c.execute('CREATE INDEX IF NOT EXISTS cache_oshash ON cache (oshash)')
c.execute('CREATE INDEX IF NOT EXISTS cache_sha1 ON cache (sha1)')
conn.commit()
cache.init = True
c = conn.cursor()
c.execute('SELECT oshash, sha1, info, size, mtime FROM cache WHERE path = ?', (filename, ))
stat = os.stat(filename)
row = None
h = None
sha1 = None
info = ''
for row in c:
if stat.st_size == row['size'] and int(stat.st_mtime) == int(row['mtime']):
value = row[type]
if value:
if type == 'info':
value = json.loads(value)
return value
h = row['oshash']
sha1 = row['sha1']
info = row['info']
if type == 'oshash':
value = h = oshash(filename, cached=False)
elif type == 'sha1':
value = sha1 = sha1sum(filename, cached=False)
elif type == 'info':
value = avinfo(filename, cached=False)
info = json.dumps(value)
t = (filename, h, sha1, stat.st_size, int(stat.st_mtime), info)
with conn:
sql = u'INSERT OR REPLACE INTO cache values (?, ?, ?, ?, ?, ?)'
c.execute(sql, t)
return value
cache.init = None
def cleanup_cache():
conn = sqlite3.connect(_get_file_cache(), timeout=10)
conn.text_factory = str
conn.row_factory = sqlite3.Row
c = conn.cursor()
c.execute('SELECT path FROM cache')
paths = [r[0] for r in c]
for path in paths:
if not os.path.exists(path):
c.execute('DELETE FROM cache WHERE path = ?', (path, ))
conn.commit()
c.execute('VACUUM')
conn.commit()
def sha1sum(filename, cached=False):
if cached:
return cache(filename, 'sha1')
sha1 = hashlib.sha1()
with open(filename, 'rb') as f:
for chunk in iter(lambda: f.read(128*sha1.block_size), b''):
sha1.update(chunk)
return sha1.hexdigest()
'''
os hash - http://trac.opensubtitles.org/projects/opensubtitles/wiki/HashSourceCodes
with a modification for files < 64k: the buffer is filled with the file data and padded with 0
'''
def oshash(filename, cached=True):
if cached:
return cache(filename, 'oshash')
try:
longlongformat = 'q' # long long
bytesize = struct.calcsize(longlongformat)
f = open(filename, "rb")
filesize = os.path.getsize(filename)
hash = filesize
if filesize < 65536:
for x in range(int(filesize/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
else:
for x in range(int(65536/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #to remain as 64bit number
f.seek(max(0,filesize-65536),0)
for x in range(int(65536/bytesize)):
buffer = f.read(bytesize)
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF
f.close()
returnedhash = "%016x" % hash
return returnedhash
except(IOError):
return "IOError"
def avinfo(filename, cached=True):
if cached:
return cache(filename, 'info')
if os.path.getsize(filename):
ffmpeg2theora = cmd('ffmpeg2theora')
p = subprocess.Popen([ffmpeg2theora], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
info, error = p.communicate()
version = info.split('\n')[0].split(' - ')[0].split(' ')[-1]
if version < '0.27':
raise EnvironmentError('version of ffmpeg2theora needs to be 0.27 or later, found %s' % version)
p = subprocess.Popen([ffmpeg2theora, '--info', filename],
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
info, error = p.communicate()
try:
info = json.loads(info)
except:
#remove metadata, can be broken
reg = re.compile('"metadata": {.*?},', re.DOTALL)
info = re.sub(reg, '', info)
info = json.loads(info)
if 'video' in info:
for v in info['video']:
if not 'display_aspect_ratio' in v and 'width' in v:
v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height'])
v['pixel_aspect_ratio'] = '1:1'
if len(info.get('audio', [])) > 1:
if 'metadata' in info['audio'][0]:
for stream in info['audio']:
language = stream.get('metadata', {}).get('language')
if language and language != 'und':
stream['language'] = language[0]
else:
ffmpeg = cmd('ffmpeg')
p = subprocess.Popen([ffmpeg, '-i', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
languages = [re.compile('\((.+?)\):').findall(l) for l in stderr.split('\n') if 'Stream' in l and 'Audio' in l]
for i, stream in enumerate(info['audio']):
language = languages[i]
if language and language[0] != 'und':
stream['language'] = language[0]
return info
return {'path': filename, 'size': 0}
def ffprobe(filename):
p = subprocess.Popen([
cmd('ffprobe'),
'-show_format',
'-show_streams',
'-print_format',
'json',
'-i', filename
], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
info, error = p.communicate()
ffinfo = json.loads(info)
def fix_value(key, value):
if key == 'r_frame_rate':
value = value.replace('/', ':')
elif key == 'bit_rate':
value = float(value) / 1000
elif key == 'duration':
value = float(value)
elif key == 'size':
value = int(value)
return value
info = {}
for key in ('duration', 'size', 'bit_rate'):
info[{
'bit_rate': 'bitrate'
}.get(key, key)] = fix_value(key, ffinfo['format'][key])
info['audio'] = []
info['video'] = []
info['metadata'] = ffinfo['format'].get('tags', {})
for s in ffinfo['streams']:
tags = s.pop('tags', {})
language = None
for t in tags:
if t == 'language':
language = tags[t]
else:
info['metadata'][t] = tags[t]
if s.get('codec_type') in ('audio', 'video'):
stream = {}
if language and language != 'und':
stream['language'] = language
keys = [
'codec_name',
'width',
'height',
'bit_rate',
'index',
'display_aspect_ratio',
'sample_rate',
'channels',
]
if s['codec_type'] == 'video':
keys += [
'sample_aspect_ratio',
'r_frame_rate',
'pix_fmt',
]
for key in keys:
if key in s:
stream[{
'codec_name': 'codec',
'bit_rate': 'bitrate',
'index': 'id',
'r_frame_rate': 'framerate',
'sample_rate': 'samplerate',
'pix_fmt': 'pixel_format',
}.get(key, key)] = fix_value(key, s[key])
info[s['codec_type']].append(stream)
else:
pass
#print s
for v in info['video']:
if not 'display_aspect_ratio' in v and 'width' in v:
v['display_aspect_ratio'] = '%d:%d' % (v['width'], v['height'])
v['pixel_aspect_ratio'] = '1:1'
info['oshash'] = oshash(filename)
info['path'] = os.path.basename(filename)
return info
def makedirs(path):
if not os.path.exists(path):
try:
os.makedirs(path)
except OSError as e:
if e.errno != 17:
raise
def copy_file(source, target, verbose=False):
if verbose:
print('copying', source, 'to', target)
write_path(target)
shutil.copyfile(source, target)
def read_file(file, verbose=False):
if verbose:
print('reading', file)
f = open(file)
data = f.read()
f.close()
return data
def read_json(file, verbose=False):
if verbose:
print('reading', file)
with open(file) as fd:
data = json.load(fd)
return data
def write_file(file, data, verbose=False):
if verbose:
print('writing', file)
write_path(file)
f = open(file, 'w')
f.write(data)
f.close()
return len(data)
def write_image(file, image, verbose=False):
if verbose:
print('writing', file)
write_path(file)
image.save(file)
def write_json(file, data, ensure_ascii=True, indent=0, sort_keys=False, verbose=False):
data = json.dumps(data, ensure_ascii=ensure_ascii, indent=indent, sort_keys=sort_keys)
write_file(file, data if ensure_ascii else data.encode('utf-8'), verbose=verbose)
def write_link(source, target, verbose=False):
if verbose:
print('linking', source, 'to', target)
write_path(target)
if os.path.exists(target):
os.unlink(target)
os.symlink(source, target)
def write_path(file):
path = os.path.split(file)[0]
if path and not os.path.exists(path):
os.makedirs(path)
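
A short sketch of the typical entry points of this module; the path is a placeholder, and ffmpeg2theora or ffmpeg needs to be installed for avinfo to return stream data:

import ox

path = '/tmp/example.mp4'              # placeholder
print(ox.oshash(path, cached=False))   # 64-bit opensubtitles-style hash as hex
info = ox.avinfo(path, cached=False)   # duration, video/audio streams, etc.
print(info.get('duration'), info.get('video'))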

View file

@@ -0,0 +1,335 @@
# vi:si:et:sw=4:sts=4:ts=4
# -*- coding: utf-8 -*-
# from http://blog.lumino.so/2012/08/20/fix-unicode-mistakes-with-python/
# MIT
from __future__ import print_function
import unicodedata
from six import unichr
__all__ = ['fix_bad_unicode']
def fix_bad_unicode(text):
"""
Something you will find all over the place, in real-world text, is text
that's mistakenly encoded as utf-8, decoded in some ugly format like
latin-1 or even Windows codepage 1252, and encoded as utf-8 again.
This causes your perfectly good Unicode-aware code to end up with garbage
text because someone else (or maybe "someone else") made a mistake.
This function looks for the evidence of that having happened and fixes it.
It determines whether it should replace nonsense sequences of single-byte
characters that were really meant to be UTF-8 characters, and if so, turns
them into the correctly-encoded Unicode character that they were meant to
represent.
The input to the function must be Unicode. It's not going to try to
auto-decode bytes for you -- then it would just create the problems it's
supposed to fix.
>>> fix_bad_unicode(u'único')
'único'
>>> fix_bad_unicode('This text is fine already :þ')
'This text is fine already :þ'
Because these characters often come from Microsoft products, we allow
for the possibility that we get not just Unicode characters 128-255, but
also Windows's conflicting idea of what characters 128-160 are.
>>> fix_bad_unicode('This — should be an em dash')
'This — should be an em dash'
We might have to deal with both Windows characters and raw control
characters at the same time, especially when dealing with characters like
\x81 that have no mapping in Windows.
>>> fix_bad_unicode('This text is sad .â\x81”.')
'This text is sad .⁔.'
This function even fixes multiple levels of badness:
>>> wtf = '\xc3\xa0\xc2\xb2\xc2\xa0_\xc3\xa0\xc2\xb2\xc2\xa0'
>>> fix_bad_unicode(wtf)
'ಠ_ಠ'
However, it has safeguards against fixing sequences of letters and
punctuation that can occur in valid text:
>>> fix_bad_unicode('not such a fan of Charlotte Brontë…”')
'not such a fan of Charlotte Brontë…”'
Cases of genuine ambiguity can sometimes be addressed by finding other
characters that are not double-encoding, and expecting the encoding to
be consistent:
>>> fix_bad_unicode('AHÅ™, the new sofa from IKEA®')
'AHÅ™, the new sofa from IKEA®'
Finally, we handle the case where the text is in a single-byte encoding
that was intended as Windows-1252 all along but read as Latin-1:
>>> fix_bad_unicode('This text was never Unicode at all\x85')
'This text was never Unicode at all…'
"""
if not isinstance(text, str):
raise TypeError("This isn't even decoded into Unicode yet. "
"Decode it first.")
if len(text) == 0:
return text
maxord = max(ord(char) for char in text)
tried_fixing = []
if maxord < 128:
# Hooray! It's ASCII!
return text
else:
attempts = [(text, text_badness(text) + len(text))]
if maxord < 256:
tried_fixing = reinterpret_latin1_as_utf8(text)
tried_fixing2 = reinterpret_latin1_as_windows1252(text)
attempts.append((tried_fixing, text_cost(tried_fixing)))
attempts.append((tried_fixing2, text_cost(tried_fixing2)))
elif all(ord(char) in WINDOWS_1252_CODEPOINTS for char in text):
tried_fixing = reinterpret_windows1252_as_utf8(text)
attempts.append((tried_fixing, text_cost(tried_fixing)))
else:
# We can't imagine how this would be anything but valid text.
return text
# Sort the results by badness
attempts.sort(key=lambda x: x[1])
#print attempts
goodtext = attempts[0][0]
if goodtext == text:
return goodtext
else:
return fix_bad_unicode(goodtext)
def reinterpret_latin1_as_utf8(wrongtext):
newbytes = wrongtext.encode('latin-1', 'replace')
return newbytes.decode('utf-8', 'replace')
def reinterpret_windows1252_as_utf8(wrongtext):
altered_bytes = []
for char in wrongtext:
if ord(char) in WINDOWS_1252_GREMLINS:
altered_bytes.append(char.encode('WINDOWS_1252'))
else:
altered_bytes.append(char.encode('latin-1', 'replace'))
return b''.join(altered_bytes).decode('utf-8', 'replace')
def reinterpret_latin1_as_windows1252(wrongtext):
"""
Maybe this was always meant to be in a single-byte encoding, and it
makes the most sense in Windows-1252.
"""
return wrongtext.encode('latin-1').decode('WINDOWS_1252', 'replace')
def text_badness(text):
'''
Look for red flags that text is encoded incorrectly:
Obvious problems:
- The replacement character \ufffd, indicating a decoding error
- Unassigned or private-use Unicode characters
Very weird things:
- Adjacent letters from two different scripts
- Letters in scripts that are very rarely used on computers (and
therefore, someone who is using them will probably get Unicode right)
- Improbable control characters, such as 0x81
Moderately weird things:
- Improbable single-byte characters, such as ƒ or ¬
- Letters in somewhat rare scripts
'''
assert isinstance(text, str)
errors = 0
very_weird_things = 0
weird_things = 0
prev_letter_script = None
for pos in range(len(text)):
char = text[pos]
index = ord(char)
if index < 256:
# Deal quickly with the first 256 characters.
weird_things += SINGLE_BYTE_WEIRDNESS[index]
if SINGLE_BYTE_LETTERS[index]:
prev_letter_script = 'latin'
else:
prev_letter_script = None
else:
category = unicodedata.category(char)
if category == 'Co':
# Unassigned or private use
errors += 1
elif index == 0xfffd:
# Replacement character
errors += 1
elif index in WINDOWS_1252_GREMLINS:
lowchar = char.encode('WINDOWS_1252').decode('latin-1')
weird_things += SINGLE_BYTE_WEIRDNESS[ord(lowchar)] - 0.5
if category.startswith('L'):
# It's a letter. What kind of letter? This is typically found
# in the first word of the letter's Unicode name.
name = unicodedata.name(char)
scriptname = name.split()[0]
freq, script = SCRIPT_TABLE.get(scriptname, (0, 'other'))
if prev_letter_script:
if script != prev_letter_script:
very_weird_things += 1
if freq == 1:
weird_things += 2
elif freq == 0:
very_weird_things += 1
prev_letter_script = script
else:
prev_letter_script = None
return 100 * errors + 10 * very_weird_things + weird_things
def text_cost(text):
"""
Assign a cost function to the length plus weirdness of a text string.
"""
return text_badness(text) + len(text)
#######################################################################
# The rest of this file is esoteric info about characters, scripts, and their
# frequencies.
#
# Start with an inventory of "gremlins", which are characters from all over
# Unicode that Windows has instead assigned to the control characters
# 0x80-0x9F. We might encounter them in their Unicode forms and have to figure
# out what they were originally.
WINDOWS_1252_GREMLINS = [
# adapted from http://effbot.org/zone/unicode-gremlins.htm
0x0152, # LATIN CAPITAL LIGATURE OE
0x0153, # LATIN SMALL LIGATURE OE
0x0160, # LATIN CAPITAL LETTER S WITH CARON
0x0161, # LATIN SMALL LETTER S WITH CARON
0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
0x017E, # LATIN SMALL LETTER Z WITH CARON
0x017D, # LATIN CAPITAL LETTER Z WITH CARON
0x0192, # LATIN SMALL LETTER F WITH HOOK
0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
0x02DC, # SMALL TILDE
0x2013, # EN DASH
0x2014, # EM DASH
0x201A, # SINGLE LOW-9 QUOTATION MARK
0x201C, # LEFT DOUBLE QUOTATION MARK
0x201D, # RIGHT DOUBLE QUOTATION MARK
0x201E, # DOUBLE LOW-9 QUOTATION MARK
0x2018, # LEFT SINGLE QUOTATION MARK
0x2019, # RIGHT SINGLE QUOTATION MARK
0x2020, # DAGGER
0x2021, # DOUBLE DAGGER
0x2022, # BULLET
0x2026, # HORIZONTAL ELLIPSIS
0x2030, # PER MILLE SIGN
0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
0x20AC, # EURO SIGN
0x2122, # TRADE MARK SIGN
]
# a list of Unicode characters that might appear in Windows-1252 text
WINDOWS_1252_CODEPOINTS = list(range(256)) + WINDOWS_1252_GREMLINS
# Rank the characters typically represented by a single byte -- that is, in
# Latin-1 or Windows-1252 -- by how weird it would be to see them in running
# text.
#
# 0 = not weird at all
# 1 = rare punctuation or rare letter that someone could certainly
# have a good reason to use. All Windows-1252 gremlins are at least
# weirdness 1.
# 2 = things that probably don't appear next to letters or other
# symbols, such as math or currency symbols
# 3 = obscure symbols that nobody would go out of their way to use
# (includes symbols that were replaced in ISO-8859-15)
# 4 = why would you use this?
# 5 = unprintable control character
#
# The Portuguese letter Ã (0xc3) is marked as weird because it would usually
# appear in the middle of a word in actual Portuguese, and meanwhile it
# appears in the mis-encodings of many common characters.
SINGLE_BYTE_WEIRDNESS = (
# 0 1 2 3 4 5 6 7 8 9 a b c d e f
5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 5, 5, 5, # 0x00
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, # 0x10
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x20
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x30
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x40
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x50
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x60
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, # 0x70
2, 5, 1, 4, 1, 1, 3, 3, 4, 3, 1, 1, 1, 5, 1, 5, # 0x80
5, 1, 1, 1, 1, 3, 1, 1, 4, 1, 1, 1, 1, 5, 1, 1, # 0x90
1, 0, 2, 2, 3, 2, 4, 2, 4, 2, 2, 0, 3, 1, 1, 4, # 0xa0
2, 2, 3, 3, 4, 3, 3, 2, 4, 4, 4, 0, 3, 3, 3, 0, # 0xb0
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xc0
1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, # 0xd0
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xe0
1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, # 0xf0
)
# Pre-cache the Unicode data saying which of these first 256 characters are
# letters. We'll need it often.
SINGLE_BYTE_LETTERS = [
unicodedata.category(unichr(i)).startswith('L')
for i in range(256)
]
# A table telling us how to interpret the first word of a letter's Unicode
# name. The number indicates how frequently we expect this script to be used
# on computers. Many scripts not included here are assumed to have a frequency
# of "0" -- if you're going to write in Linear B using Unicode, you're
# probably aware enough of encoding issues to get it right.
#
# The lowercase name is a general category -- for example, Han characters and
# Hiragana characters are very frequently adjacent in Japanese, so they all go
# into category 'cjk'. Letters of different categories are assumed not to
# appear next to each other often.
SCRIPT_TABLE = {
'LATIN': (3, 'latin'),
'CJK': (2, 'cjk'),
'ARABIC': (2, 'arabic'),
'CYRILLIC': (2, 'cyrillic'),
'GREEK': (2, 'greek'),
'HEBREW': (2, 'hebrew'),
'KATAKANA': (2, 'cjk'),
'HIRAGANA': (2, 'cjk'),
'HIRAGANA-KATAKANA': (2, 'cjk'),
'HANGUL': (2, 'cjk'),
'DEVANAGARI': (2, 'devanagari'),
'THAI': (2, 'thai'),
'FULLWIDTH': (2, 'cjk'),
'MODIFIER': (2, None),
'HALFWIDTH': (1, 'cjk'),
'BENGALI': (1, 'bengali'),
'LAO': (1, 'lao'),
'KHMER': (1, 'khmer'),
'TELUGU': (1, 'telugu'),
'MALAYALAM': (1, 'malayalam'),
'SINHALA': (1, 'sinhala'),
'TAMIL': (1, 'tamil'),
'GEORGIAN': (1, 'georgian'),
'ARMENIAN': (1, 'armenian'),
'KANNADA': (1, 'kannada'), # mostly used for looks of disapproval
'MASCULINE': (1, 'latin'),
'FEMININE': (1, 'latin')
}

View file

@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2014
from __future__ import with_statement, print_function
import itertools
import mimetypes
import random
import sys
__all__ = ['MultiPartForm']
# from /usr/lib/python3.4/email/generator.py
# Helper used by Generator._make_boundary
_width = len(repr(sys.maxsize-1))
_fmt = '%%0%dd' % _width
def _make_boundary():
# Craft a random boundary.
token = random.randrange(sys.maxsize)
boundary = ('=' * 15) + (_fmt % token) + '=='
return boundary
class MultiPartForm(object):
"""Accumulate the data to be used when posting a form."""
def __init__(self):
self.form_fields = []
self.files = []
self.boundary = _make_boundary()
return
def get_content_type(self):
return 'multipart/form-data; boundary=%s' % self.boundary
def add_field(self, name, value):
"""Add a simple field to the form data."""
if isinstance(name, unicode):
name = name.encode('utf-8')
if isinstance(value, unicode):
value = value.encode('utf-8')
self.form_fields.append((name, value))
return
def add_file(self, fieldname, filename, fileHandle, mimetype=None):
"""Add a file to be uploaded."""
if isinstance(fieldname, unicode):
fieldname = fieldname.encode('utf-8')
if isinstance(filename, unicode):
filename = filename.encode('utf-8')
if hasattr(fileHandle, 'read'):
body = fileHandle.read()
else:
body = fileHandle
if mimetype is None:
mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
self.files.append((fieldname, filename, mimetype, body))
return
def __str__(self):
"""Return a string representing the form data, including attached files."""
# Build a list of lists, each containing "lines" of the
# request. Each part is separated by a boundary string.
# Once the list is built, return a string where each
# line is separated by '\r\n'.
parts = []
part_boundary = '--' + self.boundary
# Add the form fields
parts.extend(
[ part_boundary,
'Content-Disposition: form-data; name="%s"' % name,
'',
value,
]
for name, value in self.form_fields
)
# Add the files to upload
parts.extend(
[ part_boundary,
'Content-Disposition: file; name="%s"; filename="%s"' % \
(field_name, filename),
'Content-Type: %s' % content_type,
'',
body,
]
for field_name, filename, content_type, body in self.files
)
# Flatten the list and add closing boundary marker,
# then return CR+LF separated data
flattened = list(itertools.chain(*parts))
flattened.append('--' + self.boundary + '--')
flattened.append('')
return '\r\n'.join(flattened)
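
A sketch of building and posting a form with MultiPartForm, along the lines of what API._json_request above does; the URL is a placeholder:

from six.moves import urllib
from ox.form import MultiPartForm

form = MultiPartForm()
form.add_field('action', 'update')
form.add_file('file', 'notes.txt', 'hello world', mimetype='text/plain')

body = str(form)
request = urllib.request.Request('http://example.com/api/')  # placeholder URL
request.add_header('Content-Type', form.get_content_type())
request.add_header('Content-Length', str(len(body)))
# response = urllib.request.urlopen(request, body.encode('utf-8'))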

View file

@@ -0,0 +1,457 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import math
import re
import string
def toAZ(num):
"""
Converts an integer to bijective base 26 string using A-Z
>>> for i in range(1, 1000): assert fromAZ(toAZ(i)) == i
>>> toAZ(1)
'A'
>>> toAZ(4461)
'FOO'
>>> toAZ(1234567890)
'CYWOQVJ'
"""
if num < 1: raise ValueError("must supply a positive integer")
digits = string.ascii_uppercase
az = ''
while num != 0:
num, r = divmod(num, 26)
u, r = divmod(r - 1, 26)
num += u
az = digits[r] + az
return az
def fromAZ(num):
"""
Converts a bijective base 26 string to an integer
>>> fromAZ('A')
1
>>> fromAZ('AA')
27
>>> fromAZ('AAA')
703
>>> fromAZ('FOO')
4461
"""
num = num.replace('-','')
digits = string.ascii_uppercase
r = 0
for exp, char in enumerate(reversed(num)):
r = r + (pow(26, exp) * (digits.index(char) + 1))
return r
def to26(q):
"""
Converts an integer to base 26
>>> for i in range(0, 1000): assert from26(to26(i)) == i
>>> to26(0)
'A'
>>> to26(347485647)
'BDGKMAP'
"""
if q < 0: raise ValueError("must supply a positive integer")
base26 = string.ascii_uppercase
converted = []
while q != 0:
q, r = divmod(q, 26)
l = base26[r]
converted.insert(0, l)
return "".join(converted) or 'A'
def from26(q):
"""
Converts a base 26 string to an integer
>>> from26('A')
0
"""
base26 = string.ascii_uppercase
q = q.replace('-','')
r = 0
for i in q:
r = r * 26 + base26.index(i.upper())
return r
def to32(q):
"""
Converts an integer to base 32
We exclude 4 of the 26 letters: I L O U.
http://www.crockford.com/wrmg/base32.html
>>> for i in range(0, 1000): assert from32(to32(i)) == i
>>> to32(0)
'0'
>>> to32(347485647)
'ABCDEF'
>>> to32(555306645)
'GHJKMN'
>>> to32(800197332334559)
'PQRSTVWXYZ'
>>> to32(32)
'10'
>>> to32(119292)
'3MFW'
>>> to32(939387374)
'VZVTFE'
>>> to32(-1)
Traceback (most recent call last):
...
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError("must supply a positive integer")
letters = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
converted = []
while q != 0:
q, r = divmod(q, 32)
l = letters[r]
converted.insert(0, l)
return "".join(converted) or '0'
def from32(q):
"""
Converts an base 32 string to an integer
We exclude 4 of the 26 letters: I L O U.
http://www.crockford.com/wrmg/base32.html
>>> from32('A')
10
>>> from32('i')
1
>>> from32('Li1l')
33825
>>> from32('10')
32
"""
_32map = {
'0': 0,
'O': 0,
'1': 1,
'I': 1,
'L': 1,
'2': 2,
'3': 3,
'4': 4,
'5': 5,
'6': 6,
'7': 7,
'8': 8,
'9': 9,
'A': 10,
'B': 11,
'C': 12,
'D': 13,
'E': 14,
'F': 15,
'G': 16,
'H': 17,
'J': 18,
'K': 19,
'M': 20,
'N': 21,
'P': 22,
'Q': 23,
'R': 24,
'S': 25,
'T': 26,
'V': 27,
'W': 28,
'X': 29,
'Y': 30,
'Z': 31,
}
base32 = ('0123456789' + string.ascii_uppercase)[:32]
q = q.replace('-','')
q = ''.join([base32[_32map[i.upper()]] for i in q])
return int(q, 32)
def to36(q):
"""
Converts an integer to base 36 (a useful scheme for human-sayable IDs
like 'fuck' (739172), 'shit' (1329077) or 'hitler' (1059538851)).
>>> to36(35)
'z'
>>> to36(119292)
'2k1o'
>>> int(to36(939387374), 36)
939387374
>>> to36(0)
'0'
>>> to36(-393)
Traceback (most recent call last):
...
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError("must supply a positive integer")
letters = "0123456789abcdefghijklmnopqrstuvwxyz"
converted = []
while q != 0:
q, r = divmod(q, 36)
converted.insert(0, letters[r])
return "".join(converted) or '0'
def from36(q):
return int(q, 36)
def int_value(strValue, default=''):
"""
>>> int_value('abc23')
'23'
>>> int_value(' abc23')
'23'
>>> int_value('ab')
''
"""
try:
val = re.compile('(\d+)').findall(str(strValue).strip())[0]
except:
val = default
return val
def float_value(strValue, default=''):
"""
>>> float_value('abc23.4')
'23.4'
>>> float_value(' abc23.4')
'23.4'
>>> float_value('ab')
''
"""
try:
val = re.compile('([\d.]+)').findall(str(strValue).strip())[0]
except:
val = default
return val
def format_number(number, longName, shortName):
"""
Return the number in a human-readable format (23 KB, 23.4 MB, 23.42 GB)
>>> format_number(123, 'Byte', 'B')
'123 Bytes'
>>> format_number(1234, 'Byte', 'B')
'1 KB'
>>> format_number(1234567, 'Byte', 'B')
'1.2 MB'
>>> format_number(1234567890, 'Byte', 'B')
'1.15 GB'
>>> format_number(1234567890123456789, 'Byte', 'B')
'1,096.5166 PB'
>>> format_number(-1234567890123456789, 'Byte', 'B')
'-1,096.5166 PB'
"""
if abs(number) < 1024:
return '%s %s%s' % (format_thousands(number), longName, number != 1 and 's' or '')
prefix = ['K', 'M', 'G', 'T', 'P']
for i in range(5):
if abs(number) < math.pow(1024, i + 2) or i == 4:
n = number / math.pow(1024, i + 1)
return '%s %s%s' % (format_thousands('%.*f' % (i, n)), prefix[i], shortName)
def format_thousands(number, separator = ','):
"""
Return the number with separators (1,000,000)
>>> format_thousands(1)
'1'
>>> format_thousands(1000)
'1,000'
>>> format_thousands(1000000)
'1,000,000'
"""
string = str(number).split('.')
l = []
for i, character in enumerate(reversed(string[0])):
if i and (not (i % 3)):
l.insert(0, separator)
l.insert(0, character)
string[0] = ''.join(l)
return '.'.join(string)
def format_bits(number):
return format_number(number, 'bit', 'b')
def format_bytes(number):
return format_number(number, 'byte', 'B')
def format_pixels(number):
return format_number(number, 'pixel', 'px')
def format_currency(amount, currency="$"):
if amount:
temp = "%.2f" % amount
profile=re.compile(r"(\d)(\d\d\d[.,])")
while 1:
temp, count = re.subn(profile,r"\1,\2",temp)
if not count:
break
if temp.startswith('-'):
return "-"+ currency + temp[1:-3]
return currency + temp[:-3]
else:
return ""
def plural(amount, unit, plural='s'):
'''
>>> plural(1, 'unit')
'1 unit'
>>> plural(2, 'unit')
'2 units'
'''
if abs(amount) != 1:
if plural == 's':
unit = unit + plural
else: unit = plural
return "%s %s" % (format_thousands(amount), unit)
def format_duration(ms, verbosity=0, years=True, hours=True, milliseconds=True):
'''
verbosity
0: D:HH:MM:SS
1: Dd Hh Mm Ss
2: D days H hours M minutes S seconds
years
True: 366 days are 1 year 1 day
False: 366 days are 366 days
hours
True: 30 seconds are 00:00:30
False: 30 seconds are 00:30
milliseconds
True: always display milliseconds
False: never display milliseconds
>>> format_duration(1000 * 60 * 60 * 24 * 366)
'1:001:00:00:00.000'
>>> format_duration(1000 * 60 * 60 * 24 * 366, years=False)
'366:00:00:00.000'
>>> format_duration(1000 * 60 * 60 * 24 * 365 + 2003, verbosity=2)
'1 year 2 seconds 3 milliseconds'
>>> format_duration(1000 * 30, hours=False, milliseconds=False)
'00:30'
'''
if not ms and ms != 0:
return ''
if years:
y = int(ms / 31536000000)
d = int(ms % 31536000000 / 86400000)
else:
d = int(ms / 86400000)
h = int(ms % 86400000 / 3600000)
m = int(ms % 3600000 / 60000)
s = int(ms % 60000 / 1000)
ms = ms % 1000
if verbosity == 0:
if years and y:
duration = "%d:%03d:%02d:%02d:%02d" % (y, d, h, m, s)
elif d:
duration = "%d:%02d:%02d:%02d" % (d, h, m, s)
elif hours or h:
duration = "%02d:%02d:%02d" % (h, m, s)
else:
duration = "%02d:%02d" % (m, s)
if milliseconds:
duration += ".%03d" % ms
else:
if verbosity == 1:
durations = ["%sd" % d, "%sh" % h, "%sm" % m, "%ss" % s]
if years:
durations.insert(0, "%sy" % y)
if milliseconds:
durations.append("%sms" % ms)
else:
durations = [plural(d, 'day'), plural(h,'hour'),
plural(m, 'minute'), plural(s, 'second')]
if years:
durations.insert(0, plural(y, 'year'))
if milliseconds:
durations.append(plural(ms, 'millisecond'))
durations = filter(lambda x: not x.startswith('0'), durations)
duration = ' '.join(durations)
return duration
def ms2runtime(ms, shortenLong=False):
# deprecated - use format_duration
'''
>>> ms2runtime(5000)
'5 seconds'
>>> ms2runtime(500000)
'8 minutes 20 seconds'
>>> ms2runtime(50000000)
'13 hours 53 minutes 20 seconds'
>>> ms2runtime(50000000-20000)
'13 hours 53 minutes'
'''
if shortenLong and ms > 1000 * 60 * 60 * 24 * 464:
return format_duration(ms, verbosity=1, milliseconds=False)
return format_duration(ms, verbosity=2, milliseconds=False)
def ms2playtime(ms, hours=False):
# deprecated - use format_duration
'''
>>> ms2playtime(5000)
'00:05'
>>> ms2playtime(500000)
'08:20'
>>> ms2playtime(50000000)
'13:53:20'
'''
return format_duration(ms, hours=False, years=False, milliseconds=False)
def ms2time(ms):
# deprecated - use format_duration
'''
>>> ms2time(44592123)
'12:23:12.123'
'''
return format_duration(ms, years=False)
def time2ms(timeString):
'''
>>> time2ms('12:23:12.123')
44592123
'''
ms = 0.0
p = timeString.split(':')
for i in range(len(p)):
_p = p[i]
if _p.endswith('.'): _p =_p[:-1]
ms = ms * 60 + float(_p)
return int(ms * 1000)
def shift_time(offset, timeString):
newTime = time2ms(timeString) + offset
return ms2time(newTime)

File diff suppressed because it is too large

View file

@ -0,0 +1,405 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import sys
import re
import string
from six.moves.html_entities import name2codepoint
from six import unichr
# Configuration for add_links() function
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;', "'", '"']
# list of possible strings used for bullets in bulleted lists
DOTS = ['&middot;', '*', '\u2022', '&#149;', '&bull;', '&#8226;']
unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
if sys.version_info[0] == 2:
del x # Temporary variable
def escape(html):
'''
Returns the given HTML with ampersands, quotes and carets encoded
>>> escape('html "test" & <brothers>')
'html &quot;test&quot; &amp; &lt;brothers&gt;'
'''
if not isinstance(html, str):
html = str(html)
return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;')
def linebreaks(value):
'''
Converts newlines into <p> and <br />
'''
value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines
paras = re.split('\n{2,}', value)
paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras]
return '\n\n'.join(paras)
def strip_tags(value):
"""
Returns the given HTML with all tags stripped
>>> strip_tags('some <h2>title</h2> <script>asdfasdf</script>')
'some title asdfasdf'
"""
return re.sub(r'<[^>]*?>', '', value)
stripTags = strip_tags
def strip_spaces_between_tags(value):
"Returns the given HTML with spaces between tags normalized to a single space"
return re.sub(r'>\s+<', '> <', value)
def strip_entities(value):
"Returns the given HTML with all entities (&something;) stripped"
return re.sub(r'&(?:\w+|#\d);', '', value)
def fix_ampersands(value):
"Returns the given HTML with all unencoded ampersands encoded correctly"
return unencoded_ampersands_re.sub('&amp;', value)
def add_links(text, trim_url_limit=None, nofollow=False):
"""
Converts any URLs in text into clickable links. Works on http://, https:// and
www. links. Links can have trailing punctuation (periods, commas, close-parens)
and leading punctuation (opening parens) and it'll still do the right thing.
If trim_url_limit is not None, the URLs in link text will be limited to
trim_url_limit characters.
If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
"""
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
words = word_split_re.split(text)
nofollow_attr = nofollow and ' rel="nofollow"' or ''
for i, word in enumerate(words):
match = punctuation_re.match(word)
if match:
lead, middle, trail = match.groups()
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and \
len(middle) > 0 and middle[0] in string.ascii_letters + string.digits and \
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if middle.startswith('http://') or middle.startswith('https://'):
middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
if '@' in middle and not middle.startswith('www.') and not ':' in middle \
and simple_email_re.match(middle):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
if lead + middle + trail != word:
words[i] = lead + middle + trail
return ''.join(words)
urlize = add_links
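# Illustrative sketch, not part of the module: what add_links() does with a
# bare "www." domain, per the rules described in its docstring.
def _add_links_example():
# expected: 'see <a href="http://www.example.com">www.example.com</a>.'
return add_links('see www.example.com.')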
def clean_html(text):
"""
Cleans the given HTML. Specifically, it does the following:
* Converts <b> and <i> to <strong> and <em>.
* Encodes all ampersands correctly.
* Removes all "target" attributes from <a> tags.
* Removes extraneous HTML, such as presentational tags that open and
immediately close and <br clear="all">.
* Converts hard-coded bullets into HTML unordered lists.
* Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
bottom of the text.
"""
from .text import normalize_newlines
text = normalize_newlines(text)
text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)
text = fix_ampersands(text)
# Remove all target="" attributes from <a> tags.
text = link_target_attribute_re.sub('\\1', text)
# Trim stupid HTML such as <br clear="all">.
text = html_gunk_re.sub('', text)
# Convert hard-coded bullets into HTML unordered lists.
def replace_p_tags(match):
s = match.group().replace('</p>', '</li>')
for d in DOTS:
s = s.replace('<p>%s' % d, '<li>')
return '<ul>\n%s\n</ul>' % s
text = hard_coded_bullets_re.sub(replace_p_tags, text)
# Remove stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the bottom of the text.
text = trailing_empty_content_re.sub('', text)
return text
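# Illustrative sketch, not part of the module: a minimal clean_html() call,
# assuming ox.text.normalize_newlines leaves newline-free input unchanged.
def _clean_html_example():
# expected: '<strong>foo</strong> &amp; <em>bar</em>'
return clean_html('<b>foo</b> & <i>bar</i>')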
# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def decode_html(html):
"""
>>> decode_html('me &amp; you and &#36;&#38;%')
u'me & you and $&%'
>>> decode_html('&#x80;')
u'\u20ac'
>>> decode_html('Anniversary of Daoud&apos;s Republic')
u"Anniversary of Daoud's Republic"
"""
if isinstance(html, bytes):
html = html.decode('utf-8')
uchr = unichr
def entitydecode(match, uchr=uchr):
entity = match.group(1)
if entity == '#x80':
return u'€'
elif entity.startswith('#x'):
return uchr(int(entity[2:], 16))
elif entity.startswith('#'):
return uchr(int(entity[1:]))
elif entity in name2codepoint:
return uchr(name2codepoint[entity])
elif entity == 'apos':
return "'"
else:
return match.group(0)
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ')
def highlight(text, query, hlClass="hl"):
"""
>>> highlight('me &amp; you and &#36;&#38;%', 'and')
'me &amp; you <span class="hl">and</span> &#36;&#38;%'
"""
if query:
text = text.replace('<br />', '|')
query = re.escape(query).replace('\ ', '.')
m = re.compile("(%s)" % query, re.IGNORECASE).findall(text)
for i in m:
text = re.sub("(%s)" % re.escape(i).replace('\ ', '.'), '<span class="%s">\\1</span>' % hlClass, text)
text = text.replace('|', '<br />')
return text
def escape_html(value):
'''
>>> escape_html(u'<script> foo')
u'&lt;script&gt; foo'
>>> escape_html(u'&lt;script&gt; foo')
u'&lt;script&gt; foo'
'''
return escape(decode_html(value))
def sanitize_html(html, tags=None, global_attributes=[]):
'''
>>> sanitize_html('http://foo.com, bar')
u'<a href="http://foo.com">http://foo.com</a>, bar'
>>> sanitize_html('http://foo.com/foobar?foo, bar')
u'<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar'
>>> sanitize_html('(see: www.foo.com)')
u'(see: <a href="http://www.foo.com">www.foo.com</a>)'
>>> sanitize_html('foo@bar.com')
u'<a href="mailto:foo@bar.com">foo@bar.com</a>'
>>> sanitize_html(sanitize_html('foo@bar.com'))
u'<a href="mailto:foo@bar.com">foo@bar.com</a>'
>>> sanitize_html('<a href="http://foo.com" onmouseover="alert()">foo</a>')
u'<a href="http://foo.com">foo</a>'
>>> sanitize_html('<a href="javascript:alert()">foo</a>')
u'&lt;a href="javascript:alert()"&gt;foo'
>>> sanitize_html('[http://foo.com foo]')
u'<a href="http://foo.com">foo</a>'
>>> sanitize_html('<div style="direction: rtl">foo</div>')
u'<div style="direction: rtl">foo</div>'
>>> sanitize_html('<script>alert()</script>')
u'&lt;script&gt;alert()&lt;/script&gt;'
>>> sanitize_html("'foo' < 'bar' && \"foo\" > \"bar\"")
u'\'foo\' &lt; \'bar\' &amp;&amp; "foo" &gt; "bar"'
>>> sanitize_html('<b>foo')
u'<b>foo</b>'
>>> sanitize_html('<b>foo</b></b>')
u'<b>foo</b>'
>>> sanitize_html('Anniversary of Daoud&apos;s Republic')
u"Anniversary of Daoud's Republic"
'''
if not tags:
valid_url = '^((https?:\/\/|\/|mailto:).*?)'
tags = [
# inline formatting
{'name': 'b'},
{'name': 'bdi'},
{'name': 'code'},
{'name': 'em'},
{'name': 'i'},
{'name': 'q'},
{'name': 's'},
{'name': 'span'},
{'name': 'strong'},
{'name': 'sub'},
{'name': 'sup'},
{'name': 'u'},
# block formatting
{'name': 'blockquote'},
{'name': 'cite'},
{
'name': 'div',
'optional': ['style'],
'validation': {
'style': '^direction: rtl$'
}
},
{'name': 'h1'},
{'name': 'h2'},
{'name': 'h3'},
{'name': 'h4'},
{'name': 'h5'},
{'name': 'h6'},
{'name': 'p'},
{'name': 'pre'},
# lists
{'name': 'li'},
{'name': 'ol'},
{'name': 'ul'},
# tables
{'name': 'table'},
{'name': 'tbody'},
{'name': 'td'},
{'name': 'tfoot'},
{'name': 'th'},
{'name': 'thead'},
{'name': 'tr'},
# other
{'name': '[]'},
{
'name': 'a',
'required': ['href'],
'validation': {
'href': valid_url
}
},
{'name': 'br'},
{
'name': 'iframe',
'optional': ['width', 'height'],
'required': ['src'],
'validation': {
'width': '^\d+$',
'height': '^\d+$',
'src': valid_url
}
},
{
'name': 'img',
'optional': ['width', 'height'],
'required': ['src'],
'validation': {
'width': '^\d+$',
'height': '^\d+$',
'src': valid_url
},
},
{'name': 'figure'},
{'name': 'figcaption'}
]
tag_re = re.compile('<(/)?([^\ /]+)(.*?)(/)?>')
attr_re = re.compile('([^=\ ]+)="([^"]+)"')
escaped = {}
level = 0
non_closing_tags = ['img', 'br']
required_attributes = {}
validation = {}
valid_attributes = {}
valid_tags = set([tag['name'] for tag in tags if tag['name'] != '[]'])
for tag in tags:
valid_attributes[tag['name']] = tag.get('required', []) \
+ tag.get('optional', []) \
+ global_attributes
required_attributes[tag['name']] = tag.get('required', [])
validation[tag['name']] = tag.get('validation', {})
if '[]' in validation:
html = re.sub(
re.compile('\[((https?:\/\/|\/).+?) (.+?)\]', re.IGNORECASE),
'<a href="\\1">\\3</a>', html);
parts = split_tags(html)
for i, part in enumerate(parts):
is_tag = i % 2
if is_tag:
t = tag_re.findall(part)
if not t:
parts[i] = escape_html(decode_html(part))
continue
closing, name, attributes, end = t[0]
closing = closing != ''
a = attr_re.findall(attributes)
attrs = dict(a)
if not closing and not name in non_closing_tags:
level += 1
if not attrs and attributes or name not in valid_tags:
valid = False
else:
valid = True
for key in set(attrs) - set(valid_attributes[name]):
del attrs[key]
for key in required_attributes[name]:
if not key in attrs:
valid = False
if valid:
for attr in attrs:
if attr in validation[name]:
if not re.compile(validation[name][attr]).findall(attrs[attr]):
valid = False
break
if valid and closing:
valid = not escaped.get(level)
else:
escaped[level] = not valid
if closing:
level -= 1
if valid:
parts[i] = '<%s%s%s>' % (
('/' if closing else ''),
name,
(' ' + ' '.join(['%s="%s"' % (key, attrs[key]) for key, value in a if key in attrs])
if not closing and attrs else '')
)
else:
parts[i] = escape_html(decode_html(part))
else:
parts[i] = escape_html(decode_html(part))
html = ''.join(parts)
html = html.replace('\n\n', '<br/><br/>')
html = add_links(html)
return sanitize_fragment(html)
def split_tags(string):
tags = []
def collect(match):
tags.append(match.group(0))
return '\0'
strings = re.sub('<[^<>]+>', collect, string).split('\0')
tags.append('')
return [item for sublist in zip(strings, tags) for item in sublist][:-1]
def sanitize_fragment(html):
'''
#html5lib reorders arguments, so not usable
import html5lib
return html5lib.parseFragment(html).toxml().decode('utf-8')
'''
if not html:
return u''
import lxml.html
body = lxml.html.document_fromstring(html).find('body')
html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
if html.startswith('<p>') and html.endswith('</p>'):
html = html[3:-4]
return html

View file

@ -0,0 +1,246 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import division
from hashlib import sha1
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
ZONE_INDEX = []
for pixel_index in range(64):
x, y = pixel_index % 8, int(pixel_index / 8)
ZONE_INDEX.append(int(x / 2) + int(y / 4) * 4)
def drawText(image, position, text, font_file, font_size, color):
draw = ImageDraw.Draw(image)
font = ImageFont.truetype(font_file, font_size, encoding='unic')
draw.text(position, text, fill=color, font=font)
return draw.textsize(text, font=font)
def getHSL(rgb):
rgb = [x / 255 for x in rgb]
maximum = max(rgb)
minimum = min(rgb)
hsl = [0.0, 0.0, 0.0]
hsl[2] = (maximum + minimum) / 2
if maximum == minimum:
hsl[0] = 0.0
hsl[1] = 0.0
else:
if maximum == rgb[0]:
hsl[0] = (60 * (rgb[1] - rgb[2]) / (maximum - minimum) + 360) % 360
elif maximum == rgb[1]:
hsl[0] = 60 * (rgb[2] - rgb[0]) / (maximum - minimum) + 120
else:
hsl[0] = 60 * (rgb[0] - rgb[1]) / (maximum - minimum) + 240
if hsl[2] <= 0.5:
hsl[1] = (maximum - minimum) / (2 * hsl[2])
else:
hsl[1] = (maximum - minimum) / (2 - 2 * hsl[2])
return tuple(hsl)
def getImageHash(image_file, mode):
image = Image.open(image_file).convert('RGB').resize((8, 8), Image.ANTIALIAS)
image_hash = 0
if mode == 'color':
# divide the image into 8 zones:
# 0 0 1 1 2 2 3 3
# 0 0 1 1 2 2 3 3
# 0 0 1 1 2 2 3 3
# 0 0 1 1 2 2 3 3
# 4 4 5 5 6 6 7 7
# 4 4 5 5 6 6 7 7
# 4 4 5 5 6 6 7 7
# 4 4 5 5 6 6 7 7
image_data = image.getdata()
zone_values = [[] for i in range(8)]
for pixel_index, pixel_value in enumerate(image_data):
zone_values[ZONE_INDEX[pixel_index]].append(pixel_value)
for zone_index, pixel_values in enumerate(zone_values):
# get the mean for each color channel
mean = [int(round(sum(x) / 8)) for x in zip(*pixel_values)]
# store the mean color of each zone as an 8-bit value:
# RRRGGGBB
color_index = sum((
int(mean[0] / 32) << 5,
int(mean[1] / 32) << 2,
int(mean[2] / 64)
))
image_hash += color_index * pow(2, zone_index * 8)
elif mode == 'shape':
# pixels brighter than the mean register as 1,
# pixels equal to or darker than the mean as 0
image_data = image.convert('L').getdata()
image_mean = sum(image_data) / 64
for pixel_index, pixel_value in enumerate(image_data):
if pixel_value > image_mean:
image_hash += pow(2, pixel_index)
image_hash = hex(image_hash)[2:].upper()
if image_hash.endswith('L'):
image_hash = image_hash[:-1]
image_hash = '0' * (16 - len(image_hash)) + image_hash
return image_hash
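# Worked example of the RRRGGGBB packing used above (illustrative, not part
# of the module): a zone whose mean color is (200, 100, 50) is stored as
# (200//32)<<5 | (100//32)<<2 | 50//64 == 192 + 12 + 0 == 204.
def _color_index_example():
mean = (200, 100, 50)
return (int(mean[0] / 32) << 5) + (int(mean[1] / 32) << 2) + int(mean[2] / 64)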
def getImageHeat(image_file):
image = Image.open(image_file).convert('RGB').resize((16, 16), Image.ANTIALIAS)
pixel = image.load()
image_heat = 0
for y in range(image.size[1]):
for x in range(image.size[0]):
pixel_heat = []
for y_ in range(max(y - 1, 0), min(y + 2, image.size[1])):
for x_ in range(max(x - 1, 0), min(x + 2, image.size[0])):
if x != x_ or y != y_:
for c in range(3):
pixel_heat.append(abs(pixel[x, y][c] - pixel[x_, y_][c]))
image_heat += sum(pixel_heat) / len(pixel_heat)
return image_heat / 256
def getImageHSL(image_file):
image = Image.open(image_file).convert('RGB').resize((1, 1), Image.ANTIALIAS)
return getHSL(image.getpixel((0, 0)))
def getRGB(hsl):
hsl = list(hsl)
hsl[0] /= 360
rgb = [0, 0, 0]
if hsl[1] == 0:
rgb = [hsl[2], hsl[2], hsl[2]]
else:
if hsl[2] < 1/2:
v2 = hsl[2] * (1 + hsl[1])
else:
v2 = hsl[1] + hsl[2] - (hsl[1] * hsl[2])
v1 = 2 * hsl[2] - v2
for i in range(3):
v3 = hsl[0] + (1 - i) * 1/3;
if v3 < 0:
v3 += 1
elif v3 > 1:
v3 -= 1
if v3 < 1/6:
rgb[i] = v1 + ((v2 - v1) * 6 * v3)
elif v3 < 1/2:
rgb[i] = v2
elif v3 < 2/3:
rgb[i] = v1 + ((v2 - v1) * 6 * (2/3 - v3))
else:
rgb[i] = v1
return tuple(map(lambda x: int(x * 255), rgb))
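# Illustrative sketch, not part of the module: getHSL() and getRGB() are
# inverses for pure red, (255, 0, 0) <-> (0.0, 1.0, 0.5).
def _hsl_rgb_example():
hsl = getHSL((255, 0, 0)) # (0.0, 1.0, 0.5)
return getRGB(hsl) # (255, 0, 0)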
def getTextSize(image, text, font_file, font_size):
draw = ImageDraw.Draw(image)
font = ImageFont.truetype(font_file, font_size, encoding='unic')
return draw.textsize(text, font=font)
def wrapText(text, max_width, max_lines, font_file, font_size):
# wraps text to max_width and max_lines
def get_min_width():
# returns the width of the longest non-hyphenated word
min_width = 0
for word in words:
width = get_width(word)
if width <= max_width and width > min_width:
min_width = width
return min_width
def get_width(string):
return draw.textsize(string, font=font)[0]
image = Image.new('RGB', (1, 1))
draw = ImageDraw.Draw(image)
font = ImageFont.truetype(font_file, font_size, encoding='unic')
ellipsis = '…'
separators = ['-', '+', '/', ':']
if get_width(text) <= max_width:
# text fits in one line
lines = [text]
else:
lines = ['']
words = []
spaces = []
test_words = text.split(' ')
for word in test_words:
if get_width(word) <= max_width:
# word fits in one line
words.append(word)
spaces.append(' ')
else:
# word does not fit in one line
position = 0
test_word = word
for separator in separators:
test_word = test_word.replace(separator, ' ')
parts = test_word.split(' ')
for i, part in enumerate(parts):
words.append(part)
if i < len(parts) - 1:
position += len(part) + 1
spaces.append(word[position - 1])
else:
spaces.append(' ')
if max_lines:
# test if the same number of lines can be achieved with shorter
# lines, without hyphenating words that are not yet hyphenated
best_lines = len(wrapText(text, max_width, 0, font_file, font_size))
test_lines = best_lines
min_width = get_min_width()
while test_lines == best_lines and max_width >= min_width:
max_width -= 1
test_lines = len(wrapText(text, max_width, 0, font_file, font_size))
max_width += 1
for i, word in enumerate(words):
line = len(lines) - 1
word_width = get_width(word)
if word_width <= max_width:
# word fits in one line
test = (lines[line] + word + spaces[i]).strip()
if get_width(test) <= max_width:
# word fits in current line
lines[line] = test + (' ' if spaces[i] == ' ' else '')
elif max_lines == 0 or line < max_lines - 1:
# word fits in next line
lines.append(word + spaces[i])
else:
# word does not fit in last line
test = lines[line].strip() + ellipsis
if get_width(test) <= max_width:
# ellipsis fits in last line
lines[line] = test
else:
# ellipsis does not fit in last line
test_words = lines[line].split(' ')
while get_width(test) > max_width:
test_words.pop()
test = ' '.join(test_words) + ellipsis
if test == ellipsis:
# ellipsis does not fit after first word of last line
test = lines[line][:-1] + ellipsis
while get_width(test) > max_width:
test = test[:-2] + ellipsis
lines[line] = test
break
else:
# word does not fit in one line
chars = list(word)
for char in chars:
line = len(lines) - 1
test = (lines[line] + char + '-').strip()
if get_width(test) <= max_width:
# char fits in current line
lines[line] = test[:-1]
elif max_lines == 0 or line < max_lines - 1:
# char fits in next line
if test[-3] == ' ':
lines[line] = test[:-3]
else:
lines[line] = test[:-2] + '-'
lines.append(char)
else:
# char does not fit in last line
test = lines[line] + char + ellipsis
while get_width(test) > max_width:
test = test[:-2] + ellipsis
lines[line] = test
lines[line] += ' '
return lines

View file

@ -0,0 +1,246 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
_iso639_languages = [
("Unknown", "", "", "und"),
("Afar", "", "aa", "aar"),
("Abkhazian", "", "ab", "abk"),
("Afrikaans", "", "af", "afr"),
("Akan", "", "ak", "aka"),
("Albanian", "", "sq", "sqi"),
("Amharic", "", "am", "amh"),
("Arabic", "", "ar", "ara"),
("Aragonese", "", "an", "arg"),
("Armenian", "", "hy", "hye"),
("Assamese", "", "as", "asm"),
("Avaric", "", "av", "ava"),
("Avestan", "", "ae", "ave"),
("Aymara", "", "ay", "aym"),
("Azerbaijani", "", "az", "aze"),
("Bashkir", "", "ba", "bak"),
("Bambara", "", "bm", "bam"),
("Basque", "", "eu", "eus"),
("Belarusian", "", "be", "bel"),
("Bengali", "", "bn", "ben"),
("Bihari", "", "bh", "bih"),
("Bislama", "", "bi", "bis"),
("Bosnian", "", "bs", "bos"),
("Breton", "", "br", "bre"),
("Bulgarian", "", "bg", "bul"),
("Burmese", "", "my", "mya"),
("Catalan", "", "ca", "cat"),
("Chamorro", "", "ch", "cha"),
("Chechen", "", "ce", "che"),
("Chinese", "", "zh", "zho"),
("Church Slavic", "", "cu", "chu"),
("Chuvash", "", "cv", "chv"),
("Cornish", "", "kw", "cor"),
("Corsican", "", "co", "cos"),
("Cree", "", "cr", "cre"),
("Czech", "", "cs", "ces"),
("Danish", "Dansk", "da", "dan"),
("Divehi", "", "dv", "div"),
("Dutch", "Nederlands", "nl", "nld"),
("Dzongkha", "", "dz", "dzo"),
("English", "English", "en", "eng"),
("Esperanto", "", "eo", "epo"),
("Estonian", "", "et", "est"),
("Ewe", "", "ee", "ewe"),
("Faroese", "", "fo", "fao"),
("Fijian", "", "fj", "fij"),
("Finnish", "Suomi", "fi", "fin"),
("French", "Francais", "fr", "fra"),
("Western Frisian", "", "fy", "fry"),
("Fulah", "", "ff", "ful"),
("Georgian", "", "ka", "kat"),
("German", "Deutsch", "de", "deu"),
("Gaelic (Scots)", "", "gd", "gla"),
("Irish", "", "ga", "gle"),
("Galician", "", "gl", "glg"),
("Manx", "", "gv", "glv"),
("Greek, Modern", "", "el", "ell"),
("Guarani", "", "gn", "grn"),
("Gujarati", "", "gu", "guj"),
("Haitian", "", "ht", "hat"),
("Hausa", "", "ha", "hau"),
("Hebrew", "", "he", "heb"),
("Herero", "", "hz", "her"),
("Hindi", "", "hi", "hin"),
("Hiri Motu", "", "ho", "hmo"),
("Hungarian", "Magyar", "hu", "hun"),
("Igbo", "", "ig", "ibo"),
("Icelandic", "Islenska", "is", "isl"),
("Ido", "", "io", "ido"),
("Sichuan Yi", "", "ii", "iii"),
("Inuktitut", "", "iu", "iku"),
("Interlingue", "", "ie", "ile"),
("Interlingua", "", "ia", "ina"),
("Indonesian", "", "id", "ind"),
("Inupiaq", "", "ik", "ipk"),
("Italian", "Italiano", "it", "ita"),
("Javanese", "", "jv", "jav"),
("Japanese", "", "ja", "jpn"),
("Kalaallisut (Greenlandic)", "", "kl", "kal"),
("Kannada", "", "kn", "kan"),
("Kashmiri", "", "ks", "kas"),
("Kanuri", "", "kr", "kau"),
("Kazakh", "", "kk", "kaz"),
("Central Khmer", "", "km", "khm"),
("Kikuyu", "", "ki", "kik"),
("Kinyarwanda", "", "rw", "kin"),
("Kirghiz", "", "ky", "kir"),
("Komi", "", "kv", "kom"),
("Kongo", "", "kg", "kon"),
("Korean", "", "ko", "kor"),
("Kuanyama", "", "kj", "kua"),
("Kurdish", "", "ku", "kur"),
("Lao", "", "lo", "lao"),
("Latin", "", "la", "lat"),
("Latvian", "", "lv", "lav"),
("Limburgan", "", "li", "lim"),
("Lingala", "", "ln", "lin"),
("Lithuanian", "", "lt", "lit"),
("Luxembourgish", "", "lb", "ltz"),
("Luba-Katanga", "", "lu", "lub"),
("Ganda", "", "lg", "lug"),
("Macedonian", "", "mk", "mkd"),
("Marshallese", "", "mh", "mah"),
("Malayalam", "", "ml", "mal"),
("Maori", "", "mi", "mri"),
("Marathi", "", "mr", "mar"),
("Malay", "", "ms", "msa"),
("Malagasy", "", "mg", "mlg"),
("Maltese", "", "mt", "mlt"),
("Moldavian", "", "mo", "mol"),
("Mongolian", "", "mn", "mon"),
("Nauru", "", "na", "nau"),
("Navajo", "", "nv", "nav"),
("Ndebele, South", "", "nr", "nbl"),
("Ndebele, North", "", "nd", "nde"),
("Ndonga", "", "ng", "ndo"),
("Nepali", "", "ne", "nep"),
("Norwegian Nynorsk", "", "nn", "nno"),
("Norwegian Bokmål", "", "nb", "nob"),
("Norwegian", "Norsk", "no", "nor"),
("Chichewa; Nyanja", "", "ny", "nya"),
("Occitan (post 1500); Provençal", "", "oc", "oci"),
("Ojibwa", "", "oj", "oji"),
("Oriya", "", "or", "ori"),
("Oromo", "", "om", "orm"),
("Ossetian; Ossetic", "", "os", "oss"),
("Panjabi", "", "pa", "pan"),
("Persian", "", "fa", "fas"),
("Pali", "", "pi", "pli"),
("Polish", "", "pl", "pol"),
("Portuguese", "Portugues", "pt", "por"),
("Pushto", "", "ps", "pus"),
("Quechua", "", "qu", "que"),
("Romansh", "", "rm", "roh"),
("Romanian", "", "ro", "ron"),
("Rundi", "", "rn", "run"),
("Russian", "", "ru", "rus"),
("Sango", "", "sg", "sag"),
("Sanskrit", "", "sa", "san"),
("Serbian", "", "sr", "srp"),
("Croatian", "Hrvatski", "hr", "hrv"),
("Sinhala", "", "si", "sin"),
("Slovak", "", "sk", "slk"),
("Slovenian", "", "sl", "slv"),
("Northern Sami", "", "se", "sme"),
("Samoan", "", "sm", "smo"),
("Shona", "", "sn", "sna"),
("Sindhi", "", "sd", "snd"),
("Somali", "", "so", "som"),
("Sotho, Southern", "", "st", "sot"),
("Spanish", "Espanol", "es", "spa"),
("Sardinian", "", "sc", "srd"),
("Swati", "", "ss", "ssw"),
("Sundanese", "", "su", "sun"),
("Swahili", "", "sw", "swa"),
("Swedish", "Svenska", "sv", "swe"),
("Tahitian", "", "ty", "tah"),
("Tamil", "", "ta", "tam"),
("Tatar", "", "tt", "tat"),
("Telugu", "", "te", "tel"),
("Tajik", "", "tg", "tgk"),
("Tagalog", "", "tl", "tgl"),
("Thai", "", "th", "tha"),
("Tibetan", "", "bo", "bod"),
("Tigrinya", "", "ti", "tir"),
("Tonga (Tonga Islands)", "", "to", "ton"),
("Tswana", "", "tn", "tsn"),
("Tsonga", "", "ts", "tso"),
("Turkmen", "", "tk", "tuk"),
("Turkish", "", "tr", "tur"),
("Twi", "", "tw", "twi"),
("Uighur", "", "ug", "uig"),
("Ukrainian", "", "uk", "ukr"),
("Urdu", "", "ur", "urd"),
("Uzbek", "", "uz", "uzb"),
("Venda", "", "ve", "ven"),
("Vietnamese", "", "vi", "vie"),
("Volapük", "", "vo", "vol"),
("Welsh", "", "cy", "cym"),
("Walloon", "", "wa", "wln"),
("Wolof", "", "wo", "wol"),
("Xhosa", "", "xh", "xho"),
("Yiddish", "", "yi", "yid"),
("Yoruba", "", "yo", "yor"),
("Zhuang", "", "za", "zha"),
("Zulu", "", "zu", "zul"),
]
def codeToLang(code):
if code:
code = code.lower()
if len(code) == 2:
for l in _iso639_languages:
if l[2] == code:
return l[0]
elif len(code) == 3:
for l in _iso639_languages:
if l[3] == code:
return l[0]
return None
def langTo3Code(lang):
if lang:
lang = langEnglishName(lang)
if lang:
lang=lang.lower()
for l in _iso639_languages:
if l[0].lower() == lang:
return l[3]
return None
def langTo2Code(lang):
if lang:
lang = langEnglishName(lang)
if lang:
lang=lang.lower()
for l in _iso639_languages:
if l[0].lower() == lang:
return l[2]
return None
def langCode2To3(code):
return langTo3Code(codeToLang(code))
def langCode3To2(code):
return langTo2Code(codeToLang(code))
def langEnglishName(lang):
lang = lang.lower()
for l in _iso639_languages:
if l[1].lower() == lang or l[0].lower() == lang:
return l[0]
return None
def languages2Letter():
languages = []
for l in _iso639_languages:
if l[2]:
languages.append(l[2])
return languages
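# Illustrative usage sketch, not part of the module: a few lookups against
# the table above.
if __name__ == '__main__':
print(codeToLang('de')) # German
print(langTo3Code('French')) # fra
print(langCode2To3('pt')) # por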

View file

@ -0,0 +1,183 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox.utils import json
def minify(source, comment=''):
# see https://github.com/douglascrockford/JSMin/blob/master/README
def get_next_non_whitespace_token():
pass
tokens = tokenize(source)
length = len(tokens)
minified = '/*' + comment + '*/' if comment else ''
for i, token in enumerate(tokens):
if token['type'] in ['linebreak', 'whitespace']:
prevToken = None if i == 0 else tokens[i - 1]
next = i + 1
while next < length and tokens[next]['type'] in ['comment', 'linebreak', 'whitespace']:
next += 1
nextToken = None if next == length else tokens[next]
if token['type'] == 'linebreak':
# replace a linebreak between two tokens that are identifiers or
# numbers or strings or unary operators or grouping operators
# with a single newline, otherwise remove it
if prevToken and nextToken\
and (prevToken['type'] in ['identifier', 'number', 'string']\
or prevToken['value'] in ['++', '--', ')', ']', '}'])\
and (nextToken['type'] in ['identifier', 'number', 'string']\
or nextToken['value'] in ['+', '-', '++', '--', '~', '!', '(', '[', '{']):
minified += '\n'
elif token['type'] == 'whitespace':
# replace whitespace between two tokens that are identifiers or
# numbers, or between a token that ends with "+" or "-" and one that
# begins with "+" or "-", with a single space, otherwise remove it
if prevToken and nextToken\
and ((prevToken['type'] in ['identifier', 'number']\
and nextToken['type'] in ['identifier', 'number'])
or (prevToken['value'] in ['+', '-', '++', '--']
and nextToken['value'] in ['+', '-', '++', '--'])):
minified += ' '
elif token['type'] != 'comment':
# remove comments and leave all other tokens untouched
minified += token['value']
return minified
def parse_JSONC(source):
return json.loads(minify(source))
def tokenize(source):
# see https://github.com/mozilla/narcissus/blob/master/lib/jslex.js
IDENTIFIER = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ$_'
KEYWORD = [
'break',
'case', 'catch', 'class', 'const', 'continue',
'debugger', 'default', 'delete', 'do',
'else', 'enum', 'export', 'extends',
'finally', 'for', 'function',
'if', 'implements', 'import', 'in', 'instanceof', 'interface',
'let', 'module',
'new',
'package', 'private', 'protected', 'public',
'return',
'super', 'switch', 'static',
'this', 'throw', 'try', 'typeof',
'var', 'void',
'yield',
'while', 'with'
]
LINEBREAK = '\n\r'
NUMBER = '01234567890'
OPERATOR = [
# arithmetic
'+', '-', '*', '/', '%', '++', '--',
# assignment
'=', '+=', '-=', '*=', '/=', '%=',
'&=', '|=', '^=', '<<=', '>>=', '>>>=',
# bitwise
'&', '|', '^', '~', '<<', '>>', '>>>',
# comparison
'==', '!=', '===', '!==', '>', '>=', '<', '<=',
# conditional
'?', ':',
# grouping
'(', ')', '[', ']', '{', '}',
# logical
'&&', '||', '!',
# other
'.', ',', ';'
]
REGEXP = 'abcdefghijklmnopqrstuvwxyz'
STRING = '\'"'
WHITESPACE = ' \t'
def is_regexp():
# checks if a forward slash is the beginning of a regexp,
# as opposed to the beginning of an operator
i = len(tokens) - 1
# scan back to the previous significant token,
# or to the beginning of the source
while i >= 0 and tokens[i]['type'] in ['comment', 'linebreak', 'whitespace']:
i -= 1
if i == -1:
# source begins with forward slash
is_regexp = True
else:
token = tokens[i]
is_regexp = (
token['type'] == 'identifier' and token['value'] in KEYWORD
) or (
token['type'] == 'operator' and not token['value'] in ['++', '--', ')', ']', '}']
)
return is_regexp
column = 1
cursor = 0
length = len(source)
tokens = []
line = 1
while cursor < length:
char = source[cursor]
start = cursor
cursor += 1
if char == '/' and cursor < length - 1 and source[cursor] in '/*':
type = 'comment'
cursor += 1
while cursor < length:
cursor += 1
if source[start + 1] == '/' and source[cursor] == '\n':
break
elif source[start + 1] == '*' and source[cursor:cursor + 2] == '*/':
cursor += 2
break
elif char in IDENTIFIER:
type = 'identifier'
while cursor < length and source[cursor] in IDENTIFIER + NUMBER:
cursor += 1
elif char in LINEBREAK:
type = 'linebreak'
while cursor < length and source[cursor] in LINEBREAK:
cursor += 1
elif char in NUMBER:
type = 'number'
while cursor < length and source[cursor] in NUMBER + '.':
cursor += 1
elif char == '/' and is_regexp():
type = 'regexp'
while cursor < length and source[cursor] != '/':
cursor += (2 if source[cursor] == '\\' else 1)
cursor += 1
while cursor < length and source[cursor] in REGEXP:
cursor += 1
elif char in OPERATOR:
type = 'operator'
if cursor < length:
string = char + source[cursor]
while cursor < length and string in OPERATOR:
cursor += 1
string += source[cursor]
elif char in STRING:
type = 'string'
while cursor < length and source[cursor] != source[start]:
cursor += (2 if source[cursor] == '\\' else 1)
cursor += 1
elif char in WHITESPACE:
type = 'whitespace'
while cursor < length and source[cursor] in WHITESPACE:
cursor += 1
value = source[start:cursor]
tokens.append({
'column': column,
'line': line,
'type': type,
'value': value
})
if type == 'comment':
lines = value.split('\n');
column = len(lines[-1])
line += len(lines) - 1
elif type == 'linebreak':
column = 1
line += len(value)
else:
column += len(value)
return tokens
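# Illustrative sketch, not part of the module: minify() drops comments and
# collapses whitespace per the rules above (run as part of the ox package,
# since the module imports ox.utils).
if __name__ == '__main__':
print(minify('a = 1 + 2; // set a\nb = 3;')) # a=1+2;b=3;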

View file

@ -0,0 +1,21 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement, print_function
from .js import minify
from .utils import json
def load(f):
return loads(f.read())
def loads(source):
try:
minified = minify(source)
return json.loads(minified)
except json.JSONDecodeError as e:
s = minified.split('\n')
context = s[e.lineno-1][max(0, e.colno-1):e.colno+30]
msg = e.msg + ' at ' + context
raise json.JSONDecodeError(msg, minified, e.pos)
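# Illustrative sketch, not part of the module: loads() accepts JSON with
# JavaScript comments (run as part of the ox package, the module uses
# relative imports).
if __name__ == '__main__':
data = loads('{"a": 1, // one\n"b": 2}')
print(data == {'a': 1, 'b': 2}) # True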

View file

@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import math
def center(lat_sw, lng_sw, lat_ne=None, lng_ne=None):
if not lat_ne and not lng_ne:
return min(lat_sw, lng_sw) + abs(lat_sw-lng_sw)/2
else:
return (center(lat_sw,lng_sw), center(lat_ne, lng_ne))
def area(lat_sw, lng_sw, lat_ne, lng_ne):
return (lat_ne - lat_sw) * (lng_ne - lng_sw)
def latlngspan2latlng(lat, lng, latSpan, lngSpan):
return dict(
lat_sw = lat - latSpan, lng_sw = lng - lngSpan,
lat_ne = lat + latSpan, lng_ne = lng + lngSpan
)
def parse_location_string(location_string):
l = location_string.split('+')
if len(l) == 1:
l = location_string.split(';')
l = [i.strip() for i in l]
l = [i for i in l if i]
return l
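# Illustrative sketch, not part of the module: '+' or ';' separated location
# strings are split into their parts.
def _parse_location_string_example():
# expected: ['Berlin', 'Germany']
return parse_location_string('Berlin+Germany')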

View file

@ -0,0 +1,486 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2012
from __future__ import division
import hashlib
import os
import re
import unicodedata
from .normalize import normalize_name
from .text import get_sort_name, find_re
from .file import EXTENSIONS
__all__ = ['parse_movie_path', 'create_movie_path', 'get_oxid']
LANGUAGES = ['en', 'fr', 'de', 'es', 'it']
'''
Naming scheme:
X/[Group, The; Lastname, Firstname/]The Title[ (YEAR[-[YEAR]])]/
The Title[ ([SXX][EYY[+ZZ|-ZZ]])[ Episode Title]][.Version][.Part XY[.Part Title][.en][.fr].xyz
'''
def format_path(data, directory_key='director'):
def format_underscores(string):
return re.sub('^\.|\.$|:|/|\?|<|>', '_', string)
director = data['directorSort'] or ['Unknown Director']
title = data['seriesTitle' if data['isEpisode'] else 'title'] or 'Untitled'
year = data['seriesYear' if data['isEpisode'] else 'year'] or None
parts = list(map(format_underscores, filter(lambda x: x != None, [
u'; '.join(director[:10]),
u'%s%s' % (title, u' (%s)' % year if year else ''),
u'%s%s%s%s%s%s' % (
data['title'] or 'Untitled',
u'.%s' % data['version'] if data['version'] else '',
u'.Part %s' % data['part'] if data['part'] else '',
u'.%s' % data['partTitle'] if data['partTitle'] else '',
u'.%s' % data['language'] if data['language'] else '',
u'.%s' % data['extension'] if data['extension'] else ''
)
])))
if data.get('subdirectory'):
parts.insert(-1, data['subdirectory'])
return unicodedata.normalize('NFD', u'/'.join(parts))
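# Illustrative sketch, not part of the module: format_path() applied to a
# minimal, made-up metadata dict (only the keys it reads are included).
def _format_path_example():
# expected: 'Lynch, David/Blue Velvet (1986)/Blue Velvet.avi'
return format_path({
'directorSort': ['Lynch, David'], 'isEpisode': False,
'title': 'Blue Velvet', 'year': '1986',
'version': None, 'part': None, 'partTitle': None,
'language': None, 'extension': 'avi',
})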
def parse_item_files(files):
# parses a list of file objects associated with one item (file objects
# as returned by parse_path, but extended with 'path' and 'time')
# and returns a list of version objects (in case of english-only subtitles,
# version[i]['files'][j]['normalizedPath'] will be modified)
def get_file_key(file):
return '\n'.join([
file['version'] or '',
file['part'] or '',
file['language'] or '',
file['extension'] or ''
])
def get_version_key(file, extension=True):
return '%s/%s-part/%s' % (
file['version'] or '',
'single' if file['part'] == None else 'multi',
file['extension'] if extension else ''
)
# filter out duplicate files (keep shortest path, sorted alphabetically)
# since same version+part+language+extension can still differ in part title,
# ''/'en' or 'mpg'/'mpeg', or have an unparsed section in their path
unique_files = []
duplicate_files = []
for key in [get_file_key(file) for file in files]:
key_files = sorted(
sorted([file for file in files if get_file_key(file) == key]),
key=lambda x: len(x['path'])
)
unique_files.append(key_files[0])
duplicate_files += key_files[1:]
# determine versions ('version.single|multi-part.videoextension')
version_files = {}
time = {}
video_files = [file for file in unique_files if file['type'] == 'video']
versions = set([file['version'] for file in video_files])
for version in versions:
for file in [file for file in video_files if file['version'] == version]:
version_key = get_version_key(file)
version_files[version_key] = (version_files[version_key] if version_key in version_files else []) + [file]
time[version_key] = sorted([time[version_key], file['time']])[-1] if version_key in time else file['time']
# determine preferred video extension (newest)
extension = {}
for key in set(['/'.join(version_key.split('/')[:-1]) + '/' for version_key in version_files]):
extensions = set([version_key.split('/')[-1] for version_key in version_files if version_key.startswith(key)])
extension[key] = sorted(extensions, key=lambda x: time[key + x])[-1]
# associate other (non-video) files
other_files = [file for file in unique_files if file['type'] != 'video']
versions = set([file['version'] for file in other_files])
for version in versions:
for file in [file for file in other_files if file['version'] == version]:
key = get_version_key(file, extension=False)
if key in extension:
version_files[key + extension[key]].append(file)
else:
version_files[key] = (version_files[key] if key in version_files else []) + [file]
extension[key] = ''
# determine main files (video + srt)
full = {}
language = {}
main_files = {}
for version_key in version_files:
parts = sorted(list(set([file['part'] for file in version_files[version_key]])))
# determine if all parts have one video file
video_files = [file for file in version_files[version_key] if file['type'] == 'video']
full[version_key] = len(video_files) == len(parts)
main_files[version_key] = video_files if full[version_key] else []
# determine preferred subtitle language
language[version_key] = None
subtitle_files = [file for file in version_files[version_key] if file['extension'] == 'srt']
for subtitle_language in sorted(
list(set([file['language'] for file in subtitle_files])),
key=lambda x: LANGUAGES.index(x) if x in LANGUAGES else x
):
language_files = [file for file in subtitle_files if file['language'] == subtitle_language]
if len(subtitle_files) == len(parts):
language[version_key] = subtitle_language
main_files[version_key] += language_files
break
# determine main version (best srt language, then video time)
main_version = None
full_version_keys = sorted(
[version_key for version_key in version_files if full[version_key]],
key=lambda x: time[x],
reverse=True
)
if full_version_keys:
language_version_keys = sorted(
[version_key for version_key in full_version_keys if language[version_key]],
key=lambda x: LANGUAGES.index(language[x]) if language[x] in LANGUAGES else language[x]
)
main_version = language_version_keys[0] if language_version_keys else full_version_keys[0]
# add duplicate files
for file in duplicate_files:
key = get_version_key(file, extension=False)
version_key = '%s%s' % (key, extension[key] if key in extension else '')
version_files[version_key] = (version_files[version_key] if version_key in version_files else []) + [file]
# remove unneeded '.en'
for version_key in version_files:
for extension in EXTENSIONS['subtitle']:
subtitle_files = [file for file in version_files[version_key] if file['extension'] == extension]
subtitle_languages = list(set([file['language'] for file in subtitle_files]))
if len(subtitle_languages) == 1 and subtitle_languages[0] == LANGUAGES[0]:
for subtitle_file in subtitle_files:
subtitle_file['normalizedPath'] = format_path(dict(subtitle_file, **{'language': None}))
# return data
data = []
for version_key in version_files:
data.append({
'files': sorted(
[dict(file, isMainFile=file in main_files[version_key]) for file in version_files[version_key]],
key=lambda x: x['path']
),
'isFullVersion': full[version_key],
'isMainVersion': version_key == main_version,
'subtitleLanguage': language[version_key] if version_key in language else None,
'version': version_key
})
return data
def parse_path(path, directory_key='director'):
'''
# all keys
>>> parse_path('Frost, Mark; Lynch, David/Twin Peaks (1991)/Twin Peaks (S01E01) Pilot.European Version.Part 1.Welcome to Twin Peaks.en.fr.MPEG')['normalizedPath']
'Frost, Mark; Lynch, David/Twin Peaks (1991)/Twin Peaks (S01E00) Pilot.European Version.Part 1.Welcome to Twin Peaks.en.fr.mpg'
# pop directory title off file name
>>> parse_path("Unknown Director/www.xxx.com.._/www.xxx.com....Director's Cut.avi")['version']
"Director's Cut"
# handle dots
>>> parse_path("Unknown Director/Unknown Title (2000)/... Mr. .com....Director's Cut.srt")['version']
"Director's Cut"
# multiple years, season zero, multiple episodes, dots in episode title and part title
>>> parse_path('Groening, Matt/The Simpsons (1989-2012)/The Simpsons (S00E01-02) D.I.Y..Uncensored Version.Part 1.D.I.Y..de.avi')['normalizedPath']
'Groening, Matt/The Simpsons (1989-2012)/The Simpsons (S01E01+02) D.I.Y..Uncensored Version.Part 1.D.I.Y..de.avi'
# handle underscores
>>> parse_path('Unknown Director/_com_ 1_0 _ NaN.._/_com_ 1_0 _ NaN....avi')['title']
'.com: 1/0 / NaN...'
# TODO: '.com.avi'
'''
def parse_title(string):
return title, year
def parse_type(string):
for type in EXTENSIONS:
if string in EXTENSIONS[type]:
return type
return None
def parse_underscores(string):
# '^_' or '_$' is '.'
string = re.sub('^_', '.', string)
string = re.sub('_$', '.', string)
# '_.foo$' or '_ (' is '?'
string = re.sub('_(?=(\.\w+$| \())', '?', string)
# ' _..._ ' is '<...>'
string = re.sub('(?<= )_(.+)_(?= )', '<\g<1>>', string)
# 'foo_bar' or 'foo _ bar' is '/'
string = re.sub('(?<=\w)_(?=\w)', '/', string)
string = re.sub(' _ ', ' / ', string)
# 'foo_ ' is ':'
string = re.sub('(?<=\w)_ ', ': ', string)
return string
data = {}
parts = [parse_underscores(x.strip()) for x in path.split('/')]
# subdirectory
if len(parts) > 4:
data['subdirectory'] = '/'.join(parts[3:-1])
parts = parts[:3] + parts[-1:]
else:
data['subdirectory'] = None
length = len(parts)
director, title, file = [
parts[-3] if length > 2 else None,
parts[-2] if length > 1 else None,
parts[-1]
]
# directorSort, director
data['directorSort'] = data['director'] = []
if director:
data['directorSort'] = [
x for x in director.split('; ')
if x != 'Unknown Director'
]
data['director'] = [
' '.join(reversed(x.split(', ')))
for x in data['directorSort']
]
# title, year
data['title'] = data['year'] = None
if title:
match = re.search(' \(\d{4}(-(\d{4})?)?\)$', title)
data['title'] = title[:-len(match.group(0))] if match else title
data['year'] = match.group(0)[2:-1] if match else None
file_title = re.sub('[/:]', '_', data['title'])
# (remove title from beginning of filename if the rest contains a dot)
file = re.sub('^' + re.escape(file_title) + '(?=.*\.)', '', file)
# (split by nospace+dot+word, but remove spaces preceding extension)
parts = re.split('(?<!\s)\.(?=\w)', re.sub('\s+(?=.\w+$)', '', file))
title, parts, extension = [
parts[0],
parts[1:-1],
parts[-1] if len(parts) > 1 else None
]
if not data['title'] and title:
data['title'] = title
# season, episode, episodes, episodeTitle
data['season'] = data['episode'] = data['episodeTitle'] = None
data['episodes'] = []
match = re.search(' \((S\d{2})?(E\d{2}([+-]\d{2})?)?\)(.+)?', title)
if match:
if match.group(1):
data['season'] = int(match.group(1)[1:])
if match.group(2):
if len(match.group(2)) == 3:
data['episode'] = int(match.group(2)[1:])
else:
data['episodes'] = range(int(match.group(2)[1:3]), int(match.group(2)[-2:]) + 1)
if match.group(4):
data['episodeTitle'] = match.group(4)[1:]
while data['episodeTitle'] and len(parts) and re.search('^\w+\.*$', parts[0]) and not re.search('^[a-z]{2}$', parts[0]):
data['episodeTitle'] += '.%s' % parts.pop(0)
# isEpisode, seriesTitle, seriesYear
data['isEpisode'] = False
data['seriesTitle'] = data['seriesYear'] = None
if data['season'] != None or data['episode'] != None or data['episodes']:
data['isEpisode'] = True
data['seriesTitle'] = data['title']
season = 'S%02d' % data['season'] if data['season'] != None else ''
episode = ''
if data['episode'] != None:
episode = 'E%02d' % data['episode']
elif data['episodes']:
episode = 'E%02d%s%02d' % (
data['episodes'][0], '+' if len(data['episodes']) == 2 else '-', data['episodes'][-1]
)
episodeTitle = ' %s' % data['episodeTitle'] if data['episodeTitle'] else ''
data['title'] += ' (%s%s)%s' % (season, episode, episodeTitle)
data['seriesYear'] = data['year']
data['year'] = None
# version
data['version'] = parts.pop(0) if len(parts) and re.search('^[A-Z0-9]', parts[0]) and not re.search('^Part .', parts[0]) else None
# part
data['part'] = parts.pop(0)[5:] if len(parts) and re.search('^Part .', parts[0]) else None
# partTitle
data['partTitle'] = parts.pop(0) if len(parts) and re.search('^[A-Z0-9]', parts[0]) and data['part'] else None
while data['partTitle'] and len(parts) and not re.search('^[a-z]{2}$', parts[0]):
data['partTitle'] += '.%s' % parts.pop(0)
# language
data['language'] = parts.pop(0) if len(parts) and re.search('^[a-z]{2}$', parts[0]) else None
# extension
data['extension'] = re.sub('^mpeg$', 'mpg', extension.lower()) if extension else None
# type
data['type'] = parse_type(data['extension'])
# normalizedPath
data['normalizedPath'] = format_path(data)
return data
def parse_movie_path(path):
"""
"A/Abrams, J.J.; Lieber, Jeffrey; Lindelof, Damon/Lost (2004)/Lost.Season 3.Episode 21.Greatest Hits.avi"
"B/Balada, Ivan/Metrum (1967)/Metrum.Part 1.en.srt"
"N/Nakata, Hideo/L - Change the World (2008)/L - Change the World.Part 2.srt"
"R/Reitz, Edgar/Heimat (1984-2006)/Heimat.Season 2.Episode 8.The Wedding.Part 2.avi"
"F/Feuillade, Louis/Les vampires (1915)/Les vampires.Episode 10.Part 2.avi"
title: 'Les vampires', year: '1915', episode: 10, part: 2
"G/Godard, Jean-Luc/Histoire(s) du cinema_ Toutes les histoires (1988)/Histoire(s) du cinema_ Toutes les histoires.avi"
"G/Godard, Jean-Luc/Six fois deux (1976)/Six fois deux.Part 1A.Y a personne.avi"
"G/Godard, Jean-Luc; Miéville, Anne-Marie/France_tour_detour_deux_enfants (1977)/France_tour_detour_deux_enfants.Part 5.Impression_Dictée.avi"
"L/Labarthe, André S_/Cinéastes de notre temps (1964-)/Cinéastes de notre temps.Episode.Jean Renoir le patron, première partie_ La Recherche du relatif.avi"
"S/Scott, Ridley/Blade Runner (1982)/Blade Runner.Directors's Cut.avi"
or
T/Title (Year)/Title.avi
"""
episodeTitle = episodeYear = seriesTitle = None
episodeDirector = []
parts = path.split('/')
#title/year
if len(parts) == 4:
title = parts[2]
elif len(parts) > 1:
title = parts[1]
else:
title = parts[0]
title = title.replace('_ ', ': ')
if title.endswith('_'):
title = title[:-1] + '.'
if title.startswith('_'):
title = '.' + title[1:]
year = find_re(title, '(\(\d{4}\))')
if not year:
year = find_re(title, '(\(\d{4}-\d*\))')
if year and title.endswith(year):
title = title[:-len(year)].strip()
year = year[1:-1]
if '-' in year:
year = find_re(year, '\d{4}')
#director
if len(parts) == 4:
director = parts[1]
if director.endswith('_'):
director = "%s." % director[:-1]
director = director.split('; ')
director = [normalize_name(d).strip() for d in director]
director = [d for d in director if d not in ('Unknown Director', 'Various Directors')]
else:
director = []
#extension/language
fileparts = [x.replace('||', '. ') for x in parts[-1].replace('. ', '||').split('.')]
extension = len(fileparts) > 1 and fileparts[-1] or ''
if len(fileparts) > 1 and len(fileparts[-2]) == 2:
language = fileparts[-2]
else:
language = ''
#season/episode/episodeTitle
match = re.compile('(.+?) \((S(\d+))?(E(\d+))?\)( (.+?))?\.').match(parts[-1])
if match:
seriesTitle = match.group(1)
season = match.group(3)
episode = match.group(5)
episodeTitle = (match.group(6) or '').strip()
if episode != None:
episode = int(episode)
if season != None:
season = int(season)
if episode and not season:
season = 1
else:
season = find_re(parts[-1], '\.Season (\d+)\.')
if season:
season = int(season)
else:
season = None
episode = find_re(parts[-1], '\.Episode[s]* ([\d+]+)\.')
if episode:
episode = episode.split('+')[0]
episode = int(episode)
else:
episode = None
if episode and 'Episode %d'%episode in fileparts:
episodeTitle = fileparts.index('Episode %d' % episode) + 1
episodeTitle = fileparts[episodeTitle]
if episodeTitle == extension or episodeTitle.startswith('Part'):
episodeTitle = None
if not season and 'Episode' in fileparts:
episodeTitle = fileparts.index('Episode') + 1
episodeTitle = fileparts[episodeTitle]
if episodeTitle == extension or episodeTitle.startswith('Part'):
episodeTitle = None
else:
season = 1
if season:
seriesTitle = title
title = u'%s (S%02d)' % (seriesTitle, season)
if isinstance(episode, int):
title = u'%s (S%02dE%02d)' % (seriesTitle, season, episode)
if episodeTitle:
title = u'%s %s' % (title, episodeTitle)
#part
part = find_re(parts[-1], '\.Part (\d+)\.')
if part:
part = int(part)
else:
part = 0
return {
'director': director,
'episodeDirector': episodeDirector,
'episode': episode,
'episodeTitle': episodeTitle,
'episodeYear': episodeYear,
'extension': extension,
'language': language,
'part': part,
'season': season,
'seriesTitle': seriesTitle,
'title': title,
'year': year,
}
def create_movie_path(title, director, year,
season, episode, episodeTitle, episodeDirector, episodeYear,
part, language, extension):
'''
{
title: '', director: [''], year: '',
season: int, episode: int, episodeTitle: '', episodeDirector: [''], episodeYear: '',
part: int, language: '', extension: '', extra: bool
})
'''
partTitle = None
director = '; '.join(map(get_sort_name, director))
episodeDirector = '; '.join(map(get_sort_name, episodeDirector))
filename = [title]
if season:
filename += ['Season %d' % season]
if episode:
filename += ['Episode %d' % episode]
if episodeTitle:
filename += [episodeTitle]
if part:
filename += ['Part %s' % part]
if partTitle:
filename += [partTitle]
if extension:
filename += [extension]
filename = '.'.join(filename)
path = os.path.join(director[0], director, '%s (%s)' % (title, year), filename)
return path
def get_oxid(title, director=[], year='',
season='', episode='', episode_title='', episode_director=[], episode_year=''):
def get_hash(string):
return hashlib.sha1(string.encode('utf-8')).hexdigest().upper()
director = ', '.join(director)
episode_director = ', '.join(episode_director)
if not season and not episode and not episode_title:
oxid = get_hash(director)[:8] + get_hash('\n'.join([title, str(year)]))[:8]
else:
oxid = get_hash('\n'.join([director, title, str(year), str(season)]))[:8] + \
get_hash('\n'.join([str(episode), episode_director, episode_title, str(episode_year)]))[:8]
return u'0x' + oxid

View file

@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import with_statement, print_function
import os
import gzip
import re
from six import BytesIO
import struct
from six.moves import urllib
from chardet.universaldetector import UniversalDetector
DEBUG = False
# Default headers for HTTP requests.
DEFAULT_HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-us,en;q=0.5',
'Accept-Encoding': 'gzip'
}
def status(url, data=None, headers=DEFAULT_HEADERS):
try:
f = open_url(url, data, headers)
s = f.code
except urllib.error.HTTPError as e:
s = e.code
return s
def exists(url, data=None, headers=DEFAULT_HEADERS):
s = status(url, data, headers)
if s >= 200 and s < 400:
return True
return False
def get_headers(url, data=None, headers=DEFAULT_HEADERS):
try:
f = open_url(url, data, headers)
f.headers['Status'] = "%s" % f.code
headers = f.headers
f.close()
except urllib.error.HTTPError as e:
e.headers['Status'] = "%s" % e.code
headers = e.headers
return dict(headers)
def open_url(url, data=None, headers=DEFAULT_HEADERS):
if isinstance(url, bytes):
url = url.decode('utf-8')
url = url.replace(' ', '%20')
req = urllib.request.Request(url, data, headers)
return urllib.request.urlopen(req)
def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
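# fetches url (optionally posting data); gzip responses are decompressed transparently,
# and with unicode=True the body is decoded using the charset header, detect_encoding(),
# or latin-1 as a last resort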
if DEBUG:
print('ox.net.read_url', url)
f = open_url(url, data, headers)
result = f.read()
f.close()
if f.headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
if unicode:
ctype = f.headers.get('content-type', '').lower()
if 'charset' in ctype:
encoding = ctype.split('charset=')[-1]
else:
encoding = detect_encoding(result)
if not encoding:
encoding = 'latin-1'
result = result.decode(encoding)
if return_headers:
f.headers['Status'] = "%s" % f.code
headers = {}
for key in f.headers:
headers[key.lower()] = f.headers[key]
return headers, result
return result
def detect_encoding(data):
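# prefer an explicit charset from an html meta tag; otherwise feed chardet's
# UniversalDetector in 1024-byte chunks until it is confident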
data_lower = data.lower().decode('utf-8', 'ignore')
charset = re.compile('content="text/html; charset=(.*?)"').findall(data_lower)
if not charset:
charset = re.compile('meta charset="(.*?)"').findall(data_lower)
if charset:
return charset[0].lower()
detector = UniversalDetector()
p = 0
l = len(data)
s = 1024
while p < l:
detector.feed(data[p:p+s])
if detector.done:
break
p += s
detector.close()
return detector.result['encoding']
def save_url(url, filename, overwrite=False):
if not os.path.exists(filename) or overwrite:
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.makedirs(dirname)
data = read_url(url)
f = open(filename, 'wb')
f.write(data)
f.close()
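# compute a 64-bit hash of a remote file (opensubtitles-style): file size plus the
# 64-bit integers of the first and last 64KB, truncated to 64 bit; the two chunks are
# fetched with http range requests so the whole file is never downloaded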
def oshash(url):
def get_size(url):
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
req.get_method = lambda : 'HEAD'
u = urllib.request.urlopen(req)
if u.code != 200 or not 'Content-Length' in u.headers:
raise IOError
return int(u.headers['Content-Length'])
def get_range(url, start, end):
headers = DEFAULT_HEADERS.copy()
headers['Range'] = 'bytes=%s-%s' % (start, end)
req = urllib.request.Request(url, headers=headers)
u = urllib.request.urlopen(req)
return u.read()
try:
longlongformat = 'q' # long long
bytesize = struct.calcsize(longlongformat)
filesize = get_size(url)
hash = filesize
head = get_range(url, 0, min(filesize, 65536))
if filesize > 65536:
tail = get_range(url, filesize-65536, filesize)
if filesize < 65536:
for offset in range(0, filesize, bytesize):
buffer = head[offset:offset+bytesize]
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #cut off 64bit overflow
else:
for offset in range(0, 65536, bytesize):
buffer = head[offset:offset+bytesize]
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF #cut off 64bit overflow
for offset in range(0, 65536, bytesize):
buffer = tail[offset:offset+bytesize]
(l_value,)= struct.unpack(longlongformat, buffer)
hash += l_value
hash = hash & 0xFFFFFFFFFFFFFFFF
returnedhash = "%016x" % hash
return returnedhash
except IOError:
return "IOError"

View file

@ -0,0 +1,201 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import re
import unicodedata
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
"l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
'an', 'una', 'las', 'eine', 'den', 'gli', 'het', 'os', 'lo',
'az', 'det', 'ha-', 'een', 'ang', 'oi', 'ta', 'al-', 'dem',
'mga', 'uno', "un'", 'ett', u'\xcf', 'eines', u'\xc7', 'els',
u'\xd4\xef', u'\xcf\xe9')
# Articles in a dictionary.
_articlesDict = dict([(x, x) for x in _articles])
_spArticles = []
for article in _articles:
if article[-1] not in ("'", '-'): article += ' '
_spArticles.append(article)
_noarticles = (
'los angeles',
'i am ',
'i be area',
'i call ',
'i come ',
'i confess',
'i hired ',
'i killed ',
'i know ',
'i live ',
'i love',
'i married',
'i never',
'i shot',
'i start',
'i was',
)
def canonical_title(title):
"""Return the title in the canonic format 'Movie Title, The'.
>>> canonical_title('The Movie Title')
'Movie Title, The'
>>> canonical_title('Los Angeles Plays Itself')
'Los Angeles Plays Itself'
"""
try:
if title.split(', ')[-1].lower() in _articlesDict: return title
except IndexError: pass
ltitle = title.lower()
for start in _noarticles:
if ltitle.startswith(start):
return title
for article in _spArticles:
if ltitle.startswith(article):
lart = len(article)
title = '%s, %s' % (title[lart:], title[:lart])
if article[-1] == ' ': title = title[:-1]
break
## XXX: an attempt using a dictionary lookup.
##for artSeparator in (' ', "'", '-'):
## article = _articlesDict.get(ltitle.split(artSeparator)[0])
## if article is not None:
## lart = len(article)
## # check titles like "una", "I'm Mad" and "L'abbacchio".
## if title[lart:] == '' or (artSeparator != ' ' and
## title[lart:][1] != artSeparator): continue
## title = '%s, %s' % (title[lart:], title[:lart])
## if artSeparator == ' ': title = title[1:]
## break
return title
def normalize_title(title):
"""Return the title in the normal "The Title" format.
>>> normalize_title('Movie Title, The')
'The Movie Title'
"""
stitle = title.split(', ')
if len(stitle) > 1 and stitle[-1].lower() in _articlesDict:
sep = ' '
if stitle[-1][-1] in ("'", '-'): sep = ''
title = '%s%s%s' % (stitle[-1], sep, ', '.join(stitle[:-1]))
return title
def normalize_imdbid(imdbId):
"""Return 7 digit imdbId.
>>> normalize_imdbid('http://www.imdb.com/title/tt0159206/')
'0159206'
>>> normalize_imdbid(159206)
'0159206'
>>> normalize_imdbid('tt0159206')
'0159206'
"""
if isinstance(imdbId, str):
imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
elif isinstance(imdbId, int):
imdbId = "%07d" % imdbId
return imdbId
# Common suffixes in surnames.
_sname_suffixes = (
'al', 'ben', 'da', 'de', 'del', 'den', 'der', 'des', 'di', 'dos', 'du',
'e', 'el', 'la', 'le', 'the', 'vom', 'von', 'van', 'y'
)
def canonical_name(name):
"""Return the given name in canonical "Surname, Name" format.
It assumes that name is in the 'Name Surname' format.
>>> canonical_name('Jean Luc Godard')
'Godard, Jean Luc'
>>> canonical_name('Ivan Ivanov-Vano')
'Ivanov-Vano, Ivan'
>>> canonical_name('Gus Van Sant')
'Van Sant, Gus'
>>> canonical_name('Brian De Palma')
'De Palma, Brian'
"""
# XXX: some statistics (over 1852406 names):
# - just a surname: 51921
# - single surname, single name: 1792759
# - composed surname, composed name: 7726
# - composed surname, single name: 55623
# (2: 49259, 3: 5502, 4: 551)
# - single surname, composed name: 186604
# (2: 178315, 3: 6573, 4: 1219, 5: 352)
# Don't convert names already in the canonical format.
if name in ('Unknown Director', ):
return name
if name.find(', ') != -1: return name
sname = name.split(' ')
snl = len(sname)
if snl == 2:
# Just a name and a surname: how boring...
name = '%s, %s' % (sname[1], sname[0])
elif snl > 2:
lsname = [x.lower() for x in sname]
if snl == 3: _indexes = (0, snl-2)
else: _indexes = (0, snl-2, snl-3)
# Check for common surname prefixes at the beginning and near the end.
for index in _indexes:
if lsname[index] not in _sname_suffixes: continue
try:
# Build the surname.
surn = '%s %s' % (sname[index], sname[index+1])
del sname[index]
del sname[index]
try:
# Handle the "Jr." after the name.
if lsname[index+2].startswith('jr'):
surn += ' %s' % sname[index]
del sname[index]
except (IndexError, ValueError):
pass
name = '%s, %s' % (surn, ' '.join(sname))
break
except ValueError:
continue
else:
name = '%s, %s' % (sname[-1], ' '.join(sname[:-1]))
return name
def normalize_name(name):
"""Return a name in the normal "Name Surname" format.
>>> normalize_name('Godard, Jean Luc')
'Jean Luc Godard'
>>> normalize_name('Ivanov-Vano, Ivan')
'Ivan Ivanov-Vano'
>>> normalize_name('Van Sant, Gus')
'Gus Van Sant'
>>> normalize_name('De Palma, Brian')
'Brian De Palma'
"""
sname = name.split(', ')
if len(sname) == 2:
name = '%s %s' % (sname[1], sname[0])
return name
def normalize_path(path):
path = path.replace(':', '_').replace('/', '_')
if path.endswith('.'): path = path[:-1] + '_'
return path
def strip_accents(s):
if isinstance(s, bytes):
s = s.decode('utf-8')
return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))

View file

@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# ci:si:et:sw=4:sts=4:ts=4
import re
from . import cache
from .text import find_re
from .utils import json, ET
def get_embed_code(url, maxwidth=None, maxheight=None):
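# oEmbed discovery: fetch the page, look for <link> tags advertising a json+oembed or
# xml+oembed endpoint, then query that endpoint (forwarding maxwidth/maxheight if given)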
embed = {}
header = cache.get_headers(url)
if header.get('content-type', '').startswith('text/html'):
html = cache.read_url(url)
links = re.compile('<link.*?>').findall(html)
json_oembed = [l for l in links if 'json+oembed' in l]
xml_oembed = [l for l in links if 'xml+oembed' in l]
if json_oembed:
oembed_url = find_re(json_oembed[0], 'href="(.*?)"')
if maxwidth:
oembed_url += '&maxwidth=%d' % maxwidth
if maxheight:
oembed_url += '&maxheight=%d' % maxheight
embed = json.loads(cache.read_url(oembed_url))
elif xml_oembed:
oembed_url = find_re(xml_oembed[0], 'href="(.*?)"')
if maxwidth:
oembed_url += '&maxwidth=%d' % maxwidth
if maxheight:
oembed_url += '&maxheight=%d' % maxheight
data = cache.read_url(oembed_url)
for e in ET.fromstring(data):
embed[e.tag] = e.text
return embed

View file

@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement, division, print_function
import chardet
import re
import codecs
import ox
__all__ = []
def _detect_encoding(fp):
bomDict={ # bytepattern : name
(0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
(0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
(0xFE, 0xFF, None, None): "utf_16_be",
(0xFF, 0xFE, None, None): "utf_16_le",
(0xEF, 0xBB, 0xBF, None): "utf_8",
}
# go to beginning of file and get the first 4 bytes
oldFP = fp.tell()
fp.seek(0)
(byte1, byte2, byte3, byte4) = tuple(bytearray(fp.read(4))) # works on bytes (python3) and str (python2)
# try bom detection using 4 bytes, 3 bytes, or 2 bytes
bomDetection = bomDict.get((byte1, byte2, byte3, byte4))
if not bomDetection:
bomDetection = bomDict.get((byte1, byte2, byte3, None))
if not bomDetection:
bomDetection = bomDict.get((byte1, byte2, None, None))
## if BOM detected, we're done :-)
fp.seek(oldFP)
if bomDetection:
return bomDetection
encoding = 'latin-1'
#more character detecting magick using http://chardet.feedparser.org/
fp.seek(0)
rawdata = fp.read()
#if data can be decoded as utf-8 use that, try chardet otherwise
#chardet detects utf-8 as ISO-8859-2 most of the time
try:
data = rawdata.decode('utf-8')
encoding = 'utf-8'
except:
encoding = chardet.detect(rawdata)['encoding']
fp.seek(oldFP)
return encoding
def load(filename, offset=0):
'''
filename path to an srt file
offset in seconds shift all in/out points by offset
returns list with objects that have in,out,value and id
'''
srt = []
def parse_time(t):
return offset + ox.time2ms(t.replace(',', '.')) / 1000
with open(filename, 'rb') as f:
encoding = _detect_encoding(f)
data = f.read()
try:
data = data.decode(encoding)
except:
try:
data = data.decode('latin-1')
except:
print("failed to detect encoding, giving up")
return srt
data = data.replace('\r\n', '\n')
srts = re.compile('(\d\d:\d\d:\d\d[,.]\d\d\d)\s*?-->\s*?(\d\d:\d\d:\d\d[,.]\d\d\d).*?\n(.*?)\n\n', re.DOTALL)
i = 0
for s in srts.findall(data):
_s = {'id': str(i),
'in': parse_time(s[0]),
'out': parse_time(s[1]),
'value': s[2].strip()
}
srt.append(_s)
i += 1
return srt
def encode(data):
'''
encodes list of objects with in,out,value into srt
result is utf-8 encoded bytestring
'''
srt = u''
i = 1
for s in data:
srt += '%d\r\n%s --> %s\r\n%s\r\n\r\n' % (
i,
ox.format_duration(s['in']*1000, years=False).replace('.', ','),
ox.format_duration(s['out']*1000, years=False).replace('.', ','),
s['value'].replace('\n', '\r\n').strip()
)
i += 1
return codecs.BOM_UTF8 + srt.encode('utf-8')

View file

@ -0,0 +1,593 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import math
import re
import unicodedata
from functools import reduce # used by wrap() below; not a builtin in python3
ARTICLES = list(set([
# def sg, def pl, indef sg, indef pl (each m/f/n)
'der', 'die', 'das', 'ein', 'eine', # de
'the', 'a', 'an', # en
'el', 'la', 'lo', 'los', 'las', 'un', 'una', 'unos', 'unas', # es
'le', "l'", 'la', 'les', 'un', 'une', 'des', # fr
'il', 'lo', "l'", 'la', '_i', 'gli', 'le', # it
'de', 'het', 'een', # nl
'o', 'a', 'os', '_as', 'um', 'uma', '_uns', 'umas' # pt
# some _disabled because of collisions
]))
# see http://en.wikipedia.org/wiki/List_of_common_Chinese_surnames
# and http://en.wikipedia.org/wiki/List_of_Korean_family_names
ASIAN_NAMES = [
'chan', 'chang', 'chao',
'chen', 'cheong', 'cheung',
'chong', 'choo',
'chu', 'chun',
'hou', 'hsieh', 'hsu', 'hu', 'huang',
'kuo',
'li', 'liang', 'lin', 'liu',
'_park',
'sun', 'sung',
'tsao',
'wang', 'Wong',
'yang', 'yeong', 'yeung'
]
PREFIXES = [
'al', 'bin', 'da', 'de', 'del', 'dem', 'den', 'der', 'di', 'dos', 'du',
'e', 'el', 'la', 'san', 'the', 'van', 'vom', 'von', 'y', 'zu'
]
MIDFIXES = ['und']
SUFFIXES = ['ii', 'iii', 'jr', 'jr.', 'ph.d.', 'phd', 'sr', 'sr.']
UA_ALIASES = {
'browser': {
'Chrome': '(CriOS|CrMo)',
'Firefox': '(Fennec|Firebird|Iceweasel|Minefield|Namoroka|Phoenix|SeaMonkey|Shiretoko)',
'Nokia Browser': '(OviBrowser)'
},
'robot': {},
'system': {
'BSD': '(FreeBSD|NetBSD|OpenBSD)',
'Linux': '(CrOS|MeeGo|webOS)',
'Unix': '(AIX|HP-UX|IRIX|SunOS)'
}
}
UA_NAMES = {
'browser': {
'chromeframe': 'Chrome Frame',
'FBForIPhone': 'WebKit',
'Gecko': 'Mozilla',
'IEMobile': 'Internet Explorer',
'konqueror': 'Konqueror',
'Mozilla': 'Netscape',
'MSIE': 'Internet Explorer',
'NokiaBrowser': 'Nokia Browser',
'Trident': 'Internet Explorer'
},
'robot': {},
'system': {
'BB': 'BlackBerry',
'CPU OS': 'iOS',
'iPhone': 'iOS',
'iPhone OS': 'iOS',
'J2ME/MIDP': 'Java',
'Mac_PowerPC': 'Mac OS',
'Mac_PPC': 'Mac OS',
'Macintosh': 'Mac OS',
'PLAYSTATION': 'PlayStation',
'S': 'Nokia',
'Series': 'Nokia',
'Win': 'Windows',
'Windows Phone OS': 'Windows Phone',
'X11': 'Linux'
}
}
UA_REGEXPS = {
'browser': [
'(Camino)\/(\d+)',
'(Chimera)\/(\d+)',
'(chromeframe)\/(\d+)',
'(Epiphany)\/(\d+)', # before Chrome, Chromium and Safari
'(Chromium)\/(\d+)', # before Chrome
'(Chrome)\/(\d+)',
'(FBForIPhone)',
'(Firefox)\/(\d+)',
'(Galeon)\/(\d+)',
'(IEMobile)\/(\d+)',
'(iCab) (\d+)',
'(iCab)\/(\d+)',
'(konqueror)\/(\d+)',
'(Konqueror)\/(\d+)',
'(Lynx)\/(\d+)',
'(Netscape)\d?\/(\d+)',
'(NokiaBrowser)\/(\d+)',
'(OmniWeb)\/(\d+)',
'(Opera)\/.+Version\/(\d+)',
'(OviBrowser)\/(\d+)',
'Version\/(\d+).+(Safari)',
'(WebKit)\/(\d+)',
'(MSIE) (\d\d?(?!\d))', # last, since Opera used to mask as MSIE
'(Trident)\/.*?rv:(\d+)',
'(Gecko)',
'(Mozilla)\/(3|4)'
],
'robot': [
'(BingPreview)\/(\d+)',
'(Google Web Preview).+Chrome\/(\d+)',
'(Googlebot)\/(\d+)',
'(WebCrawler)\/(\d+)',
'(Yahoo! Slurp)\/(\d+)'
],
'system': [
'(Android) (\d+)',
'(Android)',
'(BB)(\d+)',
'(BeOS)',
'(BlackBerry) (\d+)',
'(BlackBerry)',
'(Darwin)',
'(BSD) (FreeBSD|NetBSD|OpenBSD)',
'(CPU OS) (\d+)',
'(iPhone OS) (\d+)',
'(iPhone)', # Opera
'(J2ME\/MIDP)',
'(Linux).+(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS)',
'(CentOS|CrOS|Debian|Fedora|Gentoo|Mandriva|MeeGo|Mint|Red Hat|SUSE|Ubuntu|webOS).+(Linux)',
'(Linux)',
'(Mac OS X) (10.\d+)',
'(Mac OS X)',
'(Mac_PowerPC)',
'(Mac_PPC)',
'(Macintosh)',
'Nintendo (Wii).+NX\/(\d+)',
'(PLAYSTATION) (\d+)',
'(PlayStation) Vita (\d+)',
'(RIM Tablet OS) (\d+)',
'(S)(60);',
'(Series) ?(40|60)',
'(Symbian OS)',
'(SymbianOS)\/(\d+)',
'(SymbOS)',
'(OS\/2)',
'(Unix) (AIX|HP-UX|IRIX|SunOS)',
'(Unix)',
'(Windows) (NT \d\.\d)',
'(Windows Phone) (\d+)',
'(Windows Phone OS) (\d+)',
'(Windows) (3\.1|95|98|2000|2003|CE|ME|Mobile|NT|XP)', # Opera
'(Win) (9x 4\.90)', # Firefox
'(Win)(16)', # Firefox
'(Win)(9\d)', # Firefox
'(Win)(NT)', # Firefox
'(Win)(NT4\.0)', # Firefox
'(X11)'
]
}
UA_VERSIONS = {
'browser': {},
'robot': {},
'system': {
'10.0': '10.0 (Cheetah)',
'10.1': '10.1 (Puma)',
'10.2': '10.2 (Jaguar)',
'10.3': '10.3 (Panther)',
'10.4': '10.4 (Tiger)',
'10.5': '10.5 (Leopard)',
'10.6': '10.6 (Snow Leopard)',
'10.7': '10.7 (Lion)',
'10.8': '10.8 (Mountain Lion)',
'10.9': '10.9 (Mavericks)',
'10.10': '10.10 (Yosemite)',
'40': 'Series 40',
'60': 'Series 60',
'NT 3.1': 'NT 3.1 (3.1)',
'NT 3.5': 'NT 3.5 (NT)',
'NT 4.0': 'NT 4.0 (NT)',
'NT 4.1': 'NT 4.1 (98)',
'9x 4.90': 'NT 4.9 (ME)',
'NT 5.0': 'NT 5.0 (2000)',
'NT 5.1': 'NT 5.1 (XP)',
'NT 5.2': 'NT 5.2 (2003)',
'NT 6.0': 'NT 6.0 (Vista)',
'NT 6.1': 'NT 6.1 (7)',
'NT 6.2': 'NT 6.2 (8)',
'NT 6.3': 'NT 6.3 (8.1)',
'16': 'NT 3.1 (3.1)',
'3.1': 'NT 3.1 (3.1)',
'95': 'NT 4.0 (95)',
'NT': 'NT 4.0 (NT)',
'NT4.0': 'NT 4.0 (NT)',
'98': 'NT 4.1 (98)',
'ME': 'NT 4.9 (ME)',
'2000': 'NT 5.0 (2000)',
'XP': 'NT 5.1 (XP)',
'2003': 'NT 5.2 (2003)'
}
}
def get_sort_name(name):
"""
>>> get_sort_name('Alfred Hitchcock')
'Hitchcock, Alfred'
>>> get_sort_name('Jean-Luc Godard')
'Godard, Jean-Luc'
>>> get_sort_name('Rainer Werner Fassbinder')
'Fassbinder, Rainer Werner'
>>> get_sort_name('Brian De Palma')
'De Palma, Brian'
>>> get_sort_name('Johan van der Keuken')
'van der Keuken, Johan'
>>> get_sort_name('Edward D. Wood Jr.')
'Wood Jr., Edward D.'
>>> get_sort_name('Bing Wang')
'Wang Bing'
>>> get_sort_name('Frank Capra III')
'Capra III, Frank'
>>> get_sort_name('The Queen of England')
'Queen of England, The'
>>> get_sort_name('Sham 69')
'Sham 69'
>>> get_sort_name('Scorsese, Martin')
'Scorsese, Martin'
"""
if not ' ' in name or ', ' in name:
return name
if name.lower().startswith('the '):
return get_sort_title(name)
def add_name():
if len(first_names):
last_names.insert(0, first_names.pop())
def find_name(names):
return len(first_names) and first_names[-1].lower() in names
first_names = name.split(' ')
last_names = []
if re.search('^[0-9]+$', first_names[-1]):
add_name()
if find_name(SUFFIXES):
add_name()
add_name()
if find_name(MIDFIXES):
add_name()
add_name()
while find_name(PREFIXES):
add_name()
name = ' '.join(last_names)
if len(first_names):
separator = ' ' if last_names[0].lower() in ASIAN_NAMES else ', '
name += separator + ' '.join(first_names)
return name
def get_sort_title(title):
"""
>>> get_sort_title('Themroc')
'Themroc'
>>> get_sort_title('Die Hard')
'Hard, Die'
>>> get_sort_title("L'atalante")
"atalante, L'"
"""
for article in ARTICLES:
spaces = 0 if article.endswith("'") else 1
if title.lower().startswith(article + ' ' * spaces):
length = len(article)
return title[length + spaces:] + ', ' + title[:length]
return title
def find_re(string, regexp):
result = re.compile(regexp, re.DOTALL).findall(string)
if result:
return result[0].strip()
return ''
def find_string(string, string0='', string1 = ''):
"""Return the string between string0 and string1.
If string0 or string1 is left out, the beginning or end of the string is used.
>>> find_string('i am not there', string1=' not there')
'i am'
>>> find_string('i am not there', 'i am ', ' there')
'not'
>>> find_string('i am not there', 'i am not t')
'here'
"""
if string0:
string0 = re.escape(string0)
else:
string0 = '^'
if string1:
string1 = re.escape(string1)
else:
string1 = '$'
return find_re(string, string0 + '(.*?)' + string1)
def parse_useragent(useragent):
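# rewrite known aliases (UA_ALIASES) first, then try the UA_REGEXPS for browser, robot
# and system in order; the first match wins, name/version groups are swapped when the
# pattern captures them in reverse, and both are mapped through UA_NAMES / UA_VERSIONS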
data = {}
for key in UA_REGEXPS:
for alias, regexp in UA_ALIASES[key].items():
alias = alias if key == 'browser' else alias + ' \\1'
useragent = re.sub(regexp, alias, useragent)
for regexp in UA_REGEXPS[key]:
data[key] = {'name': '', 'version': '', 'string': ''}
match = re.compile(regexp).search(useragent)
if match:
matches = list(match.groups())
if len(matches) == 1:
matches.append('')
swap = re.match('^\d', matches[0]) or matches[1] == 'Linux'
name = matches[1 if swap else 0]
version = matches[0 if swap else 1].replace('_', '.')
name = UA_NAMES[key][name] if name in UA_NAMES[key] else name
version = UA_VERSIONS[key][version] if version in UA_VERSIONS[key] else version
string = name
if version:
string = string + ' ' + (
'(' + version + ')' if name in ['BSD', 'Linux', 'Unix'] else version
)
data[key] = {
'name': name,
'version': version,
'string': string
}
break
return data
def remove_special_characters(text):
"""
Removes special characters inserted by Word.
"""
text = text.replace(u'\u2013', '-')
text = text.replace(u'\u2026O', "'")
text = text.replace(u'\u2019', "'")
text = text.replace(u'\x91', "'") # cp1252 left single quote inserted by Word
text = text.replace(u'\x92', "'") # cp1252 right single quote
text = text.replace(u'\x96', "-") # cp1252 en dash
return text
def wrap(text, width):
"""
A word-wrap function that preserves existing line breaks and most spaces in
the text. Expects that existing line breaks are posix newlines (\n).
See http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
"""
return reduce(lambda line, word, width=width: '%s%s%s' %
(line,
' \n'[(len(line[line.rfind('\n')+1:])
+ len(word.split('\n',1)[0]
) >= width)],
word),
text.split(' ')
)
def wrap_string(string, length=80, separator='\n', balance=False):
'''
>>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16)
u"Anticonstitution\\nellement, Paris \\ns'eveille"
>>> wrap_string(u'All you can eat', 12, '\\n', True)
u'All you \\ncan eat'
'''
words = string.split(' ')
if balance:
# balance lines: test if same number of lines
# can be achieved with a shorter line length
lines = wrap_string(string, length, separator, False).split(separator)
if len(lines) > 1:
while length > max([len(x) for x in words]):
length -= 1
if len(wrap_string(string, length, separator, False).split(separator)) > len(lines):
length += 1
break
lines = ['']
for word in words:
if len(lines[len(lines) - 1] + word + u' ') <= length + 1:
# word fits in current line
lines[len(lines) - 1] += word + u' '
else:
if len(word) <= length:
# word fits in next line
lines.append(word + u' ')
else:
# word is longer than line
position = length - len(lines[len(lines) - 1])
lines[len(lines) - 1] += word[0:position]
for i in range(position, len(word), length):
lines.append(word[i:i+length])
lines[len(lines) - 1] += u' '
return separator.join(lines).strip()
def truncate_string(string, length, padding='...', position='right'):
# >>> truncate_string('anticonstitutionellement', 16, '...', 'left')
# '...utionellement'
# >>> truncate_string('anticonstitutionellement', 16, '...', 'center')
# 'anticon...lement'
# >>> truncate_string('anticonstitutionellement', 16, '...', 'right')
# 'anticonstitut...'
stringLength = len(string)
paddingLength = len(padding)
if stringLength > length:
if position == 'left':
string = '%s%s' % (padding, string[stringLength + paddingLength - length:])
elif position == 'center':
left = int(math.ceil(float(length - paddingLength) / 2))
right = int(stringLength - math.floor(float(length - paddingLength) / 2))
string = '%s%s%s' % (string[:left], padding, string[right:])
elif position == 'right':
string = '%s%s' % (string[:length - paddingLength], padding)
return string
def truncate_words(s, num):
"""Truncates a string after a certain number of chacters, but ends with a word
>>> truncate_words('Truncates a string after a certain number of chacters, but ends with a word', 23)
'Truncates a string...'
>>> truncate_words('Truncates a string', 23)
'Truncates a string'
"""
length = int(num)
if len(s) <= length:
return s
words = s.split()
ts = ""
while words and len(ts) + len(words[0]) < length:
ts += " " + words.pop(0)
if words:
ts += "..."
return ts.strip()
def trim_string(string, num):
"""Truncates a string after a certain number of chacters, adding ... at -10 characters
>>> trim_string('Truncates a string after a certain number of chacters', 23)
'Truncates ...f chacters'
>>> trim_string('Truncates a string', 23)
'Truncates a string'
"""
if len(string) > num:
string = string[:num - 13] + '...' + string[-10:]
return string
def get_valid_filename(s):
"""
Returns the given string converted to a string that can be used for a clean
filename. Specifically, leading and trailing spaces are removed;
all non-filename-safe characters are removed.
>>> get_valid_filename("john's portrait in 2004.jpg")
'john_s_portrait_in_2004.jpg'
"""
s = s.strip()
s = s.replace(' ', '_')
s = re.sub(r'[^-A-Za-z0-9_.\[\]\ ]', '_', s)
s = s.replace('__', '_').replace('__', '_')
return s
def get_text_list(list_, last_word='or'):
"""
>>> get_text_list([u'a', u'b', u'c', u'd'])
u'a, b, c or d'
>>> get_text_list([u'a', u'b', u'c'], 'and')
u'a, b and c'
>>> get_text_list([u'a', u'b'], 'and')
u'a and b'
>>> get_text_list([u'a'])
u'a'
>>> get_text_list([])
''
"""
if len(list_) == 0: return ''
if len(list_) == 1: return list_[0]
return u'%s %s %s' % (u', '.join(['%s' % i for i in list_][:-1]), last_word, list_[-1])
def get_list_text(text, last_word='or'):
"""
>>> get_list_text(u'a, b, c or d')
[u'a', u'b', u'c', u'd']
>>> get_list_text(u'a, b and c', u'and')
[u'a', u'b', u'c']
>>> get_list_text(u'a and b', u'and')
[u'a', u'b']
>>> get_list_text(u'a')
[u'a']
>>> get_list_text(u'')
[]
"""
list_ = []
if text:
list_ = text.split(u', ')
if list_:
i=len(list_)-1
last = list_[i].split(last_word)
if len(last) == 2:
list_[i] = last[0].strip()
list_.append(last[1].strip())
return list_
def normalize_newlines(text):
return re.sub(r'\r\n|\r|\n', '\n', text)
def recapitalize(text):
"Recapitalizes text, placing caps after end-of-sentence punctuation."
#capwords = ()
text = text.lower()
capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
text = capsRE.sub(lambda x: x.group(1).upper(), text)
#for capword in capwords:
# capwordRE = re.compile(r'\b%s\b' % capword, re.I)
# text = capwordRE.sub(capword, text)
return text
def phone2numeric(phone):
"Converts a phone number with letters into its numeric equivalent."
letters = re.compile(r'[A-PR-Y]', re.I)
char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
'y': '9', 'x': '9'}.get(m.group(0).lower())
return letters.sub(char2number, phone)
def compress_string(s):
import gzip
from io import BytesIO
zbuf = BytesIO()
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
zfile.write(s)
zfile.close()
return zbuf.getvalue()
smart_split_re = re.compile('("(?:[^"\\\\]*(?:\\\\.[^"\\\\]*)*)"|\'(?:[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*)\'|[^\\s]+)')
def smart_split(text):
"""
Generator that splits a string by spaces, leaving quoted phrases together.
Supports both single and double quotes, and supports escaping quotes with
backslashes. In the output, strings will keep their initial and trailing
quote marks.
>>> list(smart_split('This is "a person\\'s" test.'))
['This', 'is', '"a person\\'s"', 'test.']
"""
for bit in smart_split_re.finditer(text):
bit = bit.group(0)
if bit[0] == '"':
yield '"' + bit[1:-1].replace('\\"', '"').replace('\\\\', '\\') + '"'
elif bit[0] == "'":
yield "'" + bit[1:-1].replace("\\'", "'").replace("\\\\", "\\") + "'"
else:
yield bit
def words(text):
"""
returns words in text, removing punctuation
"""
text = text.split()
return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text]
def sort_string(string):
string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th')
#pad numbered titles
string = re.sub('(\d),(\d{3})', '\\1\\2', string)
string = re.sub('(\d+)', lambda x: '%010d' % int(x.group(0)), string)
return unicodedata.normalize('NFKD', string)
def sorted_strings(strings, key=None):
if not key:
key = lambda k: sort_string(k)
return sorted(strings, key=key)

View file

@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2007-2012
from threading import Event
from hashlib import sha1
import os
from .bencode import bencode, bdecode
__all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size']
def create_torrent(file, url, params = {}, flag = Event(),
progress = lambda x: None, progress_percent = 1):
"Creates a torrent for a given file, using url as tracker url"
from .makemetafile import make_meta_file
return make_meta_file(file, url, params, flag, progress, progress_percent)
def get_info_hash(torrentFile):
"Returns Torrent Info Hash from torrent file"
metainfo_file = open(torrentFile, 'rb')
metainfo = bdecode(metainfo_file.read())
info = metainfo['info']
return sha1(bencode(info)).hexdigest()
def get_torrent_info(data=None, file=None):
"Returns Torrent Info from torrent file"
if file:
if not isinstance(file, bytes):
file = file.encode('utf-8')
with open(file, 'rb') as f:
data = f.read()
tinfo = {}
metainfo = bdecode(data)
info = metainfo['info']
piece_length = info['piece length']
if 'length' in info:
# let's assume we just have one file
file_length = info['length']
else:
# let's assume we have a directory structure
file_length = 0
for f in info['files']:
file_length += f['length']
for key in info:
if key != 'pieces':
tinfo[key] = info[key]
for key in metainfo:
if key != 'info':
tinfo[key] = metainfo[key]
tinfo['size'] = file_length
tinfo['hash'] = sha1(bencode(info)).hexdigest()
tinfo['announce'] = metainfo['announce']
if file:
tinfo['timestamp'] = os.stat(file).st_ctime
return tinfo
def get_files(data):
files = []
info = get_torrent_info(data=data)
if 'files' in info:
for f in info['files']:
path = [info['name'], ]
path.extend(f['path'])
files.append(os.path.join(*path))
else:
files.append(info['name'])
return files
def get_torrent_size(file):
"Returns Size of files in torrent file in bytes"
return get_torrent_info(file=file)['size']

View file

@ -0,0 +1,320 @@
# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman
# see LICENSE.txt for license information
from types import IntType, LongType, StringType, ListType, TupleType, DictType
try:
from types import BooleanType
except ImportError:
BooleanType = None
try:
from types import UnicodeType
except ImportError:
UnicodeType = None
from cStringIO import StringIO
def decode_int(x, f):
f += 1
newf = x.index('e', f)
try:
n = int(x[f:newf])
except:
n = long(x[f:newf])
if x[f] == '-':
if x[f + 1] == '0':
raise ValueError
elif x[f] == '0' and newf != f+1:
raise ValueError
return (n, newf+1)
def decode_string(x, f):
colon = x.index(':', f)
try:
n = int(x[f:colon])
except (OverflowError, ValueError):
n = long(x[f:colon])
if x[f] == '0' and colon != f+1:
raise ValueError
colon += 1
return (x[colon:colon+n], colon+n)
def decode_unicode(x, f):
s, f = decode_string(x, f+1)
return (s.decode('UTF-8'),f)
def decode_list(x, f):
r, f = [], f+1
while x[f] != 'e':
v, f = decode_func[x[f]](x, f)
r.append(v)
return (r, f + 1)
def decode_dict(x, f):
r, f = {}, f+1
lastkey = None
while x[f] != 'e':
k, f = decode_string(x, f)
#why is this needed
#if lastkey >= k:
# raise ValueError
lastkey = k
r[k], f = decode_func[x[f]](x, f)
return (r, f + 1)
decode_func = {}
decode_func['l'] = decode_list
decode_func['d'] = decode_dict
decode_func['i'] = decode_int
decode_func['0'] = decode_string
decode_func['1'] = decode_string
decode_func['2'] = decode_string
decode_func['3'] = decode_string
decode_func['4'] = decode_string
decode_func['5'] = decode_string
decode_func['6'] = decode_string
decode_func['7'] = decode_string
decode_func['8'] = decode_string
decode_func['9'] = decode_string
#decode_func['u'] = decode_unicode
def bdecode(x, sloppy = 1):
try:
r, l = decode_func[x[0]](x, 0)
# except (IndexError, KeyError):
except (IndexError, KeyError, ValueError):
raise ValueError, "bad bencoded data"
if not sloppy and l != len(x):
raise ValueError, "bad bencoded data"
return r
def test_bdecode():
try:
bdecode('0:0:')
assert 0
except ValueError:
pass
try:
bdecode('ie')
assert 0
except ValueError:
pass
try:
bdecode('i341foo382e')
assert 0
except ValueError:
pass
assert bdecode('i4e') == 4L
assert bdecode('i0e') == 0L
assert bdecode('i123456789e') == 123456789L
assert bdecode('i-10e') == -10L
try:
bdecode('i-0e')
assert 0
except ValueError:
pass
try:
bdecode('i123')
assert 0
except ValueError:
pass
try:
bdecode('')
assert 0
except ValueError:
pass
try:
bdecode('i6easd')
assert 0
except ValueError:
pass
try:
bdecode('35208734823ljdahflajhdf')
assert 0
except ValueError:
pass
try:
bdecode('2:abfdjslhfld')
assert 0
except ValueError:
pass
assert bdecode('0:') == ''
assert bdecode('3:abc') == 'abc'
assert bdecode('10:1234567890') == '1234567890'
try:
bdecode('02:xy')
assert 0
except ValueError:
pass
try:
bdecode('l')
assert 0
except ValueError:
pass
assert bdecode('le') == []
try:
bdecode('leanfdldjfh')
assert 0
except ValueError:
pass
assert bdecode('l0:0:0:e') == ['', '', '']
try:
bdecode('relwjhrlewjh')
assert 0
except ValueError:
pass
assert bdecode('li1ei2ei3ee') == [1, 2, 3]
assert bdecode('l3:asd2:xye') == ['asd', 'xy']
assert bdecode('ll5:Alice3:Bobeli2ei3eee') == [['Alice', 'Bob'], [2, 3]]
try:
bdecode('d')
assert 0
except ValueError:
pass
try:
bdecode('defoobar')
assert 0
except ValueError:
pass
assert bdecode('de') == {}
assert bdecode('d3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': 'blue'}
assert bdecode('d8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': 'Alice', 'length': 100000}}
try:
bdecode('d3:fooe')
assert 0
except ValueError:
pass
try:
bdecode('di1e0:e')
assert 0
except ValueError:
pass
try:
bdecode('d1:b0:1:a0:e')
assert 0
except ValueError:
pass
try:
bdecode('d1:a0:1:a0:e')
assert 0
except ValueError:
pass
try:
bdecode('i03e')
assert 0
except ValueError:
pass
try:
bdecode('l01:ae')
assert 0
except ValueError:
pass
try:
bdecode('9999:x')
assert 0
except ValueError:
pass
try:
bdecode('l0:')
assert 0
except ValueError:
pass
try:
bdecode('d0:0:')
assert 0
except ValueError:
pass
try:
bdecode('d0:')
assert 0
except ValueError:
pass
bencached_marker = []
class Bencached:
def __init__(self, s):
self.marker = bencached_marker
self.bencoded = s
BencachedType = type(Bencached('')) # insufficient, but good as a filter
def encode_bencached(x,r):
assert x.marker == bencached_marker
r.append(x.bencoded)
def encode_int(x,r):
r.extend(('i',str(x),'e'))
def encode_bool(x,r):
encode_int(int(x),r)
def encode_string(x,r):
r.extend((str(len(x)),':',x))
def encode_unicode(x,r):
#r.append('u')
encode_string(x.encode('UTF-8'),r)
def encode_list(x,r):
r.append('l')
for e in x:
encode_func[type(e)](e, r)
r.append('e')
def encode_dict(x,r):
r.append('d')
ilist = x.items()
ilist.sort()
for k,v in ilist:
r.extend((str(len(k)),':',k))
encode_func[type(v)](v, r)
r.append('e')
encode_func = {}
encode_func[BencachedType] = encode_bencached
encode_func[IntType] = encode_int
encode_func[LongType] = encode_int
encode_func[StringType] = encode_string
encode_func[ListType] = encode_list
encode_func[TupleType] = encode_list
encode_func[DictType] = encode_dict
if BooleanType:
encode_func[BooleanType] = encode_bool
if UnicodeType:
encode_func[UnicodeType] = encode_unicode
def bencode(x):
r = []
try:
encode_func[type(x)](x, r)
except:
print "*** error *** could not encode type %s (value: %s)" % (type(x), x)
assert 0
return ''.join(r)
def test_bencode():
assert bencode(4) == 'i4e'
assert bencode(0) == 'i0e'
assert bencode(-10) == 'i-10e'
assert bencode(12345678901234567890L) == 'i12345678901234567890e'
assert bencode('') == '0:'
assert bencode('abc') == '3:abc'
assert bencode('1234567890') == '10:1234567890'
assert bencode([]) == 'le'
assert bencode([1, 2, 3]) == 'li1ei2ei3ee'
assert bencode([['Alice', 'Bob'], [2, 3]]) == 'll5:Alice3:Bobeli2ei3eee'
assert bencode({}) == 'de'
assert bencode({'age': 25, 'eyes': 'blue'}) == 'd3:agei25e4:eyes4:bluee'
assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == 'd8:spam.mp3d6:author5:Alice6:lengthi100000eee'
try:
bencode({1: 'foo'})
assert 0
except AssertionError:
pass
try:
import psyco
psyco.bind(bdecode)
psyco.bind(bencode)
except ImportError:
pass

View file

@ -0,0 +1,100 @@
# Written by Bram Cohen
# see LICENSE.txt for license information
from types import StringType, LongType, IntType, ListType, DictType
from re import compile
reg = compile(r'^[^/\\.~][^/\\]*$')
ints = (LongType, IntType)
def check_info(info):
if type(info) != DictType:
raise ValueError, 'bad metainfo - not a dictionary'
pieces = info.get('pieces')
if type(pieces) != StringType or len(pieces) % 20 != 0:
raise ValueError, 'bad metainfo - bad pieces key'
piecelength = info.get('piece length')
if type(piecelength) not in ints or piecelength <= 0:
raise ValueError, 'bad metainfo - illegal piece length'
name = info.get('name')
if type(name) != StringType:
raise ValueError, 'bad metainfo - bad name'
if not reg.match(name):
raise ValueError, 'name %s disallowed for security reasons' % name
if info.has_key('files') == info.has_key('length'):
raise ValueError, 'single/multiple file mix'
if info.has_key('length'):
length = info.get('length')
if type(length) not in ints or length < 0:
raise ValueError, 'bad metainfo - bad length'
else:
files = info.get('files')
if type(files) != ListType:
raise ValueError
for f in files:
if type(f) != DictType:
raise ValueError, 'bad metainfo - bad file value'
length = f.get('length')
if type(length) not in ints or length < 0:
raise ValueError, 'bad metainfo - bad length'
path = f.get('path')
if type(path) != ListType or path == []:
raise ValueError, 'bad metainfo - bad path'
for p in path:
if type(p) != StringType:
raise ValueError, 'bad metainfo - bad path dir'
if not reg.match(p):
raise ValueError, 'path %s disallowed for security reasons' % p
for i in xrange(len(files)):
for j in xrange(i):
if files[i]['path'] == files[j]['path']:
raise ValueError, 'bad metainfo - duplicate path'
def check_message(message):
if type(message) != DictType:
raise ValueError
check_info(message.get('info'))
if type(message.get('announce')) != StringType:
raise ValueError
def check_peers(message):
if type(message) != DictType:
raise ValueError
if message.has_key('failure reason'):
if type(message['failure reason']) != StringType:
raise ValueError
return
peers = message.get('peers')
if type(peers) == ListType:
for p in peers:
if type(p) != DictType:
raise ValueError
if type(p.get('ip')) != StringType:
raise ValueError
port = p.get('port')
if type(port) not in ints or p <= 0:
raise ValueError
if p.has_key('peer id'):
id = p['peer id']
if type(id) != StringType or len(id) != 20:
raise ValueError
elif type(peers) != StringType or len(peers) % 6 != 0:
raise ValueError
interval = message.get('interval', 1)
if type(interval) not in ints or interval <= 0:
raise ValueError
minint = message.get('min interval', 1)
if type(minint) not in ints or minint <= 0:
raise ValueError
if type(message.get('tracker id', '')) != StringType:
raise ValueError
npeers = message.get('num peers', 0)
if type(npeers) not in ints or npeers < 0:
raise ValueError
dpeers = message.get('done peers', 0)
if type(dpeers) not in ints or dpeers < 0:
raise ValueError
last = message.get('last', 0)
if type(last) not in ints or last < 0:
raise ValueError

View file

@ -0,0 +1,270 @@
# Written by Bram Cohen
# multitracker extensions by John Hoffman
# see LICENSE.txt for license information
from os.path import getsize, split, join, abspath, isdir
from os import listdir
from hashlib import sha1 as sha
from copy import copy
from string import strip
from bencode import bencode
from btformats import check_info
from threading import Event
from time import time
from traceback import print_exc
try:
from sys import getfilesystemencoding
ENCODING = getfilesystemencoding()
except:
from sys import getdefaultencoding
ENCODING = getdefaultencoding()
defaults = [
('announce_list', '',
'a list of announce URLs - explained below'),
('httpseeds', '',
'a list of http seed URLs - explained below'),
('piece_size_pow2', 0,
"which power of 2 to set the piece size to (0 = automatic)"),
('comment', '',
"optional human-readable comment to put in .torrent"),
('filesystem_encoding', '',
"optional specification for filesystem encoding " +
"(set automatically in recent Python versions)"),
('target', '',
"optional target file for the torrent")
]
default_piece_len_exp = 18
ignore = ['core', 'CVS']
def print_announcelist_details():
print (' announce_list = optional list of redundant/backup tracker URLs, in the format:')
print (' url[,url...][|url[,url...]...]')
print (' where URLs separated by commas are all tried first')
print (' before the next group of URLs separated by the pipe is checked.')
print (" If none is given, it is assumed you don't want one in the metafile.")
print (' If announce_list is given, clients which support it')
print (' will ignore the <announce> value.')
print (' Examples:')
print (' http://tracker1.com|http://tracker2.com|http://tracker3.com')
print (' (tries trackers 1-3 in order)')
print (' http://tracker1.com,http://tracker2.com,http://tracker3.com')
print (' (tries trackers 1-3 in a randomly selected order)')
print (' http://tracker1.com|http://backup1.com,http://backup2.com')
print (' (tries tracker 1 first, then tries between the 2 backups randomly)')
print ('')
print (' httpseeds = optional list of http-seed URLs, in the format:')
print (' url[|url...]')
def make_meta_file(file, url, params = {}, flag = Event(),
progress = lambda x: None, progress_percent = 1):
if params.has_key('piece_size_pow2'):
piece_len_exp = params['piece_size_pow2']
else:
piece_len_exp = default_piece_len_exp
if params.has_key('target') and params['target'] != '':
f = params['target']
else:
a, b = split(file)
if b == '':
f = a + '.torrent'
else:
f = join(a, b + '.torrent')
if piece_len_exp == 0: # automatic
size = calcsize(file)
if size > 8L*1024*1024*1024: # > 8 gig =
piece_len_exp = 21 # 2 meg pieces
elif size > 2*1024*1024*1024: # > 2 gig =
piece_len_exp = 20 # 1 meg pieces
elif size > 512*1024*1024: # > 512M =
piece_len_exp = 19 # 512K pieces
elif size > 64*1024*1024: # > 64M =
piece_len_exp = 18 # 256K pieces
elif size > 16*1024*1024: # > 16M =
piece_len_exp = 17 # 128K pieces
elif size > 4*1024*1024: # > 4M =
piece_len_exp = 16 # 64K pieces
else: # < 4M =
piece_len_exp = 15 # 32K pieces
piece_length = 2 ** piece_len_exp
encoding = None
if params.has_key('filesystem_encoding'):
encoding = params['filesystem_encoding']
if not encoding:
encoding = ENCODING
if not encoding:
encoding = 'ascii'
info = makeinfo(file, piece_length, encoding, flag, progress, progress_percent)
if flag.isSet():
return
check_info(info)
h = open(f, 'wb')
data = {'info': info, 'announce': strip(url), 'creation date': long(time())}
if params.has_key('comment') and params['comment']:
data['comment'] = params['comment']
if params.has_key('real_announce_list'): # shortcut for progs calling in from outside
data['announce-list'] = params['real_announce_list']
elif params.has_key('announce_list') and params['announce_list']:
l = []
for tier in params['announce_list'].split('|'):
l.append(tier.split(','))
data['announce-list'] = l
if params.has_key('real_httpseeds'): # shortcut for progs calling in from outside
data['httpseeds'] = params['real_httpseeds']
elif params.has_key('httpseeds') and params['httpseeds']:
data['httpseeds'] = params['httpseeds'].split('|')
if params.has_key('url-list') and params['url-list']:
data['url-list'] = params['url-list'].split('|')
if params.has_key('playtime') and params['playtime']:
data['info']['playtime'] = params['playtime']
h.write(bencode(data))
h.close()
def calcsize(file):
if not isdir(file):
return getsize(file)
total = 0L
for s in subfiles(abspath(file)):
total += getsize(s[1])
return total
def uniconvertl(l, e):
r = []
try:
for s in l:
r.append(uniconvert(s, e))
except UnicodeError:
raise UnicodeError('bad filename: '+join(*l))
return r
def uniconvert(s, e):
try:
if s.__class__.__name__ != 'unicode':
s = unicode(s,e)
except UnicodeError:
raise UnicodeError('bad filename: '+s)
return s.encode('utf-8')
def makeinfo(file, piece_length, encoding, flag, progress, progress_percent=1):
file = abspath(file)
if isdir(file):
subs = subfiles(file)
subs.sort()
pieces = []
sh = sha()
done = 0L
fs = []
totalsize = 0.0
totalhashed = 0L
for p, f in subs:
totalsize += getsize(f)
for p, f in subs:
pos = 0L
size = getsize(f)
fs.append({'length': size, 'path': uniconvertl(p, encoding)})
h = open(f, 'rb')
while pos < size:
a = min(size - pos, piece_length - done)
sh.update(h.read(a))
if flag.isSet():
return
done += a
pos += a
totalhashed += a
if done == piece_length:
pieces.append(sh.digest())
done = 0
sh = sha()
if progress_percent:
progress(totalhashed / totalsize)
else:
progress(a)
h.close()
if done > 0:
pieces.append(sh.digest())
return {'pieces': ''.join(pieces),
'piece length': piece_length, 'files': fs,
'name': uniconvert(split(file)[1], encoding) }
else:
size = getsize(file)
pieces = []
p = 0L
h = open(file, 'rb')
while p < size:
x = h.read(min(piece_length, size - p))
if flag.isSet():
return
pieces.append(sha(x).digest())
p += piece_length
if p > size:
p = size
if progress_percent:
progress(float(p) / size)
else:
progress(min(piece_length, size - p))
h.close()
return {'pieces': ''.join(pieces),
'piece length': piece_length, 'length': size,
'name': uniconvert(split(file)[1], encoding) }
def subfiles(d):
r = []
stack = [([], d)]
while len(stack) > 0:
p, n = stack.pop()
if isdir(n):
for s in listdir(n):
if s not in ignore and s[:1] != '.':
stack.append((copy(p) + [s], join(n, s)))
else:
r.append((p, n))
return r
def completedir(dir, url, params = {}, flag = Event(),
vc = lambda x: None, fc = lambda x: None):
files = listdir(dir)
files.sort()
ext = '.torrent'
if params.has_key('target'):
target = params['target']
else:
target = ''
togen = []
for f in files:
if f[-len(ext):] != ext and (f + ext) not in files:
togen.append(join(dir, f))
total = 0
for i in togen:
total += calcsize(i)
subtotal = [0]
def callback(x, subtotal = subtotal, total = total, vc = vc):
subtotal[0] += x
vc(float(subtotal[0]) / total)
for i in togen:
fc(i)
try:
t = split(i)[-1]
if t not in ignore and t[0] != '.':
if target != '':
params['target'] = join(target,t+ext)
make_meta_file(i, url, params, flag, progress = callback, progress_percent = 0)
except ValueError:
print_exc()

View file

@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
try:
from django.utils import datetime
except ImportError:
from datetime import datetime
try:
import simplejson as json
except ImportError:
try:
import json
except ImportError:
from django.utils import simplejson as json
try:
import xml.etree.ElementTree as ET
except ImportError:
import elementtree.ElementTree as ET

View file

@ -0,0 +1,9 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
__version__ = '1.0.0'
from . import imdb
from . import wikipedia
from . import google
from . import piratecinema
from . import oxdb

View file

@ -0,0 +1,20 @@
from ox.cache import read_url
import re
import lxml.html
def get_data(id):
info = {}
base = 'http://www.abebooks.com'
url = '%s/servlet/SearchResults?isbn=%s&sts=t' % (base, id)
data = read_url(url)
urls = re.compile('href="(/servlet/BookDetailsPL[^"]+)"').findall(data)
if urls:
details = '%s%s' % (base, urls[0])
data = read_url(details)
doc = lxml.html.document_fromstring(data)
for e in doc.xpath("//*[contains(@id, 'biblio')]"):
key = e.attrib['id'].replace('biblio-', '')
value = e.text_content()
if value and key not in ('bookcondition', 'binding'):
info[key] = value
return info

View file

@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox import strip_tags, find_re
from ox.cache import read_url
def get_id(url):
return url.split("/")[-1]
def get_data(id):
'''
>>> get_data('129689')['cast'][1][1]
u'Marianne'
>>> get_data('129689')['credits'][0][0]
u'Jean-Luc Godard'
>>> get_data('129689')['posters'][0]
u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
>>> get_data('129689')['rating']
u'4.5'
'''
if id.startswith('http'):
id = get_id(id)
data = {
"url": get_url(id)
}
html = read_url(data["url"], unicode=True)
data['aka'] = parse_list(html, 'AKA')
data['category'] = find_re(html, '<dt>category</dt>.*?<dd>(.*?)</dd>')
data['countries'] = parse_list(html, 'countries')
data['director'] = parse_entry(html, 'directed by')
data['genres'] = parse_list(html, 'genres')
data['keywords'] = parse_list(html, 'keywords')
data['posters'] = [find_re(html, '<img src="(http://cps-.*?)"')]
data['produced'] = parse_list(html, 'produced by')
data['rating'] = find_re(html, 'Stars" title="(.*?) Stars"')
data['released'] = parse_entry(html, 'released by')
data['releasedate'] = parse_list(html, 'release date')
data['runtime'] = parse_entry(html, 'run time').replace('min.', '').strip()
data['set'] = parse_entry(html, 'set in')
data['synopsis'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
data['themes'] = parse_list(html, 'themes')
data['types'] = parse_list(html, 'types')
data['year'] = find_re(html, '<span class="year">.*?(\d+)')
#data['stills'] = [re.sub('_derived.*?/', '', i) for i in re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)]
data['stills'] = re.compile('<a href="#" title="movie still".*?<img src="(.*?)"', re.DOTALL).findall(html)
#html = read_url("http://allmovie.com/work/%s/cast" % id, unicode=True)
#data['cast'] = parse_table(html)
#html = read_url("http://allmovie.com/work/%s/credits" % id, unicode=True)
#data['credits'] = parse_table(html)
html = read_url("http://allmovie.com/work/%s/review" % id, unicode=True)
data['review'] = strip_tags(find_re(html, '<div class="toggle-text" itemprop="description">(.*?)</div>')).strip()
return data
def get_url(id):
return "http://allmovie.com/work/%s" % id
def parse_entry(html, title):
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title)
return strip_tags(html).strip()
def parse_list(html, title):
html = find_re(html, '<dt>%s</dt>.*?<dd>(.*?)</dd>' % title.lower())
r = list(map(strip_tags, re.compile('<li>(.*?)</li>', re.DOTALL).findall(html)))
if not r and html:
r = [strip_tags(html)]
return r
def parse_table(html):
return [
[
strip_tags(r).strip().replace('&nbsp;', '')
for r in x.split('<td width="305">-')
]
for x in find_re(html, '<div id="results-table">(.*?)</table>').split('</tr>')[:-1]
]
def parse_text(html, title):
return strip_tags(find_re(html, '%s</td>.*?<td colspan="2"><p>(.*?)</td>' % title)).strip()
if __name__ == '__main__':
print(get_data('129689'))
# print(get_data('177524'))

View file

@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves.urllib.parse import quote
from ox import find_re, strip_tags, decode_html
from ox.cache import read_url
import lxml
def findISBN(title, author):
q = '%s %s' % (title, author)
url = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dstripbooks&field-keywords=" + "%s&x=0&y=0" % quote(q)
data = read_url(url, unicode=True)
links = re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)
id = find_re(re.compile('href="(http://www.amazon.com/.*?/dp/.*?)"').findall(data)[0], '/dp/(.*?)/')
data = get_data(id)
if author in data['authors']:
return data
return {}
def get_data(id):
url = "http://www.amazon.com/title/dp/%s/" % id
data = read_url(url, unicode=True)
def find_data(key):
return find_re(data, '<li><b>%s:</b>(.*?)</li>'% key).strip()
r = {}
r['amazon'] = url
r['title'] = find_re(data, '<span id="productTitle" class="a-size-large">(.*?)</span>')
r['authors'] = []
doc = lxml.html.document_fromstring(data)
for e in doc.xpath("//span[contains(@class, 'author')]"):
print(e)
for secondary in e.xpath(".//span[contains(@class, 'a-color-secondary')]"):
if 'Author' in secondary.text:
author = e.xpath(".//span[contains(@class, 'a-size-medium')]")
if author:
r['authors'].append(author[0].text.strip())
else:
r['authors'].append(e.xpath('.//a')[0].text.strip())
break
elif 'Translator' in secondary.text:
r['translator'] = [e.xpath('.//a')[0].text]
break
r['publisher'] = find_data('Publisher')
r['language'] = find_data('Language')
r['isbn-10'] = find_data('ISBN-10')
r['isbn-13'] = find_data('ISBN-13').replace('-', '')
r['dimensions'] = find_re(data, '<li><b>.*?Product Dimensions:.*?</b>(.*?)</li>')
r['pages'] = find_data('Paperback')
if not r['pages']:
r['pages'] = find_data('Hardcover')
r['review'] = strip_tags(find_re(data, '<h3 class="productDescriptionSource">Review</h3>.*?<div class="productDescriptionWrapper">(.*?)</div>').replace('<br />', '\n')).strip()
for e in doc.xpath('//noscript'):
for c in e.getchildren():
if c.tag == 'div':
r['description'] = strip_tags(decode_html(lxml.html.tostring(c))).strip()
break
r['cover'] = re.findall('src="(.*?)" id="prodImage"', data)
if r['cover']:
r['cover'] = r['cover'][0].split('._BO2')[0]
if not r['cover'].endswith('.jpg'):
r['cover'] = r['cover'] + '.jpg'
if 'no-image-avail-img' in r['cover']:
del r['cover']
else:
del r['cover']
return r

View file

@ -0,0 +1,67 @@
import json
import re
from ox.cache import read_url
HEADERS = {
'User-Agent': 'iTunes/10.4 (Macintosh; Intel Mac OS X 10.7) AppleWebKit/534.48.3',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-us, en;q=0.50',
'X-Apple-Store-Front': '143441-1,12',
'X-Apple-Tz': '7200',
'Accept-Encoding': 'gzip, deflate'
}
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
def get_movie_data(title, director):
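# two lookups: the iTunes advanced search provides the store link (and a fallback
# poster/trailer), then trailers.apple.com's quickfind provides the preferred poster
# and trailer urls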
# title/director may arrive as bytes from python2-era callers; make sure we work with text
if isinstance(title, bytes):
title = title.decode('utf-8')
if isinstance(director, bytes):
director = director.decode('utf-8')
data = {}
# itunes section (preferred source for link)
url = 'http://ax.search.itunes.apple.com/WebObjects/MZSearch.woa/wa/advancedSearch'
url += '?media=movie&movieTerm=' + title
url += '&actorNames=&directorProducerName=' + director
url += '&releaseYearTerm=&descriptionTerm=&genreIndex=1&ratingIndex=1'
HEADERS['Referer'] = url
html = read_url(url, headers=HEADERS, unicode=True)
regexp = '<a href="(http://itunes.apple.com/us/movie/.*?)" class="artwork-link"><div class="artwork">'
regexp += '<img width=".*?" height=".*?" alt=".*?" class="artwork" src="(.*?)" /></div></a>'
results = re.compile(regexp).findall(html)
if results:
data['link'] = results[0][0]
data['poster'] = results[0][1].replace('140x140', '600x600')
html = read_url(data['link'], headers=HEADERS, unicode=True)
results = re.compile('video-preview-url="(.*?)"').findall(html)
if results:
data['trailer'] = results[0]
# trailers section (preferred source for poster and trailer)
host = 'http://trailers.apple.com'
url = host + '/trailers/home/scripts/quickfind.php?callback=searchCallback&q=' + title
js = json.loads(read_url(url, unicode=True)[16:-4])
results = js['results']
if results:
url = host + results[0]['location']
if not 'link' in data:
data['link'] = url
headers = {
'User-Agent': USER_AGENT
}
html = read_url(url, headers=headers, unicode=True)
results = re.compile('"(' + host + '.*?poster\.jpg)"').findall(html)
if results:
data['poster'] = results[0].replace('poster.jpg', 'poster-xlarge.jpg')
html = read_url(url + 'includes/playlists/web.inc', headers=headers, unicode=True)
results = re.compile('"(' + host + '\S+\.mov)"').findall(html)
if results:
data['trailer'] = results[-1]
return data
if __name__ == '__main__':
print(get_movie_data('Alphaville', 'Jean-Luc Godard'))
print(get_movie_data('Sin City', 'Robert Rodriguez'))
print(get_movie_data('Breathless', 'Jean-Luc Godard'))
print(get_movie_data('Capitalism: A Love Story', 'Michael Moore'))
print(get_movie_data('Film Socialisme', 'Jean-Luc Godard'))

View file

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from .. import cache
from ..utils import json
def get_id(url):
return url.split("/")[-1]
def get_url(id):
return "http://www.archive.org/details/%s" % id
def get_data(id):
data = {}
url = get_url(id)
details = cache.read_url('%s?output=json' % url)
details = json.loads(details)
for key in ('title', 'description', 'runtime'):
data[key] = details['metadata'][key]
if isinstance(data[key], list):
data[key] = data[key][0]
data['url'] = url
data['image'] = 'http://archive.org/download/%s/format=thumbnail' % id
data['ogg'] = 'http://archive.org/download/%s/format=Ogg+video' % id
data['mp4'] = 'http://archive.org/download/%s/format=512Kb+MPEG4' % id
return data
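# Illustrative usage sketch (not part of the original module). Because this
# module uses relative imports, run it via the package, e.g.
# `python -m ox.web.archive` (assuming that is where this file lives); the
# item id below is a placeholder for any public archive.org details id.
if __name__ == '__main__':
    example_id = 'example_item'  # placeholder identifier
    print(json.dumps(get_data(example_id), indent=2))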

View file

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import json
import os
import re
from ox import find_re, strip_tags
from ox.cache import read_url
def get_data(id, language='en'):
if language == 'de':
url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d/lang/de_DE' % id
else:
url = 'http://films.arsenal-berlin.de/index.php/Detail/Object/Show/object_id/%d' % id
html = read_url(url, unicode=True)
if 'ID does not exist' in html:
return None
if 'Willkommen in der Datenbank des Arsenal' in html:
return None
data = {}
data[u'id'] = id
data[u'url'] = url
m = re.compile('<h1>(.*?)</h1>').findall(html)
if m:
data[u'title'] = m[0]
m = re.compile("<b>Director: </b><a href='.*?'>(.*?)</a>").findall(html)
if m:
data[u'director'] = m[0]
m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
if m:
data[u'image'] = m[0]
units = re.compile("<div class='unit'>(.*?)</div>", re.DOTALL).findall(html)
for x in map(re.compile('<b>(.*?)</b>: (.*)', re.DOTALL).findall, units):
if x:
#data[x[0][0].lower()] = strip_tags(x[0][1])
key = x[0][0].lower()
data[key] = x[0][1]
if key == "forum catalogue pdf":
data[key] = find_re(data[key], '"(http:.*?)"')
else:
data[key] = strip_tags(data[key])
if "running time (minutes)" in data:
data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60
for key in ('year', 'length in metres', 'forum participation year', 'number of reels'):
if key in data and data[key].isdigit():
data[key] = int(data[key])
return data
def backup(filename):
if os.path.exists(filename):
with open(filename) as f:
data = json.load(f)
else:
data = {}
    start = max(map(int, data)) if data else 1
for i in range(start, 11872):
info = get_data(i)
if info:
data[i] = info
if len(data) % 10 == 0:
                print('save', filename, len(data))
with open(filename, 'w') as f:
json.dump(data, f)
else:
            print('ignore', i)
with open(filename, 'w') as f:
json.dump(data, f)
return data
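# Illustrative usage sketch (not part of the original module): fetch one record
# by its numeric object id. The id below is a placeholder; get_data() returns
# None if the id does not exist.
if __name__ == '__main__':
    example_id = 1  # placeholder numeric object id
    print(json.dumps(get_data(example_id), indent=2))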

View file

@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2009
import os
from ox.utils import json
def get(key):
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
auth = {}
if os.path.exists(user_auth):
f = open(user_auth, "r")
data = f.read()
f.close()
auth = json.loads(data)
if key in auth:
return auth[key]
print "please add key %s to json file '%s'" % (key, user_auth)
raise Exception,"no key %s found" % key
def update(key, value):
user_auth = os.environ.get('oxAUTH', os.path.expanduser('~/.ox/auth.json'))
auth = {}
if os.path.exists(user_auth):
f = open(user_auth, "r")
data = f.read()
f.close()
auth = json.loads(data)
auth[key] = value
f = open(user_auth, "w")
f.write(json.dumps(auth, indent=2))
f.close()
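# Illustrative usage sketch (not part of the original module). update() writes
# to the auth file configured via the oxAUTH environment variable or
# ~/.ox/auth.json (it assumes that file's directory already exists); get()
# raises an Exception if the key is missing. The key name below is arbitrary.
if __name__ == '__main__':
    update('example.key', 'example value')
    print(get('example.key'))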

View file

@ -0,0 +1,100 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import ox.cache
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import find_re
from ox.web import imdb
def get_id(url):
return url.split("/")[-1]
def get_url(id):
return "http://www.criterion.com/films/%s" % id
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
'''
>>> get_data('1333').get('imdbId')
u'0060304'
>>> get_data('236')['posters'][0]
u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'
>>> get_data('786')['posters'][0]
u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
'''
data = {
"url": get_url(id)
}
try:
html = read_url(data["url"], timeout=timeout, unicode=True)
except:
html = ox.cache.read_url(data["url"], timeout=timeout)
data["number"] = find_re(html, "<li>Spine #(\d+)")
data["title"] = find_re(html, "<h1 class=\"movietitle\">(.*?)</h1>")
data["title"] = data["title"].split(u' \u2014 The Television Version')[0]
data["director"] = strip_tags(find_re(html, "<h2 class=\"director\">(.*?)</h2>"))
results = find_re(html, '<div class="left_column">(.*?)</div>')
results = re.compile("<li>(.*?)</li>").findall(results)
data["country"] = results[0]
data["year"] = results[1]
data["synopsis"] = strip_tags(find_re(html, "<div class=\"content_block last\">.*?<p>(.*?)</p>"))
result = find_re(html, "<div class=\"purchase\">(.*?)</div>")
if 'Blu-Ray' in result or 'Essential Art House DVD' in result:
r = re.compile('<h3 class="section_title first">Other Editions</h3>(.*?)</div>', re.DOTALL).findall(html)
if r:
result = r[0]
result = find_re(result, "<a href=\"(.*?)\"")
if not "/boxsets/" in result:
data["posters"] = [result]
else:
html_ = read_url(result, unicode=True)
result = find_re(html_, '<a href="http://www.criterion.com/films/%s.*?">(.*?)</a>' % id)
result = find_re(result, "src=\"(.*?)\"")
if result:
data["posters"] = [result.replace("_w100", "")]
else:
data["posters"] = []
data['posters'] = [re.sub('(\?\d+)$', '', p) for p in data['posters']]
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result:
data["stills"] = [result]
data["trailers"] = []
else:
data["stills"] = filter(lambda x: x, [find_re(html, "\"thumbnailURL\", \"(.*?)\"")])
data["trailers"] = filter(lambda x: x, [find_re(html, "\"videoURL\", \"(.*?)\"")])
if timeout == ox.cache.cache_timeout:
timeout = -1
if get_imdb:
# removed year, as "title (year)" may fail to match
data['imdbId'] = imdb.get_movie_id(data['title'], data['director'], timeout=timeout)
return data
def get_ids(page=None):
ids = []
if page:
url = "http://www.criterion.com/library/expanded_view?m=dvd&p=%s&pp=50&s=spine" % page
        html = read_url(url, unicode=True)
results = re.compile("films/(\d+)").findall(html)
ids += results
results = re.compile("boxsets/(.*?)\"").findall(html)
for result in results:
html = read_url("http://www.criterion.com/boxsets/" + result)
results = re.compile("films/(\d+)").findall(html)
ids += results
return set(ids)
html = read_url("http://www.criterion.com/library/expanded_view?m=dvd&p=1&pp=50&s=spine", unicode=True)
results = re.compile("\&amp;p=(\d+)\&").findall(html)
pages = max(map(int, results))
for page in range(1, pages):
ids += get_ids(page)
return sorted(set(ids), key=int)
if __name__ == '__main__':
    print(get_ids())

View file

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves.urllib.parse import unquote
from ox.cache import read_url
def get_video_url(url):
'''
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3opar_priere-pour-refuznik-1-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3opar_priere-pour-refuznik-1-jean-luc-god_shortfilms.flv'
>>> get_video_url('http://www.dailymotion.com/relevance/search/priere%2Bpour%2Brefuznik/video/x3ou94_priere-pour-refuznik-2-jeanluc-goda_shortfilms').split('?auth')[0]
'http://www.dailymotion.com/cdn/FLV-320x240/video/x3ou94_priere-pour-refuznik-2-jean-luc-god_shortfilms.flv'
'''
    data = read_url(url, unicode=True)
video = re.compile('''video", "(.*?)"''').findall(data)
for v in video:
v = unquote(v).split('@@')[0]
return v
return ''

View file

@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves import urllib
import ox
from ox import strip_tags, decode_html
from ox.cache import read_url
def find(query, timeout=ox.cache.cache_timeout):
if not isinstance(query, bytes):
query = query.encode('utf-8')
params = urllib.parse.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params
data = read_url(url, timeout=timeout).decode('utf-8')
results = []
regex = '<a .*?class="large" href="(.+?)">(.*?)</a>.*?<div class="snippet">(.*?)</div>'
for r in re.compile(regex, re.DOTALL).findall(data):
results.append((strip_tags(decode_html(r[1])), r[0], strip_tags(decode_html(r[2]))))
return results
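# Illustrative usage sketch (not part of the original module): results depend
# on the live site still matching the regex above; the query string is only an
# example.
if __name__ == '__main__':
    for title, url, description in find('duckduckgo'):
        print(title, url)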

View file

@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from ox import strip_tags, find_re
from ox.cache import read_url
from ox.web import google
def get_show_url(title):
'''
    Search for the epguides.com URL of a show by its title.
    Uses Google to find the URL, which is also what epguides.com itself does.
'''
for (name, url, desc) in google.find('allintitle: site:epguides.com %s' % title, 1):
if url.startswith('http://epguides.com'):
if re.search(title, name):
return url
return None
def get_show_data(url):
data = read_url(url, unicode=True)
r = {}
r['title'] = strip_tags(find_re(data, '<h1>(.*?)</h1>'))
r['imdb'] = find_re(data, '<h1><a href=".*?/title/tt(\d.*?)">.*?</a></h1>')
r['episodes'] = {}
#1. 1- 1 1001 7 Aug 05 You Can't Miss the Bear
for episode in re.compile('(\d+?)\..*?(\d+?-.\d.*?) .*?(\d+?) .*?(.*?) <a target="_blank" href="(.*?)">(.*?)</a>').findall(data):
air_date = episode[3].strip()
#'22 Sep 04' -> 2004-09-22
try:
air_date = time.strftime('%Y-%m-%d', time.strptime(air_date, '%d %b %y'))
except:
pass
s = episode[1].split('-')[0].strip()
e = episode[1].split('-')[-1].strip()
try:
r['episodes']['S%02dE%02d' % (int(s), int(e))] = {
'prod code': episode[2],
'air date': air_date,
'url': episode[4],
'title':episode[5],
}
except:
print "oxweb.epguides failed,", url
return r
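# Illustrative usage sketch (not part of the original module). The show title
# is only an example; get_show_url() may return None if no match is found.
if __name__ == '__main__':
    url = get_show_url('Monk')
    if url:
        data = get_show_data(url)
        print(data['title'], len(data['episodes']), 'episodes')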

View file

@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import string
import subprocess
import ox
import os
def get_ids():
result = []
for i in string.ascii_uppercase:
url = "http://www.filmsdivision.org/search.php?title=%s" % i
        data = ox.cache.read_url(url, unicode=True)
links = re.compile('view_video.php\?movId=(.*?)[\'"]', re.DOTALL).findall(data)
result += links
return list(set(result))
def get_data(id):
result = {}
url = "http://www.filmsdivision.org/view_video.php?movId=%s" % id
    data = ox.cache.read_url(url, unicode=True)
result['title'] = re.compile('<td.*?class="vdoheadtxt".*?>(.*?)</td>').findall(data)[0]
result['year'] = re.compile('Release: (\d{4})').findall(data)[0]
result['duration'] = int(re.compile('Duration: (\d+)mins').findall(data)[0]) * 60
result['producer'] = re.compile('Producer: (.*?)\t').findall(data)[0].strip()
if 'Director:' in data:
result['director'] = re.compile('Director: (.*?)\t').findall(data)[0].strip()
else:
result['director'] = "Unknown Director"
result['url'] = re.compile('value="(.*?.wmv)"').findall(data)[0]
return result
def download_video(url, filename):
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
os.makedirs(dirname)
    p = subprocess.Popen(['gst-launch', 'mmssrc', 'location=%s' % url, '!', 'filesink', 'location=%s' % filename])
p.wait()
return p.returncode == 0
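# Illustrative usage sketch (not part of the original module): list available
# ids and fetch metadata for the first one. download_video() additionally
# requires gst-launch to be installed, so it is not called here.
if __name__ == '__main__':
    ids = get_ids()
    if ids:
        print(get_data(ids[0]))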

View file

@ -0,0 +1,74 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from lxml.html import document_fromstring
from ox.cache import read_url
from ox import find_re, strip_tags
from ox.web.imdb import ImdbCombined
def get_data(id, timeout=-1):
'''
>>> get_data('the-matrix')['poster']
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
>>> get_data('0133093')['poster']
'http://content7.flixster.com/movie/16/90/52/1690525_gal.jpg'
>>> get_data('2-or-3-things-i-know-about-her')['poster']
'http://content6.flixster.com/movie/10/95/43/10954392_gal.jpg'
>>> get_data('0078875')['rottentomatoes_id']
'http://www.rottentomatoes.com/m/the-tin-drum/'
'''
if len(id) == 7:
try:
int(id)
id = get_id(imdb=id)
except:
pass
data = {
"url": get_url(id),
}
html = read_url(data['url'], timeout=timeout, unicode=True)
doc = document_fromstring(html)
props = {
'og:title': 'title',
'og:image': 'poster',
'og:url': 'rottentomatoes_id',
}
for meta in doc.head.findall('meta'):
prop = meta.attrib.get('property', None)
content = meta.attrib.get('content', '')
if prop in props and content:
data[props[prop]] = content
for p in doc.body.find_class('synopsis'):
data['synopsis'] = p.text.strip()
if 'poster' in data and data['poster']:
data['poster'] = data['poster'].replace('_pro.jpg', '_gal.jpg')
if not 'title' in data:
return None
return data
def get_id(url=None, imdb=None):
'''
>>> get_id(imdb='0133093')
u'the-matrix'
#>>> get_id(imdb='0060304')
#u'2-or-3-things-i-know-about-her'
'''
if imdb:
i = ImdbCombined(imdb)
title = i['title']
return title.replace(' ', '-').lower().replace("'", '')
return url.split('/')[-1]
def get_url(id):
return "http://www.flixster.com/movie/%s"%id

View file

@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import json
from ox.cache import read_url
from ox import find_re
class Freebase(dict):
def __init__(self, id, timeout=-1):
url = "http://ids.freebaseapps.com/get_ids?id=/authority/imdb/title/tt%s" % id
'''
"http://graph.freebase.com/imdb.title.tt%s" % id
might also be of interest at some point, right now not much info
'''
data = read_url(url, unicode=True)
try:
data = json.loads(data)
except ValueError:
return
'''
for key in data:
self[key] = data[key]
'''
for key in ('id', 'guid', 'name'):
self[key] = data[key]
keys = {
'wikipedia': '/wikipedia/en',
'netflix': '/authority/netflix/movie',
'nytimes': '/source/nytimes/movie',
'metacritic': '/source/metacritic/movie',
}
for key in keys:
            links = [x for x in data['ids'] if x['namespace'] == keys[key]]
if links:
self[key] = links[0]['uri']
if 'nytimes' in self:
self['nytimes'] = self['nytimes'].replace('_/overview', '%s/overview' % self['name'].replace(' ', '-'))
self['amgId'] = find_re(self['nytimes'], 'movie/(\d+)/')
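# Illustrative usage sketch (not part of the original module): the constructor
# takes an imdb id without the 'tt' prefix; which keys end up in the dict
# depends on what the remote service returns.
if __name__ == '__main__':
    info = Freebase('0133093')
    print(info.get('name'), info.get('wikipedia'))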

View file

@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves import urllib
import ox
from ox import strip_tags, decode_html
DEFAULT_MAX_RESULTS = 10
DEFAULT_TIMEOUT = 24*60*60
def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIMEOUT):
return ox.cache.read_url(url, data, headers, timeout, unicode=True)
def quote_plus(s):
if not isinstance(s, bytes):
s = s.encode('utf-8')
return urllib.parse.quote_plus(s)
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
"""
Return max_results tuples with title, url, description
>>> find("The Matrix site:imdb.com", 1)[0][0]
u'The Matrix (1999) - IMDb'
>>> find("The Matrix site:imdb.com", 1)[0][1]
u'http://www.imdb.com/title/tt0133093/'
"""
results = []
offset = 0
while len(results) < max_results:
url = 'http://google.com/search?q=%s' % quote_plus(query)
if offset:
url += '&start=%d' % offset
data = read_url(url, timeout=timeout)
data = re.sub('<span class="f">(.*?)</span>', '\\1', data)
for a in re.compile('<a href="(htt\S+?)".*?>(.*?)</a>.*?<span class="st">(.*?)<\/span>').findall(data):
results.append((strip_tags(decode_html(a[1])), a[0], strip_tags(decode_html(a[2]))))
if len(results) >= max_results:
break
offset += 10
return results

View file

@ -0,0 +1,821 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
import time
import unicodedata
from six.moves import urllib
from six import string_types
from .. import find_re, strip_tags, decode_html
from .. import cache
from . siteparser import SiteParser
from . import duckduckgo
from ..utils import datetime
from ..geo import normalize_country_name
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
return cache.read_url(url, data, headers, timeout, unicode=unicode)
def get_url(id):
return "http://www.imdb.com/title/tt%s/" % id
class Imdb(SiteParser):
'''
>>> Imdb('0068646')['title']
u'The Godfather'
>>> Imdb('0133093')['title']
u'The Matrix'
'''
regex = {
'alternativeTitles': {
'page': 'releaseinfo',
're': [
'name="akas".*?<table.*?>(.*?)</table>',
"td>(.*?)</td>.*?<td>(.*?)</td>"
],
'type': 'list'
},
'aspectratio': {
'page': 'combined',
're': 'Aspect Ratio:</h5><div class="info-content">([\d\.]+)',
'type': 'float',
},
'budget': {
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
],
'type': 'int'
},
'cast': {
'page': 'combined',
're': [
'<td class="nm">.*?>(.*?)</a>.*?<td class="char">(.*?)</td>',
lambda ll: [strip_tags(l) for l in ll]
],
'type': 'list'
},
'cinematographer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Cinematography by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'connections': {
'page': 'trivia?tab=mc',
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
'type': 'list'
},
'country': {
'page': 'combined',
're': [
'<div class="info"><h5>Country:</h5>.*?<div class="info">',
#'<a href="/country/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
'<a.*?>(.*?)</a>',
],
'type': 'list'
},
'creator': {
'page': 'combined',
're': [
'<h5>Creator.?:</h5>.*?<div class="info-content">(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'director': {
'page': 'combined',
're': [
lambda data: data.split('<b>Series Crew</b>')[0],
'Directed by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'_director': {
'page': 'combined',
're': [
'<h5>Director:</h5>.*?<div class="info-content">(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'editor': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Film Editing by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'composer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Original Music by</a>(.*?)</table>',
'<a href="/name/.*?>(.*?)</a>'
],
'type': 'list'
},
'episodeTitle': {
'page': 'combined',
're': '<div id="tn15title">.*?<em>(.*?)</em>',
'type': 'string'
},
'filmingLocations': {
'page': 'locations',
're': [
'<a href="/search/title\?locations=.*?".*?>(.*?)</a>',
lambda data: data.strip(),
],
'type': 'list'
},
'genre': {
'page': 'combined',
're': [
'<h5>Genre:</h5>(.*?)<hr',
'<a href="/Sections/Genres/.*?/">(.*?)</a>'
],
'type': 'list'
},
'gross': {
'page': 'business',
're': [
'<h5>Gross</h5>\s*?\$(.*?)<br',
lambda data: find_re(data.replace(',', ''), '\d+')
],
'type': 'int'
},
'keyword': {
'page': 'keywords',
're': '<a href="/keyword/.*?>(.*?)</a>',
'type': 'list'
},
'language': {
'page': 'combined',
're': [
'<div class="info"><h5>Language:</h5>.*?<div class="info">',
#'<a href="/language/.*?">(.*?)</a>', #links changed to work with existing caches, just take all links
'<a.*?>(.*?)</a>',
],
'type': 'list'
},
'summary': {
'page': 'plotsummary',
're': '<p class="plotSummary">(.*?)<\/p>',
'type': 'string'
},
'posterId': {
'page': 'combined',
're': '/primary-photo/media/rm(.*?)/tt',
'type': 'string'
},
'posterIds': {
'page': 'posters',
're': '/unknown-thumbnail/media/rm(.*?)/tt',
'type': 'list'
},
'producer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Produced by</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'productionCompany': {
'page': 'combined',
're': [
'Production Companies</b><ul>(.*?)</ul>',
'<a href="/company/.*?/">(.*?)</a>'
],
'type': 'list'
},
'rating': {
'page': 'combined',
're': '<div class="starbar-meta">.*?<b>([\d,.]+?)/10</b>',
'type': 'float'
},
'releasedate': {
'page': 'releaseinfo',
're': [
'<td class="release_date">(.*?)</td>',
strip_tags,
],
'type': 'list'
},
'reviews': {
'page': 'externalreviews',
're': [
'<ol>(.*?)</ol>',
'<li><a href="(http.*?)".*?>(.*?)</a></li>'
],
'type': 'list'
},
'runtime': {
'page': 'combined',
're': '<h5>Runtime:</h5><div class="info-content">.*?([0-9]+ sec|[0-9]+ min).*?</div>',
'type': 'string'
},
'color': {
'page': 'combined',
're': [
'<h5>Color:</h5><div class="info-content">(.*?)</div>',
'<a.*?>(.*?)</a>'
],
'type': 'list'
},
'sound': {
'page': 'combined',
're': [
'<h5>Sound Mix:</h5><div class="info-content">(.*?)</div>',
'<a.*?>(.*?)</a>'
],
'type': 'list'
},
'season': {
'page': 'combined',
're': [
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
'\(Season (\d+), Episode \d+\)',
],
'type': 'int'
},
'episode': {
'page': 'combined',
're': [
'<h5>Original Air Date:</h5>.*?<div class="info-content">(.*?)</div>',
'\(Season \d+, Episode (\d+)\)',
],
'type': 'int'
},
'series': {
'page': 'combined',
're': '<h5>TV Series:</h5>.*?<a href="/title/tt(\d{7})',
'type': 'string'
},
'isSeries': {
'page': 'combined',
're': '<span class="tv-extra">(TV series|TV mini-series) ',
'type': 'string'
},
'title': {
'page': 'combined',
're': '<h1>(.*?) <span>',
'type': 'string'
},
'trivia': {
'page': 'trivia',
're': [
'<div class="sodatext">(.*?)<(br|/div)',
lambda data: data[0]
],
'type': 'list',
},
'votes': {
'page': 'combined',
're': '<a href="ratings" class="tn15more">([\d,]*?) votes</a>',
'type': 'string'
},
'writer': {
'page': 'combined',
're': [
lambda data: data.split('Series Crew')[0],
'Writing credits</a>(.*?)</table>',
'<a href="/name/.*?/">(.*?)</a>'
],
'type': 'list'
},
'year': {
'page': 'combined',
're': '="og:title" content="[^"]*?\((\d{4}).*?"',
'type': 'int'
}
}
def read_url(self, url, timeout):
if not url in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url]
def __init__(self, id, timeout=-1):
#use akas.imdb.com to always get original title:
#http://www.imdb.com/help/show_leaf?titlelanguagedisplay
self.baseUrl = "http://akas.imdb.com/title/tt%s/" % id
super(Imdb, self).__init__(timeout)
url = self.baseUrl + 'combined'
page = self.read_url(url, timeout=-1)
if '<title>IMDb: Page not found</title>' in page \
or 'The requested URL was not found on our server.' in page:
return
if "<p>We're sorry, something went wrong.</p>" in page:
time.sleep(1)
super(Imdb, self).__init__(0)
if 'alternativeTitles' in self:
if len(self['alternativeTitles']) == 2 and \
isinstance(self['alternativeTitles'][0], string_types):
self['alternativeTitles'] = [self['alternativeTitles']]
#normalize country names
if 'country' in self:
self['country'] = [normalize_country_name(c) or c for c in self['country']]
if 'sound' in self:
self['sound'] = list(set(self['sound']))
types = {}
stop_words = [
'alternative spelling',
'alternative title',
'alternative transliteration',
'closing credits title',
'complete title',
'IMAX version',
'informal short title',
'International (Spanish title)',
'Japan (imdb display title)',
'longer version',
'new title',
'original subtitled version',
'pre-release title',
'promotional abbreviation',
'recut version',
'reissue title',
'restored version',
'script title',
'short title',
'(subtitle)',
'TV title',
'working title',
'World-wide (Spanish title)',
]
#ignore english japanese titles
#for movies that are not only from japan
if ['Japan'] != self.get('country', []):
stop_words += [
'Japan (English title)'
]
for t in self.get('alternativeTitles', []):
for type in t[0].split('/'):
type = type.strip()
stop_word = False
for key in stop_words:
if key in type:
stop_word = True
break
if not stop_word:
if not type in types:
types[type] = []
types[type].append(t[1])
titles = {}
for type in types:
for title in types[type]:
if not title in titles:
titles[title] = []
titles[title].append(type)
def select_title(type):
title = types[type][0]
count = 0
if len(types[type]) > 1:
for t in types[type]:
if len(titles[t]) > count:
count = len(titles[t])
title = t
return title
        #FIXME: dict comprehensions do not work in python 2.6
#types = {type: select_title(type) for type in types}
_types = {}
for type in types:
_types[type] = select_title(type)
types = _types
regexps = [
"^.+ \(imdb display title\) \(English title\)$",
"^USA \(imdb display title\)$",
"^International \(English title\)$",
"^International \(English title\)$",
"^UK \(imdb display title\)$",
"^International \(.+\) \(English title\)$",
"^World-wide \(English title\)$",
]
if 'Hong Kong' in self.get('country', []):
regexps += [
"Hong Kong \(English title\)"
]
english_countries = (
'USA', 'UK', 'United States', 'United Kingdom',
'Australia', 'New Zealand'
)
        if not any(c in english_countries for c in self.get('country', [])):
regexps += [
"^[^(]+ \(English title\)$",
"^.+ \(.+\) \(English title\)$",
"^USA$",
"^UK$",
"^USA \(.+\)$",
"^UK \(.+\)$",
"^Australia \(.+\)$",
"World-wide \(English title\)",
"\(literal English title\)",
"^International \(.+ title\)$",
"^International \(.+\) \(.+ title\)$",
]
for regexp in regexps:
for type in types:
if re.compile(regexp).findall(type):
#print types[type], type
self['internationalTitle'] = types[type]
break
if 'internationalTitle' in self:
break
def cleanup_title(title):
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
if title.startswith("'") and title.endswith("'"):
title = title[1:-1]
title = re.sub('\(\#[.\d]+\)', '', title)
return title.strip()
for t in ('title', 'internationalTitle'):
if t in self:
self[t] = cleanup_title(self[t])
if 'internationalTitle' in self and \
self.get('title', '').lower() == self['internationalTitle'].lower():
del self['internationalTitle']
if 'alternativeTitles' in self:
alt = {}
for t in self['alternativeTitles']:
title = cleanup_title(t[1])
if title not in (self.get('title'), self.get('internationalTitle')):
if title not in alt:
alt[title] = []
for c in t[0].split('/'):
if not '(working title)' in c:
c = c.replace('International', '').replace('World-wide', '').split('(')[0].strip()
if c:
alt[title].append(c)
self['alternativeTitles'] = []
for t in sorted(alt, key=lambda a: sorted(alt[a])):
if alt[t]:
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
self['alternativeTitles'].append((t, countries))
if not self['alternativeTitles']:
del self['alternativeTitles']
if 'internationalTitle' in self:
self['originalTitle'] = self['title']
self['title'] = self.pop('internationalTitle')
if 'runtime' in self and self['runtime']:
if 'min' in self['runtime']: base=60
else: base=1
self['runtime'] = int(find_re(self['runtime'], '([0-9]+)')) * base
if 'runtime' in self and not self['runtime']:
del self['runtime']
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'cast' in self:
if isinstance(self['cast'][0], string_types):
self['cast'] = [self['cast']]
self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c):
c = c.replace('(uncredited)', '').strip()
return c
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
for x in self['cast']]
if 'connections' in self:
cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']]
for rel, data, _ in self['connections']:
if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
def get_conn(c):
r = {
'id': c[0],
'title': cleanup_title(c[1]),
}
description = c[2].split('<br />')
if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip()
return r
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
for key in ('country', 'genre'):
if key in self:
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999
if '_director' in self:
if 'series' in self or 'isSeries' in self:
self['creator'] = self.pop('_director')
else:
del self['_director']
if 'isSeries' in self:
del self['isSeries']
self['isSeries'] = True
if 'episodeTitle' in self:
self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle'])
if 'series' in self:
series = Imdb(self['series'], timeout=timeout)
self['seriesTitle'] = series['title']
if 'episodeTitle' in self:
self['seriesTitle'] = series['title']
if 'season' in self and 'episode' in self:
self['title'] = "%s (S%02dE%02d) %s" % (
self['seriesTitle'], self['season'], self['episode'], self['episodeTitle'])
else:
self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle'])
self['season'] = 1
self['title'] = self['title'].strip()
if 'director' in self:
self['episodeDirector'] = self['director']
if not 'creator' in series and 'director' in series:
series['creator'] = series['director']
if len(series['creator']) > 10:
series['creator'] = series['director'][:1]
for key in ['creator', 'country']:
if key in series:
self[key] = series[key]
if 'year' in series:
self['seriesYear'] = series['year']
if not 'year' in self:
self['year'] = series['year']
if 'year' in self:
self['episodeYear'] = self['year']
if 'creator' in self:
self['seriesDirector'] = self['creator']
if 'originalTitle' in self:
del self['originalTitle']
else:
for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'):
if key in self:
del self[key]
if 'creator' in self:
if 'director' in self:
self['episodeDirector'] = self['director']
self['director'] = self['creator']
#make lists unique but keep order
for key in ('director', 'language'):
if key in self:
self[key] = [x for i,x in enumerate(self[key])
if x not in self[key][i+1:]]
for key in ('actor', 'writer', 'producer', 'editor', 'composer'):
if key in self:
if isinstance(self[key][0], list):
self[key] = [i[0] for i in self[key] if i]
self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget']
if 'releasedate' in self:
def parse_date(d):
try:
d = datetime.strptime(d, '%d %B %Y')
except:
try:
d = datetime.strptime(d, '%B %Y')
except:
return 'x'
return '%d-%02d-%02d' % (d.year, d.month, d.day)
self['releasedate'] = min([
parse_date(d) for d in self['releasedate']
])
if self['releasedate'] == 'x':
del self['releasedate']
if 'summary' in self:
if isinstance(self['summary'], list):
self['summary'] = self['summary'][0]
self['summary'] = self['summary'].split('</p')[0].strip()
class ImdbCombined(Imdb):
def __init__(self, id, timeout=-1):
_regex = {}
for key in self.regex:
if self.regex[key]['page'] in ('combined', 'releaseinfo'):
_regex[key] = self.regex[key]
self.regex = _regex
super(ImdbCombined, self).__init__(id, timeout)
def get_movie_by_title(title, timeout=-1):
'''
This only works for exact title matches from the data dump
Usually in the format
Title (Year)
"Series Title" (Year) {(#Season.Episode)}
"Series Title" (Year) {Episode Title (#Season.Episode)}
If there is more than one film with that title for the year
Title (Year/I)
>>> get_movie_by_title(u'"Father Knows Best" (1954) {(#5.34)}')
u'1602860'
>>> get_movie_by_title(u'The Matrix (1999)')
u'0133093'
>>> get_movie_by_title(u'Little Egypt (1951)')
u'0043748'
>>> get_movie_by_title(u'Little Egypt (1897/I)')
u'0214882'
>>> get_movie_by_title(u'Little Egypt')
None
>>> get_movie_by_title(u'"Dexter" (2006) {Father Knows Best (#1.9)}')
u'0866567'
'''
params = {'s':'tt','q': title}
if not isinstance(title, bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
params['q'] = params['q'].encode('utf-8')
    params = urllib.parse.urlencode(params)
url = "http://akas.imdb.com/find?" + params
data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
results = re.compile(r).findall(data)
if results:
return results[0]
return None
def get_movie_id(title, director='', year='', timeout=-1):
'''
>>> get_movie_id('The Matrix')
u'0133093'
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard')
u'0060304'
>>> get_movie_id('2 or 3 Things I Know About Her', 'Jean-Luc Godard', '1967')
u'0060304'
>>> get_movie_id(u"Histoire(s) du cinema: Le controle de l'univers", 'Jean-Luc Godard')
u'0179214'
>>> get_movie_id(u"Histoire(s) du cinéma: Le contrôle de l'univers", 'Jean-Luc Godard')
u'0179214'
'''
imdbId = {
(u'Le jour se l\xe8ve', u'Marcel Carn\xe9'): '0031514',
(u'Wings', u'Larisa Shepitko'): '0061196',
(u'The Ascent', u'Larisa Shepitko'): '0075404',
(u'Fanny and Alexander', u'Ingmar Bergman'): '0083922',
(u'Torment', u'Alf Sj\xf6berg'): '0036914',
(u'Crisis', u'Ingmar Bergman'): '0038675',
(u'To Joy', u'Ingmar Bergman'): '0043048',
(u'Humain, trop humain', u'Louis Malle'): '0071635',
(u'Place de la R\xe9publique', u'Louis Malle'): '0071999',
(u'God\u2019s Country', u'Louis Malle'): '0091125',
(u'Flunky, Work Hard', u'Mikio Naruse'): '0022036',
(u'The Courtesans of Bombay', u'Richard Robbins') : '0163591',
(u'Je tu il elle', u'Chantal Akerman') : '0071690',
(u'Hotel Monterey', u'Chantal Akerman') : '0068725',
        (u'No Blood Relation', u'Mikio Naruse') : '0023261',
(u'Apart from You', u'Mikio Naruse') : '0024214',
(u'Every-Night Dreams', u'Mikio Naruse') : '0024793',
(u'Street Without End', u'Mikio Naruse') : '0025338',
(u'Sisters of the Gion', u'Kenji Mizoguchi') : '0027672',
(u'Osaka Elegy', u'Kenji Mizoguchi') : '0028021',
(u'Blaise Pascal', u'Roberto Rossellini') : '0066839',
(u'Japanese Girls at the Harbor', u'Hiroshi Shimizu') : '0160535',
(u'The Private Life of Don Juan', u'Alexander Korda') : '0025681',
(u'Last Holiday', u'Henry Cass') : '0042665',
(u'A Colt Is My Passport', u'Takashi Nomura') : '0330536',
(u'Androcles and the Lion', u'Chester Erskine') : '0044355',
(u'Major Barbara', u'Gabriel Pascal') : '0033868',
(u'Come On Children', u'Allan King') : '0269104',
(u'Jimi Plays Monterey & Shake! Otis at Monterey', u'D. A. Pennebaker and Chris Hegedus') : '',
(u'Martha Graham: Dance on Film', u'Nathan Kroll') : '',
(u'Carmen', u'Carlos Saura'): '0085297',
(u'The Story of a Cheat', u'Sacha Guitry'): '0028201',
(u'Weekend', 'Andrew Haigh'): '1714210',
}.get((title, director), None)
if imdbId:
return imdbId
params = {'s':'tt','q': title}
if director:
params['q'] = u'"%s" %s' % (title, director)
if year:
params['q'] = u'"%s (%s)" %s' % (title, year, director)
google_query = "site:imdb.com %s" % params['q']
if not isinstance(params['q'], bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
params['q'] = params['q'].encode('utf-8')
    params = urllib.parse.urlencode(params)
url = "http://akas.imdb.com/find?" + params
#print url
data = read_url(url, timeout=timeout, unicode=True)
#if search results in redirect, get id of current page
r = '<meta property="og:url" content="http://www.imdb.com/title/tt(\d{7})/" />'
results = re.compile(r).findall(data)
if results:
return results[0]
#otherwise get first result
r = '<td valign="top">.*?<a href="/title/tt(\d{7})/"'
results = re.compile(r).findall(data)
if results:
return results[0]
#print (title, director), ": '',"
#print google_query
#results = google.find(google_query, timeout=timeout)
results = duckduckgo.find(google_query, timeout=timeout)
if results:
for r in results[:2]:
imdbId = find_re(r[1], 'title/tt(\d{7})')
if imdbId:
return imdbId
#or nothing
return ''
def get_movie_poster(imdbId):
'''
>>> get_movie_poster('0133093')
'http://ia.media-imdb.com/images/M/MV5BMjEzNjg1NTg2NV5BMl5BanBnXkFtZTYwNjY3MzQ5._V1._SX338_SY475_.jpg'
>>> get_movie_poster('0994352')
'http://ia.media-imdb.com/images/M/MV5BMjA3NzMyMzU1MV5BMl5BanBnXkFtZTcwNjc1ODUwMg@@._V1._SX594_SY755_.jpg'
'''
info = ImdbCombined(imdbId)
if 'posterId' in info:
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url).decode('utf-8', 'ignore')
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
return poster
elif 'series' in info:
return get_movie_poster(info['series'])
return ''
def get_episodes(imdbId, season=None):
episodes = {}
url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
if season:
url += '?season=%d' % season
data = cache.read_url(url)
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
else:
data = cache.read_url(url)
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
if match:
for season in range(1, int(match[0]) + 1):
episodes.update(get_episodes(imdbId, season))
return episodes
def max_votes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = cache.read_url(url)
votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
return votes
def guess(title, director='', timeout=-1):
return get_movie_id(title, director, timeout=timeout)
if __name__ == "__main__":
import json
print(json.dumps(Imdb('0306414'), indent=2))
#print json.dumps(Imdb('0133093'), indent=2)

View file

@ -0,0 +1,300 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import find_re
def get_data(id):
'''
>>> get_data('1991/silence_of_the_lambs')['imdbId']
u'0102926'
>>> get_data('1991/silence_of_the_lambs')['posters'][0]
u'http://www.impawards.com/1991/posters/silence_of_the_lambs_ver1.jpg'
>>> get_data('1991/silence_of_the_lambs')['url']
u'http://www.impawards.com/1991/silence_of_the_lambs_ver1.html'
'''
data = {
'url': get_url(id)
}
html = read_url(data['url'], unicode=True)
data['imdbId'] = find_re(html, 'imdb.com/title/tt(\d{7})')
if not data['imdbId']:
data['imdbId'] = _id_map.get(id, '')
data['title'] = strip_tags(find_re(html, '<p class="name white">(.*?) \(<a href="alpha1.html">'))
data['year'] = find_re(html, '\(<a href="alpha1.html">(.*?)</a>\)')
data['posters'] = []
poster = find_re(html, '<img src="(posters.*?)"')
if poster:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], poster)
data['posters'].append(poster)
results = re.compile('<a href = (%s.*?html)' % id[5:], re.DOTALL).findall(html)
for result in results:
result = result.replace('_xlg.html', '.html')
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
result = find_re(html, '<a href = (\w*?_xlg.html)')
if result:
url = 'http://www.impawards.com/%s/%s' % (data['year'], result)
html = read_url(url, unicode=True)
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img SRC="(.*?)"'))
else:
poster = 'http://www.impawards.com/%s/%s' % (data['year'], find_re(html, '<img src="(posters.*?)"'))
data['posters'].append(poster)
return data
def get_id(url):
split = url.split('/')
year = split[3]
split = split[4][:-5].split('_')
if split[-1] == 'xlg':
split.pop()
if find_re(split[-1], 'ver\d+$'):
split.pop()
id = '%s/%s' % (year, '_'.join(split))
return id
def get_ids(page=None):
ids = []
if page:
html = read_url('http://www.impawards.com/archives/page%s.html' % page, timeout = -1, unicode=True)
results = re.compile('<a href = \.\./(.*?)>', re.DOTALL).findall(html)
for result in results:
url = 'http://impawards.com/%s' % result
ids.append(get_id(url))
return set(ids)
#get all
html = read_url('http://www.impawards.com/archives/latest.html', timeout = 60*60, unicode=True)
pages = int(find_re(html, '<a href= page(.*?).html>')) + 1
for page in range(pages, 0, -1):
for id in get_ids(page):
if not id in ids:
ids.append(id)
return ids
def get_url(id):
url = u"http://www.impawards.com/%s.html" % id
html = read_url(url, unicode=True)
if find_re(html, "No Movie Posters on This Page"):
url = u"http://www.impawards.com/%s_ver1.html" % id
return url
_id_map = {
'1933/forty_second_street': '0024034',
'1933/tarzan_the_fearless': '0024645',
'1935/informer': '0026529',
'1935/thirty_nine_steps': '0026529',
'1935/top_hat': '0027125',
'1938/charlie_chaplin_cavalcade': '0284687',
    '1943/falcon_and_the_co-eds': '0035855',
'1969/angel_angel_down_we_go': '0065602',
'1970/crimson_altar': '0062833',
'1975/man_who_would_be_king_ver1': '0073341',
'1975/picnic_at_hanging_rock_ver1': '0073540',
'1979/electric_horseman_ver1': '0079100',
'1980/caligula_ver1': '0080491',
'1980/hollywood_knights_ver1': '0080881',
'1981/history_of_the_world_part_i': '0082517',
'1981/sea_wolves': '0081470',
'1983/krull_ver1': '0085811',
'1985/warriors_of_the_wind': '0087544',
'1989/friday_the_thirteenth_part_viii_ver1': '0097388',
'1989/high_hopes': '0095302',
'1989/millenium': '0097883',
'1989/story_of_women': '0096336',
'1990/edward_scissorhands_ver1': '0099487',
'1991/freddys_dead_ver1': '0101917',
'1993/robocop_three_ver1': '0107978',
'1993/waynes_world_two_ver1': '0108525',
'1994/above_the_rim_ver1': '0109035',
'1994/helas_pour_moi': '0107175',
'1994/house_of_the_spirits_ver1': '0107151',
'1994/i_dont_want_to_talk_about_it': '0106678',
'1994/in_custody': '0107199',
'1994/ladybird_ladybird': '0110296',
'1994/leon_the_pig_farmer': '0104710',
'1994/love_after_love': '0103710',
'1994/l_six_two_seven': '0104658',
'1994/martin_lawrence_you_so_crazy_ver1': '0111804',
'1994/savage_nights': '0105032',
'1994/sex_drugs_and_democracy': '0111135',
'1995/bye_bye_love': '0112606',
'1995/cold_comfort_farm': '0112701',
'1995/gumby_the_movie': '0113234',
'1995/les_miserables': '0113828',
'1995/mystery_of_rampo': '0110943',
'1995/pharaohs_army': '0114122',
'1995/pure_formality': '0110917',
'1995/quick_and_the_dead_ver1': '0114214',
'1995/reflections_in_the_dark': '0110956',
'1995/safe_ver1': '0114323',
'1995/search_and_destroy': '0114371',
'1995/secret_of_roan_inish_ver1': '0111112',
'1995/underneath': '0114788',
'1996/ghost_in_the_shell': '0113568',
'1996/hate': '0113247',
'1996/horseman_on_the_roof': '0113362',
'1996/kids_in_the_hall_brain_candy': '0116768',
'1996/maybe_maybe_not': '0109255',
'1996/prisoner_of_the_mountains': '0116754',
'1997/fifth_element_ver1': '0119116',
'1997/fools_rush_in_ver1': '0119141',
'1997/gi_jane_ver1': '0119173',
'1997/happy_together_ver1': '0118845',
'1997/lilies': '0116882',
'1997/mouth_to_mouth': '0112546',
'1997/mr_nice_guy': '0117786',
'1997/nenette_and_boni': '0117221',
'1997/paperback_romance': '0110405',
'1997/second_jungle_book': '0120087',
'1997/single_girl': '0113057',
'1997/super_speedway': '0120245',
'1997/temptress_moon': '0116295',
'1998/alarmist': '0119534',
'1998/barneys_great_adventure_the_movie': '0120598',
'1998/bulworth_ver1': '0118798',
'1998/celebration': '0154420',
'1998/east_palace_west_palace': '0119007',
'1998/hurricane_streets': '0119338',
'1998/i_married_a_strange_person': '0119346',
'1998/inheritors': '0141824',
'1998/killing_time': '0140312',
'1998/live_flesh': '0118819',
'1998/music_from_another_room': '0119734',
'1998/post_coitum_ver1': '0119923',
'1998/steam_the_turkish_bath': '0119248',
'1998/velocity_of_gary': '0120878',
'1999/after_life': '0165078',
'1999/emperor_and_the_assassin': '0162866',
'1999/fantasia_two_thousand': '0120910',
'1999/get_bruce': '0184510',
'1999/god_said_ha': '0119207',
'1999/jawbreaker': '0155776',
'1999/jeanne_and_the_perfect_guy': '0123923',
'1999/king_and_i': '0160429',
'1999/lovers_of_the_arctic_circle': '0133363',
'1999/plunkett_and_macleane': '0134033',
'1999/pokemon_the_first_movie': '0190641',
'1999/school_of_flesh': '0157208',
'1999/splendor': '0127296',
'1999/stranger_in_the_kingdom': '0126680',
'1999/train_of_life': '0170705',
'1999/twice_upon_a_yesterday': '0138590',
'1999/whiteboys': '0178988',
'1999/wildfire': '0194544',
'1999/windhorse': '0169388',
'2000/claim': '0218378',
'2000/color_of_paradise': '0191043',
'2000/criminal_lovers': '0205735',
'2000/everlasting_piece': '0218182',
'2000/girl_on_the_bridge_ver1': '0144201',
'2000/godzilla_two_thousand': '0188640',
'2000/goya_in_bordeaux': '0210717',
'2000/mad_about_mambo': '0156757',
'2000/picking_up_the_pieces': '0192455',
'2000/pokemon_the_movie_2000': '0257001',
'2000/seven_days_to_live': '0221928',
'2000/south_of_heaven_west_of_hell': '0179473',
'2000/suzhou_river': '0234837',
'2000/time_for_drunken_horses': '0259072',
'2000/venus_beauty_institute': '0174330',
'2001/circle': '0368646',
'2001/devils_backbone': '0256009',
'2001/kill_me_later': '0243595',
'2001/king_is_dancing': '0244173',
'2001/learning_curve': '0219126',
'2001/marco_polo__return_to_xanadu_ver1': '0296074',
'2001/me_you_them': '0244504',
'2001/our_lady_of_the_assassins': '0250809',
'2001/pinero': '0261066',
'2001/pokemon_three_the_movie_ver1': '0266860',
'2001/scratch': '0143861',
'2001/vampire_hunter_d_bloodlust_ver1': '0216651',
'2002/el_bosque_animado': '0310790',
'2002/fifty_first_state': '0227984',
'2002/les_destinees': '0216689',
'2002/sons_room': '0208990',
'2003/open_hearts': '0315543',
'2003/tulse_luper_suitcases': '0307596',
'2003/valentin': '0296915',
'2004/if_only_ver1': '0332136',
'2004/wondrous_oblivion': '0334725',
'2005/wu_ji': '0417976',
'2006/golden_door': '0465188',
'2006/kin': '1091189',
'2007/revenge_of_the_nerds': '0088000',
'2008/bad_batch': '1605644',
'2008/mercedes': '1368083',
'2008/spirit': '0831887',
'2009/dead_air': '0993841',
'2009/edge_of_love': '0819714',
    '2009/fuel': '1072437',
'2009/one_good_man': '1239357',
'2009/st_trinians': '1210106',
'2009/surveillance': '0409345',
'2009/taken': '0936501',
'2009/vaml': '1610453',
'2010/adopting_haiti': '1764164',
'2010/afterlife': '0838247',
'2010/agora': '1186830',
'2010/athlete': '1356996',
'2010/beneath_the_blue': '1222698',
'2010/bitch_slap': '1212974',
'2010/black_waters_of_echos_pond': '0960066',
'2010/case_thirty_nine': '0795351',
'2010/finite_and_infinite_games': '1772268',
'2010/hole': '1085779',
'2010/jolene': '0867334',
'2010/lake_mungo': '0816556',
'2010/last_day_of_summer': '1242544',
'2010/leaves_of_grass': '1151359',
'2010/life_of_lemon': '1466057',
'2010/man_in_the_maze': '1721692',
'2010/mr_immortality_the_life_and_times_of_twista': '1711017',
'2010/paper_man': '0437405',
'2010/perfect_game': '0473102',
'2010/red_baron': '0365675',
'2010/satin': '0433397',
'2010/shutter_island': '1130884',
'2010/strange_powers': '1534075',
'2010/suicidegirls_must_die': '1584733',
'2010/veronika_decides_to_die': '1068678',
'2010/witchblade': '0494292',
'2010/youth_in_revolt': '0403702',
'2011/beastly': '1152398',
'2011/burning_palms': '1283887',
'2011/cabin_in_the_woods': '1259521',
'2011/conan': '0816462',
'2011/courageous': '1630036',
'2011/cruces_divided_two': '1698645',
'2011/green_with_envy': '1204342',
'2011/happythankyoumoreplease': '1481572',
'2011/homework': '1645080',
'2011/i_got_next': '1915570',
'2011/lebanon_pa': '1290082',
'2011/money_pet': '1965198',
'2011/my_suicide': '0492896',
'2011/priest': '0822847',
'2011/prowl': '1559033',
'2011/red_sonja': '0800175',
'2011/season_of_the_witch': '0479997',
'2011/stay_cool': '1235807',
'2011/sympathy_for_delicious': '1270277',
'2011/trust': '1529572',
'2011/undefeated': '1961604',
'2011/vanishing_on_seventh_street': '1452628',
'2011/where_is_robert_fisher': '2042712',
'2011/yellowbrickroad': '1398428',
'2012/haywire': '1506999',
'2012/last_call_at_the_oasis': '2043900',
}
if __name__ == '__main__':
ids = get_ids()
    print(sorted(ids), len(ids))

View file

@ -0,0 +1,187 @@
# vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8
import re
from six.moves import urllib
from ox.cache import read_url
from ox.html import decode_html, strip_tags
from ox.text import find_re
from ox.text import find_string
# to sniff itunes traffic, use something like
# sudo tcpdump -i en1 -Avs 8192 host appleglobal.112.2o7.net
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=music&songTerm=&genreIndex=1&flavor=0&mediaType=2&composerTerm=&allArtistNames=Arcadia&ringtone=0&searchButton=submit
# http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?media=movie&movieTerm=The%20Matrix&descriptionTerm=&ratingIndex=1&mediaType=3&directorProducerName=Andy%20Wachowski&flavor=0&releaseYearTerm=1999&closedCaption=0&actorTerm=&searchButton=submit
ITUNES_HEADERS = {
'X-Apple-Tz': '0',
'X-Apple-Storefront': '143441-1',
'User-Agent': 'iTunes/7.6.2 (Macintosh; U; Intel Mac OS X 10.5.2)',
'Accept-Language': 'en-us, en;q=0.50',
'Accept-Encoding': 'gzip',
'Connection': 'close',
}
def compose_url(request, parameters):
if request == 'advancedSearch':
url = 'http://ax.phobos.apple.com.edgesuite.net/WebObjects/MZSearch.woa/wa/advancedSearch?'
if parameters['media'] == 'music':
            url += urllib.parse.urlencode({
'albumTerm': parameters['title'],
'allArtistNames': parameters['artist'],
'composerTerm': '',
'flavor': 0,
'genreIndex': 1,
'media': 'music',
'mediaType': 2,
'ringtone': 0,
'searchButton': 'submit',
'songTerm': ''
})
elif parameters['media'] == 'movie':
            url += urllib.parse.urlencode({
'actorTerm': '',
'closedCaption': 0,
'descriptionTerm': '',
'directorProducerName': parameters['director'],
'flavor': 0,
'media': 'movie',
'mediaType': 3,
'movieTerm': parameters['title'],
'ratingIndex': 1,
'releaseYearTerm': '',
'searchButton': 'submit'
})
elif request == 'viewAlbum':
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewAlbum?id=%s' % parameters['id']
elif request == 'viewMovie':
url = 'http://phobos.apple.com/WebObjects/MZStore.woa/wa/viewMovie?id=%s&prvw=1' % parameters['id']
return url
def parse_xml_dict(xml):
values = {}
strings = xml.split('<key>')
for string in strings:
if string.find('</key>') != -1:
key = find_re(string, '(.*?)</key>')
type = find_re(string, '</key><(.*?)>')
if type == 'true/':
value = True
else:
value = find_re(string, '<%s>(.*?)</%s>' % (type, type))
if type == 'integer':
value = int(value)
elif type == 'string':
value = decode_html(value)
values[key] = value
return values
def parse_cast(xml, title):
list = []
try:
strings = find_re(xml, '<SetFontStyle normalStyle="textColor">%s(.*?)</VBoxView>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append(find_re(string, '<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
return list
except:
return list
def parse_movies(xml, title):
list = []
try:
strings = find_re(xml, '<SetFontStyle normalStyle="outlineTitleFontStyle"><b>%s(.*?)</Test>' % title[:-1].upper()).split('</GotoURL>')
strings.pop()
for string in strings:
list.append({
'id': find_re(string, 'viewMovie\?id=(.*?)&'),
'title': find_re(string, '<SetFontStyle normalStyle="outlineTextFontStyle"><b>(.*?)</b></SetFontStyle>')
})
return list
except:
return list
class ItunesAlbum:
def __init__(self, id = '', title = '', artist = ''):
self.id = id
self.title = title
self.artist = artist
if not id:
self.id = self.get_id()
def get_id(self):
url = compose_url('advancedSearch', {'media': 'music', 'title': self.title, 'artist': self.artist})
xml = read_url(url, headers = ITUNES_HEADERS)
id = find_re(xml, 'viewAlbum\?id=(.*?)&')
return id
def get_data(self):
data = {'id': self.id}
url = compose_url('viewAlbum', {'id': self.id})
xml = read_url(url, None, ITUNES_HEADERS)
data['albumName'] = find_re(xml, '<B>(.*?)</B>')
data['artistName'] = find_re(xml, '<b>(.*?)</b>')
data['coverUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
data['genre'] = find_re(xml, 'Genre:(.*?)<')
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
data['review'] = strip_tags(find_re(xml, 'REVIEW</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['tracks'] = []
strings = find_re(xml, '<key>items</key>.*?<dict>(.*?)$').split('<dict>')
for string in strings:
data['tracks'].append(parse_xml_dict(string))
data['type'] = find_re(xml, '<key>listType</key><string>(.*?)<')
return data
class ItunesMovie:
def __init__(self, id = '', title = '', director = ''):
self.id = id
self.title = title
self.director = director
if not id:
self.id = self.get_id()
def get_id(self):
url = compose_url('advancedSearch', {'media': 'movie', 'title': self.title, 'director': self.director})
xml = read_url(url, headers = ITUNES_HEADERS)
id = find_re(xml, 'viewMovie\?id=(.*?)&')
return id
def get_data(self):
data = {'id': self.id}
url = compose_url('viewMovie', {'id': self.id})
xml = read_url(url, None, ITUNES_HEADERS)
data['actors'] = parse_cast(xml, 'actors')
string = find_re(xml, 'Average Rating:(.*?)</HBoxView>')
data['averageRating'] = string.count('rating_star_000033.png') + string.count('&#189;') * 0.5
data['directors'] = parse_cast(xml, 'directors')
data['format'] = find_re(xml, 'Format:(.*?)<')
data['genre'] = decode_html(find_re(xml, 'Genre:(.*?)<'))
data['plotSummary'] = decode_html(find_re(xml, 'PLOT SUMMARY</b>.*?<SetFontStyle normalStyle="textColor">(.*?)</SetFontStyle>'))
data['posterUrl'] = find_re(xml, 'reflection="." url="(.*?)"')
data['producers'] = parse_cast(xml, 'producers')
data['rated'] = find_re(xml, 'Rated(.*?)<')
data['relatedMovies'] = parse_movies(xml, 'related movies')
data['releaseDate'] = find_re(xml, 'Released(.*?)<')
data['runTime'] = find_re(xml, 'Run Time:(.*?)<')
data['screenwriters'] = parse_cast(xml, 'screenwriters')
data['soundtrackId'] = find_re(xml, 'viewAlbum\?id=(.*?)&')
data['trailerUrl'] = find_re(xml, 'autoplay="." url="(.*?)"')
return data
if __name__ == '__main__':
from ox.utils import json
data = ItunesAlbum(title = 'So Red the Rose', artist = 'Arcadia').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))
    data = ItunesMovie(title='The Matrix', director='Wachowski').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))
    for v in data['relatedMovies']:
        data = ItunesMovie(id=v['id']).get_data()
        print(json.dumps(data, sort_keys=True, indent=4))
    data = ItunesMovie(id='272960052').get_data()
    print(json.dumps(data, sort_keys=True, indent=4))

View file

@ -0,0 +1,42 @@
from ox.cache import read_url
from ox import find_re, strip_tags
import re
base = 'http://www.lookupbyisbn.com'
def get_data(isbn):
r = {}
url = '%s/Search/Book/%s/1' % (base, isbn)
data = read_url(url).decode('utf-8')
m = re.compile('href="(/Lookup/Book/[^"]+?)"').findall(data)
if m:
ids = m[0].split('/')
r['isbn'] = ids[-2]
r['asin'] = ids[-3]
url = '%s%s' % (base, m[0])
data = read_url(url).decode('utf-8')
r["title"] = find_re(data, "<h2>(.*?)</h2>")
keys = {
'author': 'Author(s)',
'publisher': 'Publisher',
'date': 'Publication date',
'edition': 'Edition',
'binding': 'Binding',
'volume': 'Volume(s)',
'pages': 'Pages',
}
for key in keys:
r[key] = find_re(data, '<span class="title">%s:</span>(.*?)</li>'% re.escape(keys[key]))
if r[key] == '--':
r[key] = ''
if key == 'pages' and r[key]:
r[key] = int(r[key])
desc = find_re(data, '<h2>Description:<\/h2>(.*?)<div ')
desc = desc.replace('<br /><br />', ' ').replace('<br /> ', ' ').replace('<br />', ' ')
r['description'] = strip_tags(desc).strip()
if r['description'] == u'Description of this item is not available at this time.':
r['description'] = ''
r['cover'] = find_re(data, '<img src="(.*?)" alt="Book cover').replace('._SL160_', '')
return r
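# Illustrative usage sketch (not part of the original module). The isbn below
# is a placeholder; replace it with a real ISBN-10 or ISBN-13.
if __name__ == '__main__':
    import json
    example_isbn = '0123456789'  # placeholder value
    print(json.dumps(get_data(example_isbn), indent=2))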

View file

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox.cache import read_url
from ox.html import decode_html
from ox.text import find_re
def get_lyrics(title, artist):
    html = read_url('http://lyricsfly.com/api/', unicode=True)
key = find_re(html, '<font color=green><b>(.*?)</b></font>')
url = 'http://lyricsfly.com/api/api.php?i=%s&a=%s&t=%s' % (key, artist, title)
    xml = read_url(url, unicode=True)
lyrics = find_re(xml, '<tx>(.*?)\[br\] Lyrics [a-z]* by lyricsfly.com')
lyrics = lyrics.replace('\n', '').replace('\r', '')
lyrics = lyrics.replace('[br]', '\n').strip()
    lyrics = lyrics.replace('\n\n\n', '\n\n')
lyrics = decode_html(lyrics.replace('&amp;', '&'))
return lyrics
if __name__ == '__main__':
    print(get_lyrics('Election Day', 'Arcadia'))

View file

@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six.moves.urllib.parse import quote
from lxml.html import document_fromstring
from ox.cache import read_url
from ox import find_re, strip_tags
def get_url(id=None, imdb=None):
if imdb:
url = "http://www.imdb.com/title/tt%s/criticreviews" % imdb
        data = read_url(url, unicode=True)
metacritic_url = find_re(data, '"(http://www.metacritic.com/movie/.*?)"')
return metacritic_url or None
return 'http://www.metacritic.com/movie/%s' % id
def get_id(url):
return url.split('/')[-1]
def get_show_url(title):
title = quote(title)
url = "http://www.metacritic.com/search/process?ty=6&ts=%s&tfs=tvshow_title&x=0&y=0&sb=0&release_date_s=&release_date_e=&metascore_s=&metascore_e=" % title
    data = read_url(url, unicode=True)
return find_re(data, '(http://www.metacritic.com/tv/shows/.*?)\?')
def get_data(url):
data = read_url(url, unicode=True)
doc = document_fromstring(data)
    score = [s for s in doc.xpath('//span[@class="score_value"]')
        if s.attrib.get('property') == 'v:average']
if score:
score = int(score[0].text)
else:
score = -1
authors = [a.text
for a in doc.xpath('//div[@class="review_content"]//div[@class="author"]//a')]
sources = [d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="source"]/a')]
reviews = [d.text
for d in doc.xpath('//div[@class="review_content"]//div[@class="review_body"]')]
scores = [int(d.text.strip())
for d in doc.xpath('//div[@class="review_content"]//div[contains(@class, "critscore")]')]
urls = [a.attrib['href']
for a in doc.xpath('//div[@class="review_content"]//a[contains(@class, "external")]')]
metacritics = []
for i in range(len(authors)):
metacritics.append({
'critic': authors[i],
'url': urls[i],
'source': sources[i],
'quote': strip_tags(reviews[i]).strip(),
'score': scores[i],
})
return {
'critics': metacritics,
'id': get_id(url),
'score': score,
'url': url,
}
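# Illustrative usage sketch (not part of the original module): resolve the
# metacritic url for an imdb id (an example id is used here) and print the
# metascore and the number of critic reviews found.
if __name__ == '__main__':
    url = get_url(imdb='0133093')
    if url:
        data = get_data(url)
        print(data['score'], len(data['critics']))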

View file

@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from six.moves.urllib.parse import quote
from ox.cache import read_url
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, int_value, normalize_newlines
from ox.normalize import normalize_imdbid
import ox
from .torrent import Torrent
def _parse_results_page(data, max_results=10):
results=[]
regexp = '''<tr><td>(.*?)</td><td>(.*?)<a href="/tor/(.*?)">(.*?)</a>.*?</td>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentDate = row[0]
torrentExtra = row[1]
torrentId = row[2]
torrentTitle = decode_html(row[3]).strip()
torrentLink = "http://www.mininova.org/tor/" + torrentId
privateTracker = 'priv.gif' in torrentExtra
if not privateTracker:
results.append((torrentTitle, torrentLink, ''))
return results
def find_movie(query=None, imdb=None, max_results=10):
'''search for torrents on mininova
'''
if imdb:
url = "http://www.mininova.org/imdb/?imdb=%s" % normalize_imdbid(imdb)
else:
url = "http://www.mininova.org/search/%s/seeds" % quote(query)
data = read_url(url, unicode=True)
return _parse_results_page(data, max_results)
def get_id(mininovaId):
mininovaId = str(mininovaId)
d = find_re(mininovaId, "/(\d+)")
if d:
return d
mininovaId = mininovaId.split('/')
if len(mininovaId) == 1:
return mininovaId[0]
else:
return mininovaId[-1]
def exists(mininovaId):
mininovaId = get_id(mininovaId)
data = ox.net.read_url("http://www.mininova.org/tor/%s" % mininovaId)
if not data or 'Torrent not found...' in data:
return False
if 'tracker</a> of this torrent requires registration.' in data:
return False
return True
def get_data(mininovaId):
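# metadata is spread over the /tor and /det pages, so both are fetched and
# parsed together; the .torrent file itself is read for the torrent_info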
_key_map = {
'by': u'uploader',
}
mininovaId = get_id(mininovaId)
torrent = dict()
torrent[u'id'] = mininovaId
torrent[u'domain'] = 'mininova.org'
torrent[u'comment_link'] = "http://www.mininova.org/tor/%s" % mininovaId
torrent[u'torrent_link'] = "http://www.mininova.org/get/%s" % mininovaId
torrent[u'details_link'] = "http://www.mininova.org/det/%s" % mininovaId
data = read_url(torrent['comment_link'], unicode=True) + read_url(torrent['details_link'], unicode=True)
if '<h1>Torrent not found...</h1>' in data:
return None
for d in re.compile('<p>.<strong>(.*?):</strong>(.*?)</p>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'title'] = find_re(data, '<title>(.*?):.*?</title>')
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
torrent[u'description'] = find_re(data, '<div id="description">(.*?)</div>')
if torrent['description']:
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = get_torrent_info(t)
return torrent
class Mininova(Torrent):
'''
>>> Mininova('123')
{}
>>> Mininova('1072195')['infohash']
'72dfa59d2338e4a48c78cec9de25964cddb64104'
'''
def __init__(self, mininovaId):
self.data = get_data(mininovaId)
if not self.data:
return
Torrent.__init__(self)
ratio = self.data['share ratio'].split(',')
self['seeder'] = -1
self['leecher'] = -1
if len(ratio) == 2:
val = int_value(ratio[0].replace(',','').strip())
if val:
self['seeder'] = int(val)
val = int_value(ratio[1].replace(',','').strip())
if val:
self['leecher'] = int(val)
val = int_value(self.data['downloads'].replace(',','').strip())
if val:
self['downloaded'] = int(val)
else:
self['downloaded'] = -1
published = self.data['added on']
published = published.split(' +')[0]
self['published'] = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S")

View file

@ -0,0 +1,44 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox.cache import read_url
from ox import find_re
def get_data(id):
'''
>>> get_data('0060304')['posters'][0]
u'http://www.movieposterdb.com/posters/06_03/1967/0060304/l_99688_0060304_639fdd1e.jpg'
>>> get_data('0123456')['posters']
[]
'''
data = {
"url": get_url(id)
}
data["posters"] = get_posters(data["url"])
return data
def get_id(url):
return url.split("/")[-2]
def get_posters(url, group=True, timeout=-1):
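# poster pages may be organized into groups; with group=True each group
# page is followed recursively and its individual posters collected too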
posters = []
html = read_url(url, timeout=timeout, unicode=True)
if url in html:
if group:
results = re.compile('<a href="(http://www.movieposterdb.com/group/.+?)\??">', re.DOTALL).findall(html)
for result in results:
posters += get_posters(result, False)
results = re.compile('<a href="(http://www.movieposterdb.com/poster/.+?)">', re.DOTALL).findall(html)
for result in results:
html = read_url(result, timeout=timeout, unicode=True)
posters.append(find_re(html, '"(http://www.movieposterdb.com/posters/.+?\.jpg)"'))
return posters
def get_url(id):
return "http://www.movieposterdb.com/movie/%s/" % id
if __name__ == '__main__':
print(get_data('0060304'))
print(get_data('0133093'))

View file

@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import feedparser
from ox.cache import read_url
from ox import find_re, strip_tags
from ox.iso import langCode2To3, langTo3Code
def find_subtitles(imdb, parts = 1, language = "eng"):
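# build an RSS search url filtered by language, number of parts and srt
# format, and return the opensubtitles id of the first matching entry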
if len(language) == 2:
language = langCode2To3(language)
elif len(language) != 3:
language = langTo3Code(language)
url = "http://www.opensubtitles.org/en/search/"
if language:
url += "sublanguageid-%s/" % language
url += "subsumcd-%s/subformat-srt/imdbid-%s/rss_2_00" % (parts, imdb)
data = read_url(url)
if "title>opensubtitles.com - search results</title" in data:
fd = feedparser.parse(data)
opensubtitleId = None
if fd.entries:
link = fd.entries[0]['links'][0]['href']
opensubtitleId = re.compile('subtitles/(.*?)/').findall(link)
if opensubtitleId:
opensubtitleId = opensubtitleId[0]
else:
opensubtitleId = find_re(data, '/en/subtitles/(.*?)/')
return opensubtitleId
def download_subtitle(opensubtitle_id):
srts = {}
data = read_url('http://www.opensubtitles.org/en/subtitles/%s' % opensubtitle_id)
reg_exp = 'href="(/en/download/file/.*?)">(.*?)</a>'
for f in re.compile(reg_exp, re.DOTALL).findall(data):
name = strip_tags(f[1]).split('\n')[0]
url = "http://www.opensubtitles.org%s" % f[0]
srts[name] = read_url(url, unicode=True)
return srts

View file

@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import ox.cache
def get_poster_url(id):
url = "http://0xdb.org/%s/poster.0xdb.jpg" % id
if ox.cache.exists(url):
return url
return ''

View file

@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
from ox.net import read_url
def get_poster_url(id):
url = 'http://piratecinema.org/posters/'
html = read_url(url, unicode=True)
results = re.compile('src="(.+)" title=".+\((\d{7})\)"').findall(html)
for result in results:
if result[1] == id:
return url + result[0]
return ''
if __name__ == '__main__':
print(get_poster_url('0749451'))

View file

@ -0,0 +1,54 @@
# -*- coding: UTF-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from ox.cache import read_url
from ox import find_re, strip_tags
def get_url(id=None, imdb=None):
#this would also work but does not cache:
'''
from six.moves.urllib.request import urlopen
u = urlopen(url)
return u.url
'''
if imdb:
url = "http://www.rottentomatoes.com/alias?type=imdbid&s=%s" % imdb
data = read_url(url)
if "movie_title" in data:
movies = re.compile('(/m/.*?/)').findall(data)
if movies:
return "http://www.rottentomatoes.com" + movies[0]
return None
def get_og(data, key):
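# read an Open Graph <meta property="og:..."> value from the page source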
return find_re(data, '<meta property="og:%s".*?content="(.*?)"' % key)
def get_data(url):
data = read_url(url)
r = {}
r['title'] = find_re(data, '<h1 class="movie_title">(.*?)</h1>')
if '(' in r['title']:
r['year'] = find_re(r['title'], '\((\d*?)\)')
r['title'] = strip_tags(re.sub('\((\d*?)\)', '', r['title'])).strip()
r['summary'] = strip_tags(find_re(data, '<p id="movieSynopsis" class="movie_synopsis" itemprop="description">(.*?)</p>')).strip()
r['summary'] = r['summary'].replace('\t', ' ').replace('\n', ' ').replace('  ', ' ').replace('  ', ' ')
if not r['summary']:
r['summary'] = get_og(data, 'description')
meter = re.compile('<span id="all-critics-meter" class="meter(.*?)">(.*?)</span>').findall(data)
meter = [m for m in meter if m[1].isdigit()]
if meter:
r['tomatometer'] = meter[0][1]
r['rating'] = find_re(data, 'Average Rating: <span>([\d.]+)/10</span>')
r['user_score'] = find_re(data, '<span class="meter popcorn numeric ">(\d+)</span>')
r['user_rating'] = find_re(data, 'Average Rating: ([\d.]+)/5')
poster = get_og(data, 'image')
if poster and not 'poster_default.gif' in poster:
r['posters'] = [poster]
for key in list(r.keys()):
if not r[key]:
del r[key]
return r

View file

@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six import string_types
from ..cache import read_url
from .. import decode_html
from ..utils import datetime
def cleanup(key, data, data_type):
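# normalize raw regex matches: decode html entities, flatten single-element
# lists and coerce the result to the declared data_type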
if data:
if isinstance(data[0], string_types):
#FIXME: some types need strip_tags
#data = [strip_tags(decode_html(p)).strip() for p in data]
data = [decode_html(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1 and not isinstance(data, string_types):
data = data[0]
if data_type == 'list' and isinstance(data, string_types):
data = [data, ]
elif data_type != 'list':
data = ''
return data
class SiteParser(dict):
baseUrl = ''
regex = {}
def get_url(self, page):
return "%s%s" % (self.baseUrl, page)
def read_url(self, url, timeout):
if not url in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url]
def __init__(self, timeout=-1):
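# for every key in self.regex, fetch the configured page (cached per
# instance), apply the regex / callable(s), then convert to the given type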
self._cache = {}
for key in self.regex:
url = self.get_url(self.regex[key]['page'])
data = self.read_url(url, timeout)
if isinstance(self.regex[key]['re'], string_types):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type'])
elif callable(self.regex[key]['re']):
data = self.regex[key]['re'](data)
else:
for r in self.regex[key]['re']:
if callable(r):
f = r
else:
f = re.compile(r, re.DOTALL).findall
if isinstance(data, string_types):
data = f(data)
else:
data = [f(d) for d in data]
data = cleanup(key, data, self.regex[key]['type'])
def apply_f(f, data):
if data and isinstance(data[0], list):
data = [f(d) for d in data]
else:
data = f(data)
return data
if self.regex[key]['type'] == 'float' and data:
data = apply_f(float, data)
elif self.regex[key]['type'] == 'int' and data:
data = apply_f(int, data)
elif self.regex[key]['type'] == 'date':
parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
data = apply_f(parse_date, data)
if data:
self[key] = data

View file

@ -0,0 +1,287 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
from datetime import datetime
import re
import time
import ox.cache
from ox.html import decode_html, strip_tags
import ox.net
def get_news(year, month, day):
sections = [
'politik', 'wirtschaft', 'panorama', 'sport', 'kultur', 'netzwelt',
'wissenschaft', 'unispiegel', 'schulspiegel', 'reise', 'auto'
]
dt = datetime(year, month, day)
day = int(dt.strftime('%j'))
date = dt.strftime('%d.%m.%Y')
news = []
for section in sections:
url = 'http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (section, year, day)
if date == time.strftime('%d.%m.%Y', time.localtime()):
html = ox.net.read_url(url)
else:
html = ox.cache.read_url(url)
for item in re.compile('<div class="spTeaserCenterpage(.*?)</p>', re.DOTALL).findall(html):
dateString = strip_tags(re.compile('<div class="spDateTime">(.*?)</div>', re.DOTALL).findall(item)[0]).strip()
try:
description = format_string(re.compile('<p>(.*?)<', re.DOTALL).findall(item)[0])
except:
description = ''
try:
imageUrl = re.compile('<img src="(.*?)"').findall(item)[0]
except:
imageUrl = ''
try:
title = format_string(re.compile('alt=[\'|"](.*?)[\'|"] title=', re.DOTALL).findall(item)[0]).replace(' : ', ': ').replace('::', ':')
except:
title = ''
if dateString[:10] == date and description and imageUrl and title.find(': ') != -1:
new = {}
if len(dateString) == 10:
new['date'] = '%s-%s-%s 00:00' % (dateString[6:10], dateString[3:5], dateString[:2])
else:
new['date'] = '%s-%s-%s %s:%s' % (dateString[6:10], dateString[3:5], dateString[:2], dateString[12:14], dateString[15:17])
# fix decode_html
# new['description'] = format_string(decode_html(description))
new['description'] = format_string(description)
new['imageUrl'] = imageUrl
new['section'] = format_section(section)
new['title'] = format_string(title)
new['title1'] = new['title'].replace('\xdf', '\xdf\xdf')[:len(format_string(re.compile('<h4>(.*?)</h4>', re.DOTALL).findall(item)[0]))].replace('\xdf\xdf', '\xdf')
if new['title1'][-1:] == ':':
new['title1'] = new['title1'][0:-1]
new['title2'] = new['title'][len(new['title1']) + 2:]
new['url'] = re.compile('<a href="(.*?)"').findall(item)[0]
if new['url'][:1] == '/':
new['url'] = 'http://www.spiegel.de' + new['url']
news.append(new)
# print '%s, %s' % (new['section'], dateString)
'''
elif dateString[:10] == date and not description:
print dateString + ' - no description'
elif dateString[:10] == date and not imageUrl:
print dateString + ' - no image'
'''
return news
def split_title(title):
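# split a 'Kicker: Headline' style title into its two parts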
title1 = re.compile('(.*?): ').findall(title)[0]
title2 = re.compile(': (.*?)$').findall(title)[0]
return [title1, title2]
def format_string(string):
string = string.replace('<span class="spOptiBreak"> </span>', '')
string = string.replace('\n', ' ').replace('  ', ' ').strip()
string = string.replace('&amp;', '&').replace('&apos;', '\'').replace('&quot;', '"')
return string
def format_section(string):
return string[:1].upper() + string[1:].replace('spiegel', 'SPIEGEL')
def format_subsection(string):
# SPIEGEL, SPIEGEL special
subsection = {
'abi': 'Abi - und dann?',
'formel1': 'Formel 1',
'jobundberuf': 'Job & Beruf',
'leben': 'Leben U21',
'mensch': 'Mensch & Technik',
'sonst': '',
'staedte': u'St\xe4dte',
'ussports': 'US-Sports',
'wunderbar': 'wunderBAR'
}
if string in subsection:
return subsection[string].replace(u'\xe4', 'ae')
return string[:1].upper() + string[1:]
def get_issue(year, week):
coverUrl = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d0001-312.jpg' % (year, week, year, week)
if not ox.net.exists(coverUrl):
return None
url = 'http://service.spiegel.de/digas/servlet/epaper?Q=SP&JG=%d&AG=%d&SE=1&AN=INHALT' % (year, week)
contents = []
data = ox.cache.read_url(url)
items = re.compile('<a.?href="http://service.spiegel.de/digas/servlet/epaper\?Q=SP&JG=".?>(.*?)</a>').findall(data)
for item in items:
item = item[1]
page = int(re.compile('&amp;SE=(.*?)"').findall(item)[0])
title = strip_tags(item).strip()
contents.append({'title': title, 'page': page})
pageUrl = {}
pages = page + 2
for page in range(1, pages + 10):
url = 'http://www.spiegel.de/static/epaper/SP/%d/%d/ROSPANZ%d%03d%04d-205.jpg' % (year, week, year, week, page)
if ox.cache.exists(url):
pageUrl[page] = url
else:
pageUrl[page] = ''
return {'pages': pages, 'contents': contents, 'coverUrl': coverUrl, 'pageUrl': pageUrl}
def archive_issues():
'''
this is just an example of an archiving application
'''
p = {}
import os
from ox.utils import json
import time
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Der Spiegel'
localtime = time.localtime()
year = int(time.strftime('%Y', localtime))
week = int(time.strftime('%W', localtime))
for y in range(year, 1993, -1):
if y == year:
wMax = week + 1
else:
wMax = 53
for w in range(wMax, 0, -1):
print('get_issue(%d, %d)' % (y, w))
issue = get_issue(y, w)
if issue:
dirname = '%s/%d/%02d' % (archivePath, y, w)
if not os.path.exists(dirname):
os.makedirs(dirname)
filename = '%s/Der Spiegel %d %02d.json' % (dirname, y, w)
if not os.path.exists(filename):
data = json.dumps(issue, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
filename = '%s/Der Spiegel %d %02d.txt' % (dirname, y, w)
if not os.path.exists(filename):
data = []
for item in issue['contents']:
data.append('%3d %s' % (item['page'], item['title']))
data = '\n'.join(data)
f = open(filename, 'w')
f.write(data)
f.close()
filename = '%s/Der Spiegel %d %02d.jpg' % (dirname, y, w)
if not os.path.exists(filename):
data = ox.cache.read_url(issue['coverUrl'])
f = open(filename, 'wb')
f.write(data)
f.close()
for page in issue['pageUrl']:
url = issue['pageUrl'][page]
if url:
filename = '%s/Der Spiegel %d %02d %03d.jpg' % (dirname, y, w, page)
if not os.path.exists(filename):
data = ox.cache.read_url(url)
f = open(filename, 'wb')
f.write(data)
f.close()
if not p:
p = {'num': 1, 'sum': issue['pages'], 'min': issue['pages'], 'max': issue['pages']}
else:
p['num'] += 1
p['sum'] += issue['pages']
if issue['pages'] < p['min']:
p['min'] = issue['pages']
if issue['pages'] > p['max']:
p['max'] = issue['pages']
print(p['min'], p['sum'] // p['num'], p['max'])
def archive_news():
'''
this is just an example of an archiving application
'''
import os
from ox.utils import json
import time
count = {}
colon = []
archivePath = '/Volumes/Rolux Home/Desktop/Data/spiegel.de/Spiegel Online'
days = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
localtime = time.localtime()
year = int(time.strftime('%Y', localtime))
month = int(time.strftime('%m', localtime))
day = int(time.strftime('%d', localtime)) - 1
for y in range(year, 1999, -1):
if y == year:
mMax = month
else:
mMax = 12
for m in range(mMax, 0, -1):
if y == year and m == month:
dMax = day
elif m == 2 and y % 4 == 0 and (y % 100 != 0 or y % 400 == 0):  # February in a leap year
dMax = days[m] + 1
else:
dMax = days[m]
for d in range(dMax, 0, -1):
print('get_news(%d, %d, %d)' % (y, m, d))
news = get_news(y, m, d)
for new in news:
dirname = archivePath + '/' + new['date'][0:4] + '/' + new['date'][5:7] + new['date'][8:10] + '/' + new['date'][11:13] + new['date'][14:16]
if not os.path.exists(dirname):
os.makedirs(dirname)
if new['url'][-5:] == '.html':
filename = dirname + '/' + new['url'].split('/')[-1][:-5] + '.json'
else:
filename = dirname + '/' + new['url'] + '.json'
if not os.path.exists(filename) or True:
data = json.dumps(new, ensure_ascii = False)
f = open(filename, 'w')
f.write(data)
f.close()
filename = filename[:-5] + '.txt'
if not os.path.exists(filename) or True:
data = split_title(new['title'])
data.append(new['description'])
data = '\n'.join(data)
f = open(filename, 'w')
f.write(data)
f.close()
filename = dirname + '/' + new['imageUrl'].split('/')[-1]
if not os.path.exists(filename):
data = ox.cache.read_url(new['imageUrl'])
f = open(filename, 'wb')
f.write(data)
f.close()
strings = new['url'].split('/')
string = strings[3]
if len(strings) == 6:
string += '/' + strings[4]
if string not in count:
count[string] = {'count': 1, 'string': '%s %s http://www.spiegel.de/%s/0,1518,archiv-%d-%03d,00.html' % (new['date'], new['date'], new['section'].lower(), y, int(datetime(y, m, d).strftime('%j')))}
else:
count[string] = {'count': count[string]['count'] + 1, 'string': '%s %s' % (new['date'], count[string]['string'][17:])}
strings = split_title(new['title'])
if strings[0] != new['title1'] or strings[1] != new['title2']:
colon.append('%s %s %s: %s' % (new['date'], new['title'], new['title1'], new['title2']))
for key in sorted(count):
print('%6d %-24s %s' % (count[key]['count'], key, count[key]['string']))
for value in colon:
print(value)
if __name__ == '__main__':
# spiegel = Spiegel(2008, 8)
# print spiegel.getContents()
# news = News(2001, 9, 10)
# output(news.getNews())
'''
x = []
for d in range(10, 30):
print '2/%d' % d
news = getNews(2008, 2, d)
for new in news:
strings = new['url'].split('/')
string = format_section(strings[3])
if len(strings) == 6:
string += '/' + format_subsection(strings[4])
if not string in x:
x.append(string)
print x
'''
# archive_issues()
archive_news()

View file

@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
import re
import socket
from six.moves.urllib.parse import quote, urlencode
from six.moves.urllib.error import URLError
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normalize_newlines
from ox.normalize import normalize_imdbid
import ox
from .torrent import Torrent
cache_timeout = 24*60*60 # cache search only for 24 hours
season_episode = re.compile("S..E..", re.IGNORECASE)
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
headers['Cookie'] = 'language=en_EN'
return cache.read_url(url, data, headers, timeout, unicode=unicode)
def find_movies(query=None, imdb=None, max_results=10):
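# follow up to three result pages and keep only entries from the
# Movies category (201), up to max_results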
if imdb:
query = "tt" + normalize_imdbid(imdb)
results = []
next = ["http://thepiratebay.org/search/%s/0/3/200" % quote(query), ]
page_count = 1
while next and page_count < 4:
page_count += 1
url = next[0]
if not url.startswith('http'):
if not url.startswith('/'):
url = "/" + url
url = "http://thepiratebay.org" + url
data = read_url(url, timeout=cache_timeout, unicode=True)
regexp = '''<tr.*?<td class="vertTh"><a href="/browse/(.*?)".*?<td><a href="(/torrent/.*?)" class="detLink".*?>(.*?)</a>.*?</tr>'''
for row in re.compile(regexp, re.DOTALL).findall(data):
torrentType = row[0]
torrentLink = "http://thepiratebay.org" + row[1]
torrentTitle = decode_html(row[2])
# 201 = Movies , 202 = Movie DVDR, 205 TV Shows
if torrentType in ['201']:
results.append((torrentTitle, torrentLink, ''))
if len(results) >= max_results:
return results
next = re.compile('<a.*?href="(.*?)".*?>.*?next.gif.*?</a>').findall(data)
return results
def get_id(piratebayId):
if piratebayId.startswith('http://torrents.thepiratebay.org/'):
piratebayId = piratebayId.split('org/')[1]
d = find_re(piratebayId, "tor/(\d+)")
if d:
piratebayId = d
d = find_re(piratebayId, "torrent/(\d+)")
if d:
piratebayId = d
return piratebayId
def exists(piratebayId):
piratebayId = get_id(piratebayId)
return ox.net.exists("http://thepiratebay.org/torrent/%s" % piratebayId)
def get_data(piratebayId):
_key_map = {
'spoken language(s)': u'language',
'texted language(s)': u'subtitle language',
'by': u'uploader',
'leechers': 'leecher',
'seeders': 'seeder',
}
piratebayId = get_id(piratebayId)
torrent = dict()
torrent[u'id'] = piratebayId
torrent[u'domain'] = 'thepiratebay.org'
torrent[u'comment_link'] = 'http://thepiratebay.org/torrent/%s' % piratebayId
data = read_url(torrent['comment_link'], unicode=True)
torrent[u'title'] = find_re(data, '<title>(.*?) \(download torrent\) - TPB</title>')
if not torrent[u'title']:
return None
torrent[u'title'] = decode_html(torrent[u'title']).strip()
torrent[u'imdbId'] = find_re(data, 'title/tt(\d{7})')
title = quote(torrent['title'].encode('utf-8'))
torrent[u'torrent_link']="http://torrents.thepiratebay.org/%s/%s.torrent" % (piratebayId, title)
for d in re.compile('dt>(.*?):</dt>.*?<dd.*?>(.*?)</dd>', re.DOTALL).findall(data):
key = d[0].lower().strip()
key = _key_map.get(key, key)
value = decode_html(strip_tags(d[1].strip()))
torrent[key] = value
torrent[u'description'] = find_re(data, '<div class="nfo">(.*?)</div>')
if torrent[u'description']:
torrent['description'] = normalize_newlines(decode_html(strip_tags(torrent['description']))).strip()
t = read_url(torrent[u'torrent_link'])
torrent[u'torrent_info'] = get_torrent_info(t)
return torrent
class Thepiratebay(Torrent):
'''
>>> Thepiratebay('123')
{}
>>> Thepiratebay('3951349')['infohash']
'4e84415d36ed7b54066160c05a0b0f061898d12b'
'''
def __init__(self, piratebayId):
self.data = get_data(piratebayId)
if not self.data:
return
Torrent.__init__(self)
published = self.data['uploaded']
published = published.replace(' GMT', '').split(' +')[0]
self['published'] = datetime.strptime(published, "%Y-%m-%d %H:%M:%S")

View file

@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from ox import int_value
class Torrent(dict):
'''
>>> Torrent()
{'files': 1, 'domain': u'', 'subtitle language': u'', 'seeder': -1, 'description': u'', 'language': u'', 'title': u'', 'imdbId': u'', 'downloaded': -1, 'leecher': -1, 'torrent_link': u'', 'torrent_info': {}, 'published': u'', 'announce': '', 'infohash': '', 'id': u'', 'comment_link': u'', 'size': -1}
'''
_string_keys = ('id', 'title', 'description', 'infohash', 'torrent_link', 'comment_link',
'imdbId', 'announce', 'domain', 'published', 'language', 'subtitle language')
_int_keys = ('size', 'seeder', 'leecher', 'downloaded', 'files')
_dict_keys = ('torrent_info', )
_list_keys = ()
data = {'torrent_info': {}}
def __init__(self):
for key in self._string_keys:
self[key] = self.data.get(key, u'')
for key in self._dict_keys:
self[key] = self.data.get(key, {})
for key in self._list_keys:
self[key] = self.data.get(key, [])
for key in self._int_keys:
value = self.data.get(key, -1)
if not isinstance(value, int):
value = int(int_value(value))
self[key] = value
self['infohash'] = self.data['torrent_info'].get('hash', '')
self['size'] = self.data['torrent_info'].get('size', -1)
self['announce'] = self.data['torrent_info'].get('announce', '')
if 'files' in self.data['torrent_info']:
self['files'] = len(self.data['torrent_info']['files'])
else:
self['files'] = 1

View file

@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from ox import strip_tags, find_re
from ox.cache import read_url
def get_episode_data(url):
'''
parses information on tv.com episode pages
returns dict with title, show, description, score
example:
get_episode_data('http://www.tv.com/lost/do-no-harm/episode/399310/summary.html')
'''
data = read_url(url, unicode=True)
r = {}
r['description'] = strip_tags(find_re(data, 'div id="main-col">.*?<div>(.*?)</div').split('\r')[0])
r['show'] = find_re(data, '<h1>(.*?)</h1>')
r['title'] = find_re(data, '<title>.*?: (.*?) - TV.com </title>')
#episode score
r['episode score'] = find_re(data, '<span class="f-28 f-bold mt-10 mb-10 f-FF9 db lh-18">(.*?)</span>')
match = re.compile('Episode Number: (\d*?) &nbsp;&nbsp; Season Num: (\d*?) &nbsp;&nbsp; First Aired: (.*?) &nbsp').findall(data)
if match:
r['season'] = int(match[0][1])
r['episode'] = int(match[0][0])
#'Wednesday September 29, 2004' -> 2004-09-29
r['air date'] = time.strftime('%Y-%m-%d', time.strptime(match[0][2], '%A %B %d, %Y'))
return r

View file

@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from datetime import datetime
from six.moves.urllib.parse import quote
import lxml.html
import ox
from ox.cache import read_url
def find(query=None, user=None, timeout=60):
if user:
url = 'https://twitter.com/' + quote(user)
else:
url = 'https://twitter.com/search/' + quote(query)
data = ox.cache.read_url(url, timeout=timeout).decode('utf-8')
doc = lxml.html.document_fromstring(data)
tweets = []
for e in doc.xpath("//div[contains(@class, 'original-tweet')]"):
t = lxml.html.tostring(e)
text = e.xpath(".//p[contains(@class, 'js-tweet-text')]")[0]
html = lxml.html.tostring(text, encoding='unicode').strip()
text = ox.decode_html(ox.strip_tags(html)).strip()
user = re.compile('data-name="(.*?)"').findall(t)[0]
user = ox.decode_html(ox.strip_tags(user)).strip()
tweets.append({
'id': re.compile('data-tweet-id="(\d+)"').findall(t)[0],
'user-id': re.compile('data-user-id="(\d+)"').findall(t)[0],
'name': re.compile('data-screen-name="(.*?)"').findall(t)[0],
'time': datetime.fromtimestamp(int(re.compile('data-time="(\d+)"').findall(t)[0])),
'user': user,
'text': text,
'html': html,
})
return tweets

View file

@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
from ox import find_re, strip_tags, decode_html
from ox.cache import read_url
def get_id(url):
return url.replace('http://www.ubu.com/', '').split('.html')[0]
def get_url(id):
return 'http://www.ubu.com/%s.html' % id
def get_data(url):
if not url.startswith('http:'):
url = get_url(url)
data = read_url(url, unicode=True)
m = {
'id': get_id(url),
'url': url,
'type': re.compile('ubu.com/(.*?)/').findall(url)[0]
}
for videourl, title in re.compile('<a href="(http://ubumexico.centro.org.mx/.*?)">(.*?)</a>').findall(data):
if videourl.endswith('.srt'):
m['srt'] = videourl
elif not 'video' in m:
m['video'] = videourl
m['video'] = m['video'].replace('/video/ ', '/video/').replace(' ', '%20')
if m['video'] == 'http://ubumexico.centro.org.mx/video/':
del m['video']
m['title'] = strip_tags(decode_html(title)).strip()
if not 'url' in m:
print(url, 'missing')
if 'title' in m:
m['title'] = re.sub('(.*?) \(\d{4}\)$', '\\1', m['title'])
match = re.compile("flashvars','file=(.*?.flv)'").findall(data)
if match:
m['flv'] = match[0]
m['flv'] = m['flv'].replace('/video/ ', '/video/').replace(' ', '%20')
y = re.compile('\((\d{4})\)').findall(data)
if y:
m['year'] = int(y[0])
d = re.compile('Director: (.+)').findall(data)
if d:
m['director'] = strip_tags(decode_html(d[0])).strip()
a = re.compile('<a href="(.*?)">Back to (.*?)</a>', re.DOTALL).findall(data)
if a:
m['artist'] = strip_tags(decode_html(a[0][1])).strip()
else:
a = re.compile('<a href="(.*?)">(.*?) in UbuWeb Film').findall(data)
if a:
m['artist'] = strip_tags(decode_html(a[0][1])).strip()
else:
a = re.compile('<b>(.*?)\(b\..*?\d{4}\)').findall(data)
if a:
m['artist'] = strip_tags(decode_html(a[0])).strip()
elif m['id'] == 'film/lawder_color':
m['artist'] = 'Standish Lawder'
if 'artist' in m:
m['artist'] = m['artist'].replace('in UbuWeb Film', '')
m['artist'] = m['artist'].replace('on UbuWeb Film', '').strip()
if m['id'] == 'film/coulibeuf':
m['title'] = 'Balkan Baroque'
m['year'] = 1999
return m
def get_films():
ids = get_ids()
films = []
for id in ids:
info = get_data(id)
if info['type'] == 'film' and ('flv' in info or 'video' in info):
films.append(info)
return films
def get_ids():
data = read_url('http://www.ubu.com/film/')
ids = []
author_urls = []
for url, author in re.compile('<a href="(\./.*?)">(.*?)</a>').findall(data):
url = 'http://www.ubu.com/film' + url[1:]
data = read_url(url)
author_urls.append(url)
for u, title in re.compile('<a href="(.*?)">(.*?)</a>').findall(data):
if not u.startswith('http'):
if u == '../../sound/burroughs.html':
u = 'http://www.ubu.com/sound/burroughs.html'
elif u.startswith('../'):
u = 'http://www.ubu.com/' + u[3:]
else:
u = 'http://www.ubu.com/film/' + u
if u not in author_urls and u.endswith('.html'):
ids.append(u)
ids = [get_id(url) for url in list(set(ids))]
return ids

View file

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from six import BytesIO
import xml.etree.ElementTree as ET
from ox.cache import read_url
from ox import find_string, find_re
def get_data(id):
url = 'http://www.vimeo.com/moogaloop/load/clip:%s' %id
xml = read_url(url)
tree = ET.parse(BytesIO(xml))
request_signature = tree.find('request_signature').text
request_signature_expires = tree.find('request_signature_expires').text
data = {}
video_url = "http://www.vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=" % \
(id, request_signature, request_signature_expires)
data['video_sd'] = video_url + 'sd'
data['video_hd'] = video_url + 'hd'
video = tree.find('video')
for key in ('caption', 'width', 'height', 'duration', 'thumbnail'):
data[key] = video.find(key).text
return data

View file

@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
from six import string_types
from six.moves import urllib
from ox.utils import json
from ox.cache import read_url
from ox import find_re
def get_id(url):
return url.split("/")[-1]
def get_url(id=None, imdb=None, allmovie=None):
if imdb:
query = '"%s"'% imdb
result = find(query)
if result:
url = result[0][1]
data = get_movie_data(url)
if 'imdb_id' in data:
return url
return ""
if allmovie:
query = '"amg_id = 1:%s"'% allmovie
result = find(query)
if result:
url = result[0][1]
return url
return ''
return "http://en.wikipedia.org/wiki/%s" % id
def get_movie_id(title, director='', year=''):
query = '"%s" film %s %s' % (title, director, year)
result = find(query, 1)
if result:
return result[0][1]
return ''
def get_wiki_data(wikipedia_url):
url = wikipedia_url.replace('wikipedia.org/wiki/', 'wikipedia.org/w/index.php?title=')
url = "%s&action=raw" % url
data = read_url(url).decode('utf-8')
return data
def get_movie_data(wikipedia_url):
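# parse the raw wikitext of the film infobox into a dict and pull out
# well-known external ids (imdb, allmovie, rotten tomatoes, archive.org, ...)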
if not wikipedia_url.startswith('http'):
wikipedia_url = get_url(wikipedia_url)
data = get_wiki_data(wikipedia_url)
filmbox_data = find_re(data, '''\{\{[Ii]nfobox.[Ff]ilm(.*?)\n\}\}''')
filmbox = {}
_box = filmbox_data.strip().split('|')
for row in _box:
d = row.split('=')
if len(d) == 2:
_key = d[0].strip()
if _key:
key = _key
if key[0] == '|':
key = key[1:]
key = key.strip()
value = d[1].strip()
value = value.replace('<!-- see WP:ALT -->', '')
if '<br>' in value:
value = value.split('<br>')
if value:
if key in filmbox:
if isinstance(value, list) and isinstance(filmbox[key], string_types):
filmbox[key] = [filmbox[key]] + value
else:
filmbox[key] += value
if isinstance(filmbox[key], list):
filmbox[key] = [k for k in filmbox[key] if k]
else:
filmbox[key] = value
if not filmbox_data:
return filmbox
if 'amg_id' in filmbox and not filmbox['amg_id'].isdigit():
del filmbox['amg_id']
if 'Allmovie movie' in data:
filmbox['amg_id'] = find_re(data, 'Allmovie movie\|.*?(\d+)')
elif 'Allmovie title' in data:
filmbox['amg_id'] = find_re(data, 'Allmovie title\|.*?(\d+)')
if 'Official website' in data:
filmbox['website'] = find_re(data, 'Official website\|(.*?)}').strip()
r = re.compile('{{IMDb title\|id=(\d{7})', re.IGNORECASE).findall(data)
if r:
filmbox['imdb_id'] = r[0]
else:
r = re.compile('{{IMDb title\|(\d{7})', re.IGNORECASE).findall(data)
if r:
filmbox['imdb_id'] = r[0]
r = re.compile('{{Internet Archive.*?\|id=(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['archiveorg_id'] = r[0]
r = re.compile('{{mojo title\|(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['mojo_id'] = r[0].replace('id=', '')
r = re.compile('{{rotten-tomatoes\|(.*?)[\|}]', re.IGNORECASE).findall(data)
if r:
filmbox['rottentomatoes_id'] = r[0].replace('id=', '')
if 'google video' in data:
filmbox['google_video_id'] = find_re(data, 'google video\|.*?(\d*?)[\|}]')
if 'DEFAULTSORT' in data:
filmbox['title_sort'] = find_re(data, '''\{\{DEFAULTSORT:(.*?)\}\}''')
return filmbox
def get_image_url(name):
url = 'http://en.wikipedia.org/wiki/Image:' + name.replace(' ', '%20')
data = read_url(url)
url = find_re(data, 'href="(http://upload.wikimedia.org/.*?)"')
if not url:
url = find_re(data, 'href="(//upload.wikimedia.org/.*?)"')
if url:
url = 'http:' + url
return url
def get_poster_url(wikipedia_url):
if not wikipedia_url.startswith('http'): wikipedia_url = get_url(wikipedia_url)
data = get_movie_data(wikipedia_url)
if 'image' in data:
return get_image_url(data['image'])
return ''
def get_movie_poster(wikipedia_url):
# deprecated, use get_poster_url()
return get_poster_url(wikipedia_url)
def get_allmovie_id(wikipedia_url):
data = get_movie_data(wikipedia_url)
return data.get('amg_id', '')
def find(query, max_results=10):
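# full-text search via the MediaWiki API, returning (title, url, '') tuples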
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
data = read_url(url)
if not data:
data = read_url(url, timeout=0)
result = json.loads(data.decode('utf-8'))
results = []
if result and 'query' in result:
for r in result['query']['search']:
title = r['title']
url = "http://en.wikipedia.org/wiki/%s" % title.replace(' ', '_')
results.append((title, url, ''))
return results

View file

@ -0,0 +1,217 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from six.moves.urllib.parse import quote, unquote_plus
from six.moves import urllib
from six.moves import http_cookiejar as cookielib
import re
from xml.dom.minidom import parseString
import json
import feedparser
import ox
from ox.cache import read_url, cache_timeout
def get_id(url):
match = re.compile('v=(.+?)($|&)').findall(url)
if match:
return match[0][0]
def get_url(id):
return 'http://www.youtube.com/watch?v=%s' % id
def video_url(youtubeId, format='mp4', timeout=cache_timeout):
"""
youtubeId - if of video
format - video format, options: webm, 1080p, 720p, mp4, high
"""
fmt = None
if format == '4k':
fmt=38
elif format == '1080p':
fmt=37
elif format == '720p':
fmt=22
elif format == 'mp4':
fmt=18
elif format == 'high':
fmt=35
elif format == 'webm':
streams = videos(youtubeId, 'webm')
return streams[max(streams.keys())]['url']
streams = videos(youtubeId)
if str(fmt) in streams:
return streams[str(fmt)]['url']
def get_video_info(id):
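# read the watch page to extract the 't' token, then query get_video_info
# and unpack the urlencoded response into a plain dict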
eurl = get_url(id)
data = read_url(eurl)
t = re.compile('\W[\'"]?t[\'"]?: ?[\'"](.+?)[\'"]').findall(data)
if t:
t = t[0]
else:
raise IOError
url = "http://www.youtube.com/get_video_info?&video_id=%s&el=$el&ps=default&eurl=%s&hl=en_US&t=%s" % (id, quote(eurl), quote(t))
data = read_url(url)
info = {}
for part in data.split('&'):
key, value = part.split('=')
info[key] = unquote_plus(value).replace('+', ' ')
return info
def find(query, max_results=10, offset=1, orderBy='relevance'):
query = quote(query)
url = "http://gdata.youtube.com/feeds/api/videos?vq=%s&orderby=%s&start-index=%s&max-results=%s" % (query, orderBy, offset, max_results)
data = read_url(url)
fd = feedparser.parse(data)
videos = []
for item in fd.entries:
id = item['id'].split('/')[-1]
title = item['title']
description = item['description']
videos.append((title, id, description))
if len(videos) >= max_results:
return videos
return videos
def info(id, timeout=cache_timeout):
info = {}
if id.startswith('http'):
id = get_id(id)
if not id:
return info
url = "http://gdata.youtube.com/feeds/api/videos/%s?v=2" % id
data = read_url(url, timeout=timeout)
xml = parseString(data)
info['id'] = id
info['url'] = get_url(id)
info['title'] = xml.getElementsByTagName('title')[0].firstChild.data
info['description'] = xml.getElementsByTagName('media:description')[0].firstChild.data
info['date'] = xml.getElementsByTagName('published')[0].firstChild.data.split('T')[0]
info['author'] = "http://www.youtube.com/user/%s"%xml.getElementsByTagName('name')[0].firstChild.data
info['categories'] = []
for cat in xml.getElementsByTagName('media:category'):
info['categories'].append(cat.firstChild.data)
k = xml.getElementsByTagName('media:keywords')[0].firstChild
if k:
info['keywords'] = k.data.split(', ')
data = read_url(info['url'], timeout=timeout)
match = re.compile('<h4>License:</h4>(.*?)</p>', re.DOTALL).findall(data)
if match:
info['license'] = match[0].strip()
info['license'] = re.sub('<.+?>', '', info['license']).strip()
url = "http://www.youtube.com/api/timedtext?hl=en&type=list&tlangs=1&v=%s&asrs=1" % id
data = read_url(url, timeout=timeout)
xml = parseString(data)
languages = [t.getAttribute('lang_code') for t in xml.getElementsByTagName('track')]
if languages:
info['subtitles'] = {}
for language in languages:
url = "http://www.youtube.com/api/timedtext?hl=en&v=%s&type=track&lang=%s&name&kind"%(id, language)
data = read_url(url, timeout=timeout)
xml = parseString(data)
subs = []
for t in xml.getElementsByTagName('text'):
start = float(t.getAttribute('start'))
duration = t.getAttribute('dur')
if not duration:
duration = '2'
end = start + float(duration)
if t.firstChild:
text = t.firstChild.data
subs.append({
'in': start,
'out': end,
'value': ox.decode_html(text),
})
info['subtitles'][language] = subs
return info
def videos(id, format=''):
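# parse url_encoded_fmt_stream_map into a dict of streams keyed by itag,
# optionally restricted to a container format (flv, webm, mp4)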
stream_type = {
'flv': 'video/x-flv',
'webm': 'video/webm',
'mp4': 'video/mp4'
}.get(format)
info = get_video_info(id)
stream_map = info['url_encoded_fmt_stream_map']
streams = {}
for x in stream_map.split(','):
stream = {}
#for s in x.split('\\u0026'):
for s in x.split('&'):
key, value = s.split('=')
value = unquote_plus(value)
stream[key] = value
if 'url' in stream and 'sig' in stream:
stream['url'] = '%s&signature=%s' % (stream['url'], stream['sig'])
if not stream_type or stream['type'].startswith(stream_type):
streams[stream['itag']] = stream
return streams
def playlist(url):
data = read_url(url)
items = []
for i in list(set(re.compile('<a href="(/watch\?v=.*?)" title="(.*?)" ').findall(data))):
items.append({
'title': i[1],
'url': 'http://www.youtube.com' + i[0].split('&amp;')[0]
})
return items
def download_webm(id, filename):
stream_type = 'video/webm'
url = "http://www.youtube.com/watch?v=%s" % id
cj = cookielib.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
opener.addheaders = [
('User-Agent',
'Mozilla/5.0 (X11; Linux i686; rv:2.0) Gecko/20100101 Firefox/4.0'),
('Accept-Language', 'en-us, en;q=0.50')
]
u = opener.open(url)
data = u.read()
u.close()
match = re.compile('"url_encoded_fmt_stream_map": "(.*?)"').findall(data)
streams = {}
for x in match[0].split(','):
stream = {}
for s in x.split('\\u0026'):
key, value = s.split('=')
value = unquote_plus(value)
stream[key] = value
if stream['type'].startswith(stream_type):
streams[stream['itag']] = stream
if streams:
s = max(streams.keys())
url = streams[s]['url']
if 'sig' in streams[s]:
url += '&signature=' + streams[s]['sig']
else:
return None
#download video and save to file.
u = opener.open(url)
f = open(filename, 'wb')
data = True
while data:
data = u.read(4096)
f.write(data)
f.close()
u.close()
return filename
def get_config(id):
if id.startswith('http'):
url = id
else:
url = get_url(id)
data = read_url(url)
match = re.compile('ytplayer.config = (.*?);<').findall(data)
if match:
config = json.loads(match[0])
return config