Compare commits

...

41 commits

Author SHA1 Message Date
j  e00d23e35c  cleanup  2023-11-18 16:40:39 +01:00
j  32224ae005  requests is always required now  2023-10-11 11:29:42 +01:00
j  2f9e27b2f5  allow ): and ]: in title  2023-10-11 11:29:09 +01:00
j  76bb5b53d3  depend on pillow>=10  2023-08-31 19:20:50 +01:00
j  dd130004ee  fix get_width  2023-08-31 18:58:26 +01:00
j  d5635508bb  use draw.textbox instead of draw.textsize  2023-08-25 17:07:08 +02:00
j  31a491570c  the many ways of tagging video rotation  2023-08-25 12:13:01 +02:00
j  4414998759  Image.ANTIALIAS->Image.LANCZOS  2023-08-25 00:18:23 +02:00
j  b4fbb863af  don't import removed file  2023-07-31 13:13:16 +02:00
j  973c7e1ef1  remove MultiPartForm not needed with requests  2023-07-28 20:50:17 +02:00
j  2bac617dc9  update to version 3  2023-07-28 09:15:32 +02:00
j  99e221095b  get rid of u string literal  2023-07-27 18:37:28 +02:00
j  d03a6b120d  fix sanitize_fragment('\ufeff')  2023-07-27 18:37:28 +02:00
j  6b4a307e23  python3 does not need L  2023-07-27 18:37:13 +02:00
j  a0d5c793eb  use CaseInsensitiveDict  2023-07-27 18:37:13 +02:00
j  bf34774533  requests is always required now  2023-07-27 18:07:49 +02:00
j  adad3be419  drop six and python2 support  2023-07-27 13:07:13 +02:00
j  955b4a4e9b  avoid description: None  2023-07-07 15:30:34 +05:30
j  677b61877e  get all connections  2023-07-07 14:50:14 +05:30
j  773d288f55  variable name  2023-07-06 18:48:34 +05:30
j  2d5171bb3f  fix filmingLocations  2023-07-06 18:44:32 +05:30
j  16f1c35875  keywords=>keyword  2023-07-06 18:37:23 +05:30
j  4b531c55aa  use requests for api  2023-07-06 18:35:13 +05:30
j  4feacb4a97  don't pass forwarded for header  2023-07-06 18:34:08 +05:30
j  d630f4b19c  parse keywords  2023-07-06 18:32:45 +05:30
j  e6782b3c17  not all movies have connections  2023-07-06 18:13:26 +05:30
j  baec9c4ea6  180/-180 rotation does not change width/height  2023-07-06 17:07:11 +05:30
j  3debebf923  imdb fixes  2023-03-10 17:39:31 +01:00
j  a3cef06ad7  fix imdb parsing  2023-02-03 18:28:54 +01:00
j  e1657994ca  add type json  2023-02-03 16:28:05 +01:00
j  5919345d3d  fix aspect ratio  2022-10-22 11:50:46 +02:00
j  8e6bea8972  flip display_aspect_ratio if rotated  2022-06-14 22:29:47 +02:00
j  a1a3de685c  more creators  2022-04-18 23:23:08 +01:00
j  d9870232cb  add debug  2022-04-18 23:00:11 +01:00
j  6d968d54cc  fix series creator  2022-04-18 22:59:16 +01:00
j  373ff6ee0f  split real media  2022-01-01 14:31:33 +01:00
j  868a401553  detect add real media files  2021-11-14 13:35:26 +00:00
j  67c6c24131  add m2v  2021-09-22 18:56:25 +02:00
j  ad2ccd4626  parse google infobox  2021-08-29 13:43:33 +02:00
j  2172bcb3fb  fix criterion parser  2021-08-07 11:30:23 +02:00
j  887760acc1  e.read() returns bytes  2021-06-18 12:23:10 +01:00
39 changed files with 585 additions and 883 deletions

View file

@ -5,7 +5,7 @@ try:
from . import __version from . import __version
__version__ = __version.VERSION __version__ = __version.VERSION
except: except:
__version__ = '2.3.x' __version__ = '3.0.x'
from . import cache from . import cache
from . import js from . import js
@ -17,7 +17,6 @@ from . import vtt
from .api import * from .api import *
from .file import * from .file import *
from .form import *
from .format import * from .format import *
from .geo import * from .geo import *
from .html import * from .html import *

View file

@ -4,19 +4,20 @@
from __future__ import print_function from __future__ import print_function
from types import MethodType from types import MethodType
import gzip import gzip
import mimetypes
import os import os
import shutil import shutil
import sys import sys
import time import time
from six.moves import http_cookiejar as cookielib from http import cookiejar as cookielib
from six import BytesIO, PY2 from io import BytesIO
from six.moves import urllib import urllib
from six.moves.urllib.parse import urlparse from urllib.parse import urlparse
import requests
from . import __version__ from . import __version__
from .utils import json from .utils import json
from .form import MultiPartForm
__all__ = ['getAPI', 'API'] __all__ = ['getAPI', 'API']
@ -37,12 +38,13 @@ class API(object):
self._cj = cj self._cj = cj
else: else:
self._cj = cookielib.CookieJar() self._cj = cookielib.CookieJar()
self._opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self._cj),
urllib.request.HTTPHandler(debuglevel=self.debuglevel))
self._opener.addheaders = [
('User-Agent', '%s/%s' % (self.__name__, self.__version__))
]
self._requests_session = requests.Session()
self._requests_session.cookies = self._cj
self._requests_session.headers = {
'User-Agent': '%s/%s' % (self.__name__, self.__version__),
'Accept-Encoding': 'gzip, deflate',
}
self.url = url self.url = url
r = self._request('api', {'docs': True}) r = self._request('api', {'docs': True})
self._properties = r['data']['actions'] self._properties = r['data']['actions']
@ -53,9 +55,6 @@ class API(object):
def _add_method(self, method, name): def _add_method(self, method, name):
if name is None: if name is None:
name = method.func_name name = method.func_name
if PY2:
setattr(self, name, MethodType(method, self, type(self)))
else:
setattr(self, name, MethodType(method, self)) setattr(self, name, MethodType(method, self))
def _add_action(self, action): def _add_action(self, action):
@ -70,37 +69,20 @@ class API(object):
return self._request(action, kw) return self._request(action, kw)
if 'doc' in self._properties[action]: if 'doc' in self._properties[action]:
method.__doc__ = self._properties[action]['doc'] method.__doc__ = self._properties[action]['doc']
if PY2:
method.func_name = str(action)
else:
method.func_name = action method.func_name = action
self._add_method(method, action) self._add_method(method, action)
def _json_request(self, url, form): def _json_request(self, url, data, files=None):
result = {} result = {}
try: try:
body = form.body() request = self._requests_session.post(url, data=data, files=files)
if PY2: result = request.json()
if not isinstance(url, bytes): return result
url = url.encode('utf-8')
request = urllib.request.Request(url)
request.add_data(body)
else:
request = urllib.request.Request(url, data=body, method='POST')
request.add_header('Content-Type', form.get_content_type())
request.add_header('Content-Length', str(len(body)))
request.add_header('Accept-Encoding', 'gzip, deflate')
f = self._opener.open(request)
result = f.read()
if f.headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
result = result.decode('utf-8')
return json.loads(result)
except urllib.error.HTTPError as e: except urllib.error.HTTPError as e:
if self.DEBUG: if self.DEBUG:
import webbrowser import webbrowser
if e.code >= 500: if e.code >= 500:
with open('/tmp/error.html', 'w') as f: with open('/tmp/error.html', 'wb') as f:
f.write(e.read()) f.write(e.read())
webbrowser.open_new_tab('/tmp/error.html') webbrowser.open_new_tab('/tmp/error.html')
@ -125,17 +107,15 @@ class API(object):
raise raise
def _request(self, action, data=None): def _request(self, action, data=None):
form = MultiPartForm() form = {
form.add_field('action', action) 'action': action
}
if data: if data:
form.add_field('data', json.dumps(data)) form['data'] = json.dumps(data)
return self._json_request(self.url, form) return self._json_request(self.url, form)
def get_url(self, url): def get_url(self, url):
request = urllib.request.Request(url, method='GET') return self._requests_session.get(url).content
f = self._opener.open(request)
result = f.read()
return result
def save_url(self, url, filename, overwrite=False): def save_url(self, url, filename, overwrite=False):
chunk_size = 16 * 1024 chunk_size = 16 * 1024
@ -143,21 +123,15 @@ class API(object):
dirname = os.path.dirname(filename) dirname = os.path.dirname(filename)
if dirname and not os.path.exists(dirname): if dirname and not os.path.exists(dirname):
os.makedirs(dirname) os.makedirs(dirname)
request = urllib.request.Request(url, method='GET')
tmpname = filename + '.tmp' tmpname = filename + '.tmp'
with open(tmpname, 'wb') as fd: with open(tmpname, 'wb') as fd:
u = self._opener.open(request) r = self._requests_session.get(url)
for chunk in iter(lambda: u.read(chunk_size), b''): for chunk in iter(lambda: r.read(chunk_size), b''):
fd.write(chunk) fd.write(chunk)
shutil.move(tmpname, filename) shutil.move(tmpname, filename)
def upload_chunks(self, url, filename, data=None, silent=False): def upload_chunks(self, url, filename, data=None, silent=False):
form = MultiPartForm() data = self._json_request(url, data)
if data:
for key in data:
form.add_field(key, data[key])
data = self._json_request(url, form)
def full_url(path): def full_url(path):
if path.startswith('/'): if path.startswith('/'):
@ -178,16 +152,20 @@ class API(object):
resume_offset = 0 resume_offset = 0
chunk = f.read(CHUNK_SIZE) chunk = f.read(CHUNK_SIZE)
fname = os.path.basename(filename) fname = os.path.basename(filename)
mime_type = mimetypes.guess_type(fname)[0] or 'application/octet-stream'
if not isinstance(fname, bytes): if not isinstance(fname, bytes):
fname = fname.encode('utf-8') fname = fname.encode('utf-8')
while chunk: while chunk:
form = MultiPartForm() meta = {
form.add_file('chunk', fname, chunk) 'offset': str(done)
}
if len(chunk) < CHUNK_SIZE or f.tell() == fsize: if len(chunk) < CHUNK_SIZE or f.tell() == fsize:
form.add_field('done', '1') meta['done'] = '1'
form.add_field('offset', str(done)) files = [
('chunk', (fname, chunk, mime_type))
]
try: try:
data = self._json_request(uploadUrl, form) data = self._json_request(uploadUrl, meta, files=files)
except KeyboardInterrupt: except KeyboardInterrupt:
if not slient: if not slient:
print("\ninterrupted by user.") print("\ninterrupted by user.")

View file

@ -10,15 +10,11 @@ import sqlite3
import time import time
import zlib import zlib
from six import BytesIO from io import BytesIO
from six.moves import urllib import urllib
from six import PY2 import requests
try: from requests.structures import CaseInsensitiveDict
import requests
USE_REQUESTS = True
requests_session = requests.Session()
except:
USE_REQUESTS = False
from .utils import json from .utils import json
from .file import makedirs from .file import makedirs
@ -28,6 +24,7 @@ from .net import DEFAULT_HEADERS, detect_encoding
cache_timeout = 30*24*60*60 # default is 30 days cache_timeout = 30*24*60*60 # default is 30 days
requests_session = requests.Session()
COMPRESS_TYPES = ( COMPRESS_TYPES = (
'text/html', 'text/html',
@ -69,7 +66,7 @@ def get_headers(url, data=None, headers=None, timeout=cache_timeout):
if not url_headers: if not url_headers:
url_headers = net.get_headers(url, data, headers) url_headers = net.get_headers(url, data, headers)
store.set(url, data, -1, url_headers) store.set(url, data, -1, url_headers)
return url_headers return CaseInsensitiveDict(url_headers)
def get_json(url, data=None, headers=None, timeout=cache_timeout): def get_json(url, data=None, headers=None, timeout=cache_timeout):
return json.loads(read_url(url, data, headers, timeout).decode('utf-8')) return json.loads(read_url(url, data, headers, timeout).decode('utf-8'))
@ -101,9 +98,11 @@ def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, un
result = store.get(url, data, headers, timeout) result = store.get(url, data, headers, timeout)
url_headers = {} url_headers = {}
if not result: if not result:
if USE_REQUESTS:
if headers is None: if headers is None:
headers = DEFAULT_HEADERS.copy() headers = DEFAULT_HEADERS.copy()
if data:
r = requests_session.post(url, data=data, headers=headers)
else:
r = requests_session.get(url, headers=headers) r = requests_session.get(url, headers=headers)
for key in r.headers: for key in r.headers:
url_headers[key.lower()] = r.headers[key] url_headers[key.lower()] = r.headers[key]
@ -113,20 +112,6 @@ def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, un
store.set(url, post_data=data, data=result, headers=url_headers) store.set(url, post_data=data, data=result, headers=url_headers)
else: else:
raise InvalidResult(result, url_headers) raise InvalidResult(result, url_headers)
else:
try:
url_headers, result = net.read_url(url, data, headers, return_headers=True)
except urllib.error.HTTPError as e:
e.headers['Status'] = "%s" % e.code
for key in e.headers:
url_headers[key.lower()] = e.headers[key]
result = e.read()
if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
if not valid or valid(result, url_headers):
store.set(url, post_data=data, data=result, headers=url_headers)
else:
raise InvalidResult(result, url_headers)
if unicode: if unicode:
ctype = url_headers.get('content-type', '').lower() ctype = url_headers.get('content-type', '').lower()
if 'charset' in ctype: if 'charset' in ctype:
@ -239,8 +224,6 @@ class SQLiteCache(Cache):
elif value == 'data': elif value == 'data':
if row[1] == 1: if row[1] == 1:
r = zlib.decompress(r) r = zlib.decompress(r)
elif PY2:
r = str(r)
break break
c.close() c.close()
@ -279,6 +262,8 @@ class SQLiteCache(Cache):
data = zlib.compress(data) data = zlib.compress(data)
else: else:
compressed = 0 compressed = 0
if isinstance(data, str):
data = data.encode("utf-8")
data = sqlite3.Binary(data) data = sqlite3.Binary(data)
#fixme: this looks wrong #fixme: this looks wrong
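
get_headers() now returns a requests CaseInsensitiveDict, so cached header lookups no longer depend on how the header name was capitalised when it was stored. A small illustration (the header value is made up):

    from requests.structures import CaseInsensitiveDict

    headers = CaseInsensitiveDict({'Content-Type': 'text/html; charset=utf-8'})
    # lookups work regardless of case
    assert headers['content-type'] == headers['Content-Type']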

View file

@ -19,7 +19,8 @@ __all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs', 'iexists']
EXTENSIONS = { EXTENSIONS = {
'audio': [ 'audio': [
'aac', 'aif', 'aiff', 'amr', 'aac', 'aif', 'aiff', 'amr',
'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus' 'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus',
'ra', # Real Audio
], ],
'image': [ 'image': [
'bmp', 'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp' 'bmp', 'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp'
@ -29,11 +30,12 @@ EXTENSIONS = {
], ],
'video': [ 'video': [
'3gp', '3gp',
'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm4v', 'mkv', 'mov', 'mp4', 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm2v', 'm4v', 'mkv', 'mov', 'mp4',
'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf', 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'vob', 'webm', 'wmv', 'asf',
'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD
'mxf', 'ts', 'mxf', 'ts',
'dat', # VOD files 'dat', # VOD files
'rm', 'rmvb', # Real Media
], ],
} }
@ -214,12 +216,16 @@ def ffprobe(filename):
] ]
for s in ffinfo['streams']: for s in ffinfo['streams']:
tags = s.pop('tags', {}) tags = s.pop('tags', {})
side_data_list = s.pop('side_data_list', [])
language = None language = None
for t in tags: for t in tags:
if t == 'language': if t == 'language':
language = tags[t] language = tags[t]
else: else:
info['metadata'][t] = tags[t] info['metadata'][t] = tags[t]
for kv in side_data_list:
for k, v in kv.items():
info['metadata'][k] = v
if s.get('codec_type') in ('audio', 'video'): if s.get('codec_type') in ('audio', 'video'):
stream = {} stream = {}
if language and language != 'und': if language and language != 'und':
@ -273,9 +279,15 @@ def ffprobe(filename):
pass pass
# print s # print s
for v in info['video']: for v in info['video']:
if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-180, -90, 90, 180):
v['width'], v['height'] = v['height'], v['width']
k = 'display_aspect_ratio' k = 'display_aspect_ratio'
if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-90, 90):
v['width'], v['height'] = v['height'], v['width']
if k in v:
v[k] = ':'.join(reversed(v[k].split(':')))
elif 'rotation' in info.get('metadata', {}) and int(info['metadata']['rotation']) in (-90, 90):
v['width'], v['height'] = v['height'], v['width']
if k in v:
v[k] = ':'.join(reversed(v[k].split(':')))
if k not in v and 'width' in v \ if k not in v and 'width' in v \
or (k in v and v[k] == '0:1'): or (k in v and v[k] == '0:1'):
v[k] = '%d:%d' % (v['width'], v['height']) v[k] = '%d:%d' % (v['width'], v['height'])
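
The ffprobe() changes above reflect that rotation can be tagged in more than one place (a 'rotate' stream tag or a 'rotation' entry in side_data_list) and that only ±90 rotations swap width, height and display aspect ratio, while ±180 leaves the geometry alone. A condensed sketch of that rule, using a hypothetical metadata dict:

    def apply_rotation(width, height, dar, metadata):
        # mirror the logic in ffprobe(): only quarter turns swap the geometry
        rotation = metadata.get('rotate', metadata.get('rotation'))
        if rotation is not None and int(rotation) in (-90, 90):
            width, height = height, width
            if dar:
                dar = ':'.join(reversed(dar.split(':')))
        return width, height, dar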

View file

@ -6,7 +6,6 @@ from __future__ import print_function
import unicodedata import unicodedata
from six import unichr, text_type
__all__ = ['fix_bad_unicode'] __all__ = ['fix_bad_unicode']
@ -151,7 +150,7 @@ def text_badness(text):
- Improbable single-byte characters, such as ƒ or ¬ - Improbable single-byte characters, such as ƒ or ¬
- Letters in somewhat rare scripts - Letters in somewhat rare scripts
''' '''
assert isinstance(text, text_type) assert isinstance(text, str)
errors = 0 errors = 0
very_weird_things = 0 very_weird_things = 0
weird_things = 0 weird_things = 0
@ -289,7 +288,7 @@ SINGLE_BYTE_WEIRDNESS = (
# Pre-cache the Unicode data saying which of these first 256 characters are # Pre-cache the Unicode data saying which of these first 256 characters are
# letters. We'll need it often. # letters. We'll need it often.
SINGLE_BYTE_LETTERS = [ SINGLE_BYTE_LETTERS = [
unicodedata.category(unichr(i)).startswith('L') unicodedata.category(chr(i)).startswith('L')
for i in range(256) for i in range(256)
] ]

View file

@ -1,108 +0,0 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2014
from __future__ import print_function
import itertools
import mimetypes
import os
import hashlib
import sys
from six import PY2
__all__ = ['MultiPartForm']
# from /usr/lib/python3.4/email/generator.py
# Helper used by Generator._make_boundary
_width = len(repr(sys.maxsize-1))
_fmt = '%%0%dd' % _width
def _make_boundary():
# Craft a random boundary.
boundary = ('=' * 15) + hashlib.sha1(os.urandom(32)).hexdigest() + '=='
return boundary
class MultiPartForm(object):
"""Accumulate the data to be used when posting a form."""
def __init__(self):
self.form_fields = []
self.files = []
self.boundary = _make_boundary()
return
def get_content_type(self):
return 'multipart/form-data; boundary=%s' % self.boundary
def add_field(self, name, value):
"""Add a simple field to the form data."""
if isinstance(name, bytes):
name = name.decode('utf-8')
if isinstance(value, bytes):
value = value.decode('utf-8')
self.form_fields.append((name, value))
return
def add_file(self, fieldname, filename, fileHandle, mimetype=None):
"""Add a file to be uploaded."""
if isinstance(fieldname, bytes):
fieldname = fieldname.decode('utf-8')
if isinstance(filename, bytes):
filename = filename.decode('utf-8')
if hasattr(fileHandle, 'read'):
body = fileHandle.read()
else:
body = fileHandle
if mimetype is None:
mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
self.files.append((fieldname, filename, mimetype, body))
return
def __str__(self):
body = self.body()
if not PY2:
body = body.decode('utf-8')
return body
def body(self):
"""Return a byte string representing the form data, including attached files."""
# Build a list of lists, each containing "lines" of the
# request. Each part is separated by a boundary string.
# Once the list is built, return a string where each
# line is separated by '\r\n'.
parts = []
part_boundary = '--' + self.boundary
# Add the form fields
parts.extend(
[ part_boundary,
'Content-Disposition: form-data; name="%s"' % name,
'',
value,
]
for name, value in self.form_fields
)
# Add the files to upload
parts.extend(
[ part_boundary,
'Content-Disposition: file; name="%s"; filename="%s"' % \
(field_name, filename),
'Content-Type: %s' % content_type,
'',
body,
]
for field_name, filename, content_type, body in self.files
)
# Flatten the list and add closing boundary marker,
# then return CR+LF separated data
flattened = list(itertools.chain(*parts))
flattened.append('--' + self.boundary + '--')
flattened.append('')
flattened = [part if isinstance(part, bytes) else part.encode('utf-8') for part in flattened]
return b'\r\n'.join(flattened)

View file

@ -4,8 +4,6 @@ import math
import re import re
import string import string
from six import text_type
def toAZ(num): def toAZ(num):
""" """
Converts an integer to bijective base 26 string using A-Z Converts an integer to bijective base 26 string using A-Z
@ -108,7 +106,7 @@ def to32(q):
>>> to32(555306645) >>> to32(555306645)
'GHJKMN' 'GHJKMN'
>>> to32(800197332334559L) >>> to32(800197332334559)
'PQRSTVWXYZ' 'PQRSTVWXYZ'
>>> to32(32) >>> to32(32)
@ -226,36 +224,36 @@ def to36(q):
def from36(q): def from36(q):
return int(q, 36) return int(q, 36)
def int_value(strValue, default=u''): def int_value(strValue, default=''):
""" """
>>> int_value('abc23') >>> int_value('abc23')
u'23' '23'
>>> int_value(' abc23') >>> int_value(' abc23')
u'23' '23'
>>> int_value('ab') >>> int_value('ab')
u'' ''
""" """
try: try:
val = re.compile('(\d+)').findall(text_type(strValue).strip())[0] val = re.compile('(\d+)').findall(str(strValue).strip())[0]
except: except:
val = default val = default
return val return val
def float_value(strValue, default=u''): def float_value(strValue, default=''):
""" """
>>> float_value('abc23.4') >>> float_value('abc23.4')
u'23.4' '23.4'
>>> float_value(' abc23.4') >>> float_value(' abc23.4')
u'23.4' '23.4'
>>> float_value('ab') >>> float_value('ab')
u'' ''
""" """
try: try:
val = re.compile('([\d.]+)').findall(text_type(strValue).strip())[0] val = re.compile('([\d.]+)').findall(str(strValue).strip())[0]
except: except:
val = default val = default
return val return val

View file

@ -3,8 +3,7 @@
# GPL 2008 # GPL 2008
import re import re
import string import string
from six.moves.html_entities import name2codepoint from html.entities import name2codepoint
from six import unichr, PY2, string_types
letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
@ -26,8 +25,7 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z') trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
if PY2:
del x # Temporary variable
def escape(html): def escape(html):
''' '''
@ -36,7 +34,7 @@ def escape(html):
>>> escape('html "test" & <brothers>') >>> escape('html "test" & <brothers>')
'html &quot;test&quot; &amp; &lt;brothers&gt;' 'html &quot;test&quot; &amp; &lt;brothers&gt;'
''' '''
if not isinstance(html, string_types): if not isinstance(html, str):
html = str(html) html = str(html)
return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;') return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;')
@ -147,20 +145,20 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')
def decode_html(html): def decode_html(html):
""" """
>>> decode_html('me &amp; you and &#36;&#38;%') >>> decode_html('me &amp; you and &#36;&#38;%')
u'me & you and $&%' 'me & you and $&%'
>>> decode_html('&#x80;') >>> decode_html('&#x80;')
u'\u20ac' '\u20ac'
>>> decode_html('Anniversary of Daoud&apos;s Republic') >>> decode_html('Anniversary of Daoud&apos;s Republic')
u"Anniversary of Daoud's Republic" "Anniversary of Daoud's Republic"
""" """
if isinstance(html, bytes): if isinstance(html, bytes):
html = html.decode('utf-8') html = html.decode('utf-8')
uchr = unichr uchr = chr
def entitydecode(match, uchr=uchr): def entitydecode(match, uchr=uchr):
entity = match.group(1) entity = match.group(1)
if entity == '#x80': if entity == '#x80':
return u'' return ''
elif entity.startswith('#x'): elif entity.startswith('#x'):
return uchr(int(entity[2:], 16)) return uchr(int(entity[2:], 16))
elif entity.startswith('#'): elif entity.startswith('#'):
@ -171,7 +169,7 @@ def decode_html(html):
return "'" return "'"
else: else:
return match.group(0) return match.group(0)
return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ') return charrefpat.sub(entitydecode, html).replace('\xa0', ' ')
def highlight(text, query, hlClass="hl"): def highlight(text, query, hlClass="hl"):
""" """
@ -189,51 +187,51 @@ def highlight(text, query, hlClass="hl"):
def escape_html(value): def escape_html(value):
''' '''
>>> escape_html(u'<script> foo') >>> escape_html('<script> foo')
u'&lt;script&gt; foo' '&lt;script&gt; foo'
>>> escape_html(u'&lt;script&gt; foo') >>> escape_html('&lt;script&gt; foo')
u'&lt;script&gt; foo' '&lt;script&gt; foo'
''' '''
return escape(decode_html(value)) return escape(decode_html(value))
def sanitize_html(html, tags=None, global_attributes=[]): def sanitize_html(html, tags=None, global_attributes=[]):
''' '''
>>> sanitize_html('http://foo.com, bar') >>> sanitize_html('http://foo.com, bar')
u'<a href="http://foo.com">http://foo.com</a>, bar' '<a href="http://foo.com">http://foo.com</a>, bar'
>>> sanitize_html('http://foo.com/foobar?foo, bar') >>> sanitize_html('http://foo.com/foobar?foo, bar')
u'<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar' '<a href="http://foo.com/foobar?foo">http://foo.com/foobar?foo</a>, bar'
>>> sanitize_html('(see: www.foo.com)') >>> sanitize_html('(see: www.foo.com)')
u'(see: <a href="http://www.foo.com">www.foo.com</a>)' '(see: <a href="http://www.foo.com">www.foo.com</a>)'
>>> sanitize_html('foo@bar.com') >>> sanitize_html('foo@bar.com')
u'<a href="mailto:foo@bar.com">foo@bar.com</a>' '<a href="mailto:foo@bar.com">foo@bar.com</a>'
>>> sanitize_html(sanitize_html('foo@bar.com')) >>> sanitize_html(sanitize_html('foo@bar.com'))
u'<a href="mailto:foo@bar.com">foo@bar.com</a>' '<a href="mailto:foo@bar.com">foo@bar.com</a>'
>>> sanitize_html('<a href="http://foo.com" onmouseover="alert()">foo</a>') >>> sanitize_html('<a href="http://foo.com" onmouseover="alert()">foo</a>')
u'<a href="http://foo.com">foo</a>' '<a href="http://foo.com">foo</a>'
>>> sanitize_html('<a href="javascript:alert()">foo</a>') >>> sanitize_html('<a href="javascript:alert()">foo</a>')
u'&lt;a href="javascript:alert()"&gt;foo&lt;/a&gt;' '&lt;a href="javascript:alert()"&gt;foo&lt;/a&gt;'
>>> sanitize_html('[http://foo.com foo]') >>> sanitize_html('[http://foo.com foo]')
u'<a href="http://foo.com">foo</a>' '<a href="http://foo.com">foo</a>'
>>> sanitize_html('<div style="direction: rtl">foo</div>') >>> sanitize_html('<div style="direction: rtl">foo</div>')
u'<div style="direction: rtl">foo</div>' '<div style="direction: rtl">foo</div>'
>>> sanitize_html('<script>alert()</script>') >>> sanitize_html('<script>alert()</script>')
u'&lt;script&gt;alert()&lt;/script&gt;' '&lt;script&gt;alert()&lt;/script&gt;'
>>> sanitize_html("'foo' < 'bar' && \\"foo\\" > \\"bar\\"") >>> sanitize_html("'foo' < 'bar' && \\"foo\\" > \\"bar\\"")
u'\\'foo\\' &lt; \\'bar\\' &amp;&amp; "foo" &gt; "bar"' '\\'foo\\' &lt; \\'bar\\' &amp;&amp; "foo" &gt; "bar"'
>>> sanitize_html('<b>foo') >>> sanitize_html('<b>foo')
u'<b>foo</b>' '<b>foo</b>'
>>> sanitize_html('<b>foo</b></b>') >>> sanitize_html('<b>foo</b></b>')
u'<b>foo</b>' '<b>foo</b>'
>>> sanitize_html('Anniversary of Daoud&apos;s Republic') >>> sanitize_html('Anniversary of Daoud&apos;s Republic')
u"Anniversary of Daoud's Republic" "Anniversary of Daoud's Republic"
>>> sanitize_html('') >>> sanitize_html('')
u'' ''
>>> sanitize_html(' ') >>> sanitize_html(' ')
u' ' ' '
>>> sanitize_html(u'&nbsp;') # canonicalised to a space: okay, I suppose >>> sanitize_html('&nbsp;') # canonicalised to a space: okay, I suppose
u' ' ' '
>>> sanitize_html(u'\u00a0') # also nbsp >>> sanitize_html('\u00a0') # also nbsp
u' ' ' '
''' '''
if not tags: if not tags:
valid_url = '^((https?:\/\/|\/|mailto:).*?)' valid_url = '^((https?:\/\/|\/|mailto:).*?)'
@ -414,24 +412,24 @@ def sanitize_fragment(html):
are quoted, etc. Does not strip potentially-malicious HTML: use are quoted, etc. Does not strip potentially-malicious HTML: use
sanitize_html() for that. sanitize_html() for that.
>>> sanitize_fragment(u'<span lang="en">') >>> sanitize_fragment('<span lang="en">')
u'<span lang="en"></span>' '<span lang="en"></span>'
>>> sanitize_fragment(u'<span lang=en></span>') >>> sanitize_fragment('<span lang=en></span>')
u'<span lang="en"></span>' '<span lang="en"></span>'
>>> sanitize_fragment(u'<br><br/></br>') >>> sanitize_fragment('<br><br/></br>')
u'<br><br>' '<br><br>'
>>> sanitize_fragment(u'<a href="javascript:alert()">foo</a>') >>> sanitize_fragment('<a href="javascript:alert()">foo</a>')
u'<a href="javascript:alert()">foo</a>' '<a href="javascript:alert()">foo</a>'
>>> sanitize_fragment(u'') >>> sanitize_fragment('')
u'' ''
>>> sanitize_fragment(u' ') >>> sanitize_fragment(' ')
u' ' ' '
>>> sanitize_fragment(u'&nbsp;') >>> sanitize_fragment('&nbsp;')
u'\\xa0' '\\xa0'
>>> sanitize_fragment(u'\\u00a0') # nbsp >>> sanitize_fragment('\\u00a0') # nbsp
u'\\xa0' '\\xa0'
>>> sanitize_fragment(u'\\ufeff') # zero-width no-break space >>> sanitize_fragment('\\ufeff') # zero-width no-break space
u'\\ufeff' '\\ufeff'
''' '''
''' '''
@ -442,7 +440,12 @@ def sanitize_fragment(html):
if not html.strip(): if not html.strip():
return html return html
import lxml.html import lxml.html
try:
body = lxml.html.document_fromstring(html).find('body') body = lxml.html.document_fromstring(html).find('body')
except lxml.etree.ParserError as e:
if e.args and e.args[0] == 'Document is empty':
return html
raise e
html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8') html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8')
if html.startswith('<p>') and html.endswith('</p>'): if html.startswith('<p>') and html.endswith('</p>'):
html = html[3:-4] html = html[3:-4]
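
The new try/except around document_fromstring() is what makes sanitize_fragment('\ufeff') work: lxml raises ParserError('Document is empty') for input with no parseable content, and in that case the fragment is returned unchanged. A minimal reproduction of the failure the guard catches, assuming lxml is installed:

    import lxml.etree
    import lxml.html

    try:
        lxml.html.document_fromstring('\ufeff')
    except lxml.etree.ParserError as e:
        print(e)  # Document is empty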

View file

@ -22,11 +22,17 @@ ZONE_INDEX = [
] ]
] ]
def textsize(draw, text, font):
left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
return (right, bottom)
def drawText(image, position, text, font_file, font_size, color): def drawText(image, position, text, font_file, font_size, color):
draw = ImageDraw.Draw(image) draw = ImageDraw.Draw(image)
font = ImageFont.truetype(font_file, font_size, encoding='unic') font = ImageFont.truetype(font_file, font_size, encoding='unic')
draw.text(position, text, fill=color, font=font) draw.text(position, text, fill=color, font=font)
size = draw.textsize(text, font=font) size = textsize(draw, text, font)
version = getattr(Image, 'PILLOW_VERSION', None) version = getattr(Image, 'PILLOW_VERSION', None)
if version and version > '2.1.0' and version < '2.6.1': if version and version > '2.1.0' and version < '2.6.1':
offset = font.getoffset(text) offset = font.getoffset(text)
@ -57,7 +63,7 @@ def getHSL(rgb):
return tuple(hsl) return tuple(hsl)
def getImageHash(image_file, mode): def getImageHash(image_file, mode):
image = Image.open(image_file).convert('RGB').resize((8, 8), Image.ANTIALIAS) image = Image.open(image_file).convert('RGB').resize((8, 8), Image.LANCZOS)
image_hash = 0 image_hash = 0
if mode == 'color': if mode == 'color':
# divide the image into 8 zones: # divide the image into 8 zones:
@ -99,7 +105,7 @@ def getImageHash(image_file, mode):
return image_hash return image_hash
def getImageHeat(image_file): def getImageHeat(image_file):
image = Image.open(image_file).convert('RGB').resize((16, 16), Image.ANTIALIAS) image = Image.open(image_file).convert('RGB').resize((16, 16), Image.LANCZOS)
pixel = image.load() pixel = image.load()
image_heat = 0 image_heat = 0
for y in range(image.size[1]): for y in range(image.size[1]):
@ -114,7 +120,7 @@ def getImageHeat(image_file):
return image_heat / 256 return image_heat / 256
def getImageHSL(image_file): def getImageHSL(image_file):
image = Image.open(image_file).convert('RGB').resize((1, 1), Image.ANTIALIAS) image = Image.open(image_file).convert('RGB').resize((1, 1), Image.LANCZOS)
return getHSL(image.getpixel((0, 0))) return getHSL(image.getpixel((0, 0)))
def getRGB(hsl): def getRGB(hsl):
@ -148,7 +154,7 @@ def getRGB(hsl):
def getTextSize(image, text, font_file, font_size): def getTextSize(image, text, font_file, font_size):
draw = ImageDraw.Draw(image) draw = ImageDraw.Draw(image)
font = ImageFont.truetype(font_file, font_size, encoding='unic') font = ImageFont.truetype(font_file, font_size, encoding='unic')
size = draw.textsize(text, font=font) size = textsize(draw, text, font)
version = getattr(Image, 'PILLOW_VERSION', None) version = getattr(Image, 'PILLOW_VERSION', None)
if version and version > '2.1.0' and version < '2.6.1': if version and version > '2.1.0' and version < '2.6.1':
offset = font.getoffset(text) offset = font.getoffset(text)
@ -168,7 +174,7 @@ def wrapText(text, max_width, max_lines, font_file, font_size):
return min_width return min_width
def get_width(string): def get_width(string):
return draw.textsize(string, font=font)[0] return textsize(draw, string, font)[0]
image = Image.new('RGB', (1, 1)) image = Image.new('RGB', (1, 1))
draw = ImageDraw.Draw(image) draw = ImageDraw.Draw(image)
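
Pillow 10 removed ImageDraw.textsize(), which is why the textsize() helper above derives a (width, height) pair from textbbox() and the module now depends on pillow>=10. A standalone sketch of the replacement call, using Pillow's built-in default font so no font file is needed:

    from PIL import Image, ImageDraw, ImageFont

    image = Image.new('RGB', (1, 1))
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    # textbbox() returns (left, top, right, bottom); anchored at (0, 0),
    # right/bottom stand in for the old textsize() width/height
    left, top, right, bottom = draw.textbbox((0, 0), 'Hello', font=font)
    width, height = right, bottom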

View file

@ -2,19 +2,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from six import PY2
from .utils import json from .utils import json
def minify(source, comment=''): def minify(source, comment=''):
# see https://github.com/douglascrockford/JSMin/blob/master/README # see https://github.com/douglascrockford/JSMin/blob/master/README
def get_next_non_whitespace_token(): def get_next_non_whitespace_token():
pass pass
# python2 performance with unicode string is terrible
if PY2:
if isinstance(source, unicode): # pylint: disable=undefined-variable
source = source.encode('utf-8')
if isinstance(comment, unicode): # pylint: disable=undefined-variable
comment = comment.encode('utf-8')
tokens = tokenize(source) tokens = tokenize(source)
length = len(tokens) length = len(tokens)
minified = '/*' + comment + '*/' if comment else '' minified = '/*' + comment + '*/' if comment else ''

View file

@ -206,7 +206,7 @@ def parse_path(path, directory_key='director'):
string = re.sub(re.compile('(?<=\w)_(?=\w)', re.U), '/', string) string = re.sub(re.compile('(?<=\w)_(?=\w)', re.U), '/', string)
string = re.sub(' _ ', ' / ', string) string = re.sub(' _ ', ' / ', string)
# 'foo_ ' is ':' # 'foo_ ' is ':'
string = re.sub(re.compile('(?<=\w)_ ', re.U), ': ', string) string = re.sub(re.compile('(?<=[\w\)\]])_ ', re.U), ': ', string)
string = unicodedata.normalize('NFD', string) string = unicodedata.normalize('NFD', string)
return string return string
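
The widened lookbehind lets the "underscore before a space means colon" rule in parse_path() fire after a closing parenthesis or bracket as well as after a word character, which is what the "allow ): and ]: in title" commit adds. An illustrative before/after with a made-up title:

    import re

    old = re.compile(r'(?<=\w)_ ', re.U)
    new = re.compile(r'(?<=[\w\)\]])_ ', re.U)

    title = 'Film (Director Cut)_ Restored'
    print(old.sub(': ', title))  # unchanged, ')' is not a word character
    print(new.sub(': ', title))  # Film (Director Cut): Restored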

View file

@ -8,13 +8,10 @@ import os
import re import re
import struct import struct
try: import requests
import requests
USE_REQUESTS = True from io import BytesIO
except: import urllib
USE_REQUESTS = False
from six import BytesIO, PY2
from six.moves import urllib
from chardet.universaldetector import UniversalDetector from chardet.universaldetector import UniversalDetector
@ -59,14 +56,10 @@ def get_json(url, data=None, headers=None):
def open_url(url, data=None, headers=None): def open_url(url, data=None, headers=None):
if headers is None: if headers is None:
headers = DEFAULT_HEADERS.copy() headers = DEFAULT_HEADERS.copy()
if PY2:
if not isinstance(url, bytes):
url = url.encode('utf-8')
else:
if isinstance(url, bytes): if isinstance(url, bytes):
url = url.decode('utf-8') url = url.decode('utf-8')
url = url.replace(' ', '%20') url = url.replace(' ', '%20')
if data and not PY2 and not isinstance(data, bytes): if data and not isinstance(data, bytes):
data = data.encode('utf-8') data = data.encode('utf-8')
req = urllib.request.Request(url, data, headers) req = urllib.request.Request(url, data, headers)
return urllib.request.urlopen(req) return urllib.request.urlopen(req)
@ -123,16 +116,11 @@ def save_url(url, filename, overwrite=False):
if dirname and not os.path.exists(dirname): if dirname and not os.path.exists(dirname):
os.makedirs(dirname) os.makedirs(dirname)
headers = DEFAULT_HEADERS.copy() headers = DEFAULT_HEADERS.copy()
if USE_REQUESTS:
r = requests.get(url, headers=headers, stream=True) r = requests.get(url, headers=headers, stream=True)
with open(filename, 'wb') as f: with open(filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024): for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks if chunk: # filter out keep-alive new chunks
f.write(chunk) f.write(chunk)
else:
data = read_url(url)
with open(filename, 'wb') as f:
f.write(data)
def _get_size(url): def _get_size(url):
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy()) req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())

View file

@ -4,8 +4,6 @@
import re import re
import unicodedata import unicodedata
from six import string_types
_articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el', _articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el',
"l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de', "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de',
@ -103,7 +101,7 @@ def normalize_imdbid(imdbId):
>>> normalize_imdbid('tt0159206') >>> normalize_imdbid('tt0159206')
'0159206' '0159206'
""" """
if isinstance(imdbId, string_types): if isinstance(imdbId, str):
imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId) imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId)
elif isinstance(imdbId, int): elif isinstance(imdbId, int):
imdbId = "%07d" % imdbId imdbId = "%07d" % imdbId

View file

@ -5,7 +5,6 @@ import codecs
import re import re
import chardet import chardet
from six import PY2
import ox import ox
@ -24,9 +23,6 @@ def _detect_encoding(fp):
# go to beginning of file and get the first 4 bytes # go to beginning of file and get the first 4 bytes
oldFP = fp.tell() oldFP = fp.tell()
fp.seek(0) fp.seek(0)
if PY2:
(byte1, byte2, byte3, byte4) = [ord(b) for b in fp.read(4)]
else:
(byte1, byte2, byte3, byte4) = fp.read(4) (byte1, byte2, byte3, byte4) = fp.read(4)
# try bom detection using 4 bytes, 3 bytes, or 2 bytes # try bom detection using 4 bytes, 3 bytes, or 2 bytes

View file

@ -1,11 +1,13 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
# GPL 2008 # GPL 2008
import gzip
import math import math
import re import re
import unicodedata import unicodedata
from io import BytesIO
from six.moves import reduce from functools import reduce
ARTICLES = list(set([ ARTICLES = list(set([
# def sg, def pl, indef sg, indef pl (each m/f/n) # def sg, def pl, indef sg, indef pl (each m/f/n)
@ -473,10 +475,10 @@ def wrap(text, width):
def wrap_string(string, length=80, separator='\n', balance=False): def wrap_string(string, length=80, separator='\n', balance=False):
''' '''
>>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16) >>> wrap_string("Anticonstitutionellement, Paris s'eveille", 16)
u"Anticonstitution\\nellement, Paris \\ns'eveille" "Anticonstitution\\nellement, Paris \\ns'eveille"
>>> wrap_string(u'All you can eat', 12, '\\n', True) >>> wrap_string(u'All you can eat', 12, '\\n', True)
u'All you \\ncan eat' 'All you \\ncan eat'
''' '''
words = string.split(' ') words = string.split(' ')
if balance: if balance:
@ -491,20 +493,20 @@ def wrap_string(string, length=80, separator='\n', balance=False):
break break
lines = [''] lines = ['']
for word in words: for word in words:
if len(lines[len(lines) - 1] + word + u' ') <= length + 1: if len(lines[len(lines) - 1] + word + ' ') <= length + 1:
# word fits in current line # word fits in current line
lines[len(lines) - 1] += word + u' ' lines[len(lines) - 1] += word + ' '
else: else:
if len(word) <= length: if len(word) <= length:
# word fits in next line # word fits in next line
lines.append(word + u' ') lines.append(word + ' ')
else: else:
# word is longer than line # word is longer than line
position = length - len(lines[len(lines) - 1]) position = length - len(lines[len(lines) - 1])
lines[len(lines) - 1] += word[0:position] lines[len(lines) - 1] += word[0:position]
for i in range(position, len(word), length): for i in range(position, len(word), length):
lines.append(word[i:i+length]) lines.append(word[i:i+length])
lines[len(lines) - 1] += u' ' lines[len(lines) - 1] += ' '
return separator.join(lines).strip() return separator.join(lines).strip()
def truncate_string(string, length, padding='...', position='right'): def truncate_string(string, length, padding='...', position='right'):
@ -576,14 +578,14 @@ def get_valid_filename(s):
def get_text_list(list_, last_word='or'): def get_text_list(list_, last_word='or'):
""" """
>>> get_text_list([u'a', u'b', u'c', u'd']) >>> get_text_list(['a', 'b', 'c', 'd'])
u'a, b, c or d' 'a, b, c or d'
>>> get_text_list([u'a', u'b', u'c'], 'and') >>> get_text_list(['a', 'b', 'c'], 'and')
u'a, b and c' 'a, b and c'
>>> get_text_list([u'a', u'b'], 'and') >>> get_text_list(['a', 'b'], 'and')
u'a and b' 'a and b'
>>> get_text_list([u'a']) >>> get_text_list(['a'])
u'a' 'a'
>>> get_text_list([]) >>> get_text_list([])
'' ''
""" """
@ -591,24 +593,24 @@ def get_text_list(list_, last_word='or'):
return '' return ''
if len(list_) == 1: if len(list_) == 1:
return list_[0] return list_[0]
return u'%s %s %s' % (u', '.join([i for i in list_][:-1]), last_word, list_[-1]) return '%s %s %s' % (', '.join([i for i in list_][:-1]), last_word, list_[-1])
def get_list_text(text, last_word='or'): def get_list_text(text, last_word='or'):
""" """
>>> get_list_text(u'a, b, c or d') >>> get_list_text('a, b, c or d')
[u'a', u'b', u'c', u'd'] ['a', 'b', 'c', 'd']
>>> get_list_text(u'a, b and c', u'and') >>> get_list_text('a, b and c', 'and')
[u'a', u'b', u'c'] ['a', 'b', 'c']
>>> get_list_text(u'a and b', u'and') >>> get_list_text('a and b', 'and')
[u'a', u'b'] ['a', 'b']
>>> get_list_text(u'a') >>> get_list_text('a')
[u'a'] ['a']
>>> get_list_text(u'') >>> get_list_text('')
[] []
""" """
list_ = [] list_ = []
if text: if text:
list_ = text.split(u', ') list_ = text.split(', ')
if list_: if list_:
i = len(list_)-1 i = len(list_)-1
last = list_[i].split(last_word) last = list_[i].split(last_word)
@ -646,8 +648,6 @@ def phone2numeric(phone):
return letters.sub(char2number, phone) return letters.sub(char2number, phone)
def compress_string(s): def compress_string(s):
import gzip
from six import BytesIO
zbuf = BytesIO() zbuf = BytesIO()
zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
zfile.write(s) zfile.write(s)
@ -682,7 +682,7 @@ def words(text):
return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text] return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text]
def sort_string(string): def sort_string(string):
string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th') string = string.replace('Æ', 'AE').replace('Ø', 'O').replace('Þ', 'Th')
# pad numbered titles # pad numbered titles
string = re.sub('(\d),(\d{3})', '\\1\\2', string) string = re.sub('(\d),(\d{3})', '\\1\\2', string)

View file

@ -5,12 +5,8 @@
from threading import Event from threading import Event
from hashlib import sha1 from hashlib import sha1
import os import os
from six import PY2
if PY2: from .bencode3 import bencode, bdecode
from .bencode import bencode, bdecode
else:
from .bencode3 import bencode, bdecode
__all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size'] __all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size']

View file

@ -1,321 +0,0 @@
# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman
# see LICENSE.txt for license information
from __future__ import print_function
from types import IntType, LongType, StringType, ListType, TupleType, DictType
try:
from types import BooleanType
except ImportError:
BooleanType = None
try:
from types import UnicodeType
except ImportError:
UnicodeType = None
from cStringIO import StringIO
def decode_int(x, f):
f += 1
newf = x.index('e', f)
try:
n = int(x[f:newf])
except:
n = long(x[f:newf])
if x[f] == '-':
if x[f + 1] == '0':
raise ValueError
elif x[f] == '0' and newf != f+1:
raise ValueError
return (n, newf+1)
def decode_string(x, f):
colon = x.index(':', f)
try:
n = int(x[f:colon])
except (OverflowError, ValueError):
n = long(x[f:colon])
if x[f] == '0' and colon != f+1:
raise ValueError
colon += 1
return (x[colon:colon+n], colon+n)
def decode_unicode(x, f):
s, f = decode_string(x, f+1)
return (s.decode('UTF-8'),f)
def decode_list(x, f):
r, f = [], f+1
while x[f] != 'e':
v, f = decode_func[x[f]](x, f)
r.append(v)
return (r, f + 1)
def decode_dict(x, f):
r, f = {}, f+1
lastkey = None
while x[f] != 'e':
k, f = decode_string(x, f)
# why is this needed
# if lastkey >= k:
# raise ValueError
lastkey = k
r[k], f = decode_func[x[f]](x, f)
return (r, f + 1)
decode_func = {}
decode_func['l'] = decode_list
decode_func['d'] = decode_dict
decode_func['i'] = decode_int
decode_func['0'] = decode_string
decode_func['1'] = decode_string
decode_func['2'] = decode_string
decode_func['3'] = decode_string
decode_func['4'] = decode_string
decode_func['5'] = decode_string
decode_func['6'] = decode_string
decode_func['7'] = decode_string
decode_func['8'] = decode_string
decode_func['9'] = decode_string
#decode_func['u'] = decode_unicode
def bdecode(x, sloppy = 1):
try:
r, l = decode_func[x[0]](x, 0)
# except (IndexError, KeyError):
except (IndexError, KeyError, ValueError):
raise ValueError("bad bencoded data")
if not sloppy and l != len(x):
raise ValueError("bad bencoded data")
return r
def test_bdecode():
try:
bdecode('0:0:')
assert 0
except ValueError:
pass
try:
bdecode('ie')
assert 0
except ValueError:
pass
try:
bdecode('i341foo382e')
assert 0
except ValueError:
pass
assert bdecode('i4e') == 4
assert bdecode('i0e') == 0
assert bdecode('i123456789e') == 123456789
assert bdecode('i-10e') == -10
try:
bdecode('i-0e')
assert 0
except ValueError:
pass
try:
bdecode('i123')
assert 0
except ValueError:
pass
try:
bdecode('')
assert 0
except ValueError:
pass
try:
bdecode('i6easd')
assert 0
except ValueError:
pass
try:
bdecode('35208734823ljdahflajhdf')
assert 0
except ValueError:
pass
try:
bdecode('2:abfdjslhfld')
assert 0
except ValueError:
pass
assert bdecode('0:') == ''
assert bdecode('3:abc') == 'abc'
assert bdecode('10:1234567890') == '1234567890'
try:
bdecode('02:xy')
assert 0
except ValueError:
pass
try:
bdecode('l')
assert 0
except ValueError:
pass
assert bdecode('le') == []
try:
bdecode('leanfdldjfh')
assert 0
except ValueError:
pass
assert bdecode('l0:0:0:e') == ['', '', '']
try:
bdecode('relwjhrlewjh')
assert 0
except ValueError:
pass
assert bdecode('li1ei2ei3ee') == [1, 2, 3]
assert bdecode('l3:asd2:xye') == ['asd', 'xy']
assert bdecode('ll5:Alice3:Bobeli2ei3eee') == [['Alice', 'Bob'], [2, 3]]
try:
bdecode('d')
assert 0
except ValueError:
pass
try:
bdecode('defoobar')
assert 0
except ValueError:
pass
assert bdecode('de') == {}
assert bdecode('d3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': 'blue'}
assert bdecode('d8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': 'Alice', 'length': 100000}}
try:
bdecode('d3:fooe')
assert 0
except ValueError:
pass
try:
bdecode('di1e0:e')
assert 0
except ValueError:
pass
try:
bdecode('d1:b0:1:a0:e')
assert 0
except ValueError:
pass
try:
bdecode('d1:a0:1:a0:e')
assert 0
except ValueError:
pass
try:
bdecode('i03e')
assert 0
except ValueError:
pass
try:
bdecode('l01:ae')
assert 0
except ValueError:
pass
try:
bdecode('9999:x')
assert 0
except ValueError:
pass
try:
bdecode('l0:')
assert 0
except ValueError:
pass
try:
bdecode('d0:0:')
assert 0
except ValueError:
pass
try:
bdecode('d0:')
assert 0
except ValueError:
pass
bencached_marker = []
class Bencached:
def __init__(self, s):
self.marker = bencached_marker
self.bencoded = s
BencachedType = type(Bencached('')) # insufficient, but good as a filter
def encode_bencached(x,r):
assert x.marker == bencached_marker
r.append(x.bencoded)
def encode_int(x,r):
r.extend(('i',str(x),'e'))
def encode_bool(x,r):
encode_int(int(x),r)
def encode_string(x,r):
r.extend((str(len(x)),':',x))
def encode_unicode(x,r):
#r.append('u')
encode_string(x.encode('UTF-8'),r)
def encode_list(x,r):
r.append('l')
for e in x:
encode_func[type(e)](e, r)
r.append('e')
def encode_dict(x,r):
r.append('d')
ilist = x.items()
ilist.sort()
for k,v in ilist:
r.extend((str(len(k)),':',k))
encode_func[type(v)](v, r)
r.append('e')
encode_func = {}
encode_func[BencachedType] = encode_bencached
encode_func[IntType] = encode_int
encode_func[LongType] = encode_int
encode_func[StringType] = encode_string
encode_func[ListType] = encode_list
encode_func[TupleType] = encode_list
encode_func[DictType] = encode_dict
if BooleanType:
encode_func[BooleanType] = encode_bool
if UnicodeType:
encode_func[UnicodeType] = encode_unicode
def bencode(x):
r = []
try:
encode_func[type(x)](x, r)
except:
print("*** error *** could not encode type %s (value: %s)" % (type(x), x))
assert 0
return ''.join(r)
def test_bencode():
assert bencode(4) == 'i4e'
assert bencode(0) == 'i0e'
assert bencode(-10) == 'i-10e'
assert bencode(12345678901234567890) == 'i12345678901234567890e'
assert bencode('') == '0:'
assert bencode('abc') == '3:abc'
assert bencode('1234567890') == '10:1234567890'
assert bencode([]) == 'le'
assert bencode([1, 2, 3]) == 'li1ei2ei3ee'
assert bencode([['Alice', 'Bob'], [2, 3]]) == 'll5:Alice3:Bobeli2ei3eee'
assert bencode({}) == 'de'
assert bencode({'age': 25, 'eyes': 'blue'}) == 'd3:agei25e4:eyes4:bluee'
assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == 'd8:spam.mp3d6:author5:Alice6:lengthi100000eee'
try:
bencode({1: 'foo'})
assert 0
except AssertionError:
pass
try:
import psyco
psyco.bind(bdecode)
psyco.bind(bencode)
except ImportError:
pass

View file

@ -8,11 +8,7 @@ from hashlib import sha1 as sha
from copy import copy from copy import copy
import re import re
from six import PY2 from .bencode3 import bencode
if PY2:
from .bencode import bencode
else:
from .bencode3 import bencode
from threading import Event from threading import Event
from time import time from time import time
from traceback import print_exc from traceback import print_exc

View file

@ -13,13 +13,13 @@ def get_id(url):
def get_data(id): def get_data(id):
''' '''
>>> get_data('129689')['cast'][1][1] >>> get_data('129689')['cast'][1][1]
u'Marianne' 'Marianne'
>>> get_data('129689')['credits'][0][0] >>> get_data('129689')['credits'][0][0]
u'Jean-Luc Godard' 'Jean-Luc Godard'
>>> get_data('129689')['posters'][0] >>> get_data('129689')['posters'][0]
u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg' 'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg'
>>> get_data('129689')['rating'] >>> get_data('129689')['rating']
u'4.5' '4.5'
''' '''
if id.startswith('http'): if id.startswith('http'):
id = get_id(id) id = get_id(id)

View file

@ -2,7 +2,7 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function from __future__ import print_function
import re import re
from six.moves.urllib.parse import quote from urllib.parse import quote
from ox import find_re, strip_tags, decode_html from ox import find_re, strip_tags, decode_html
from ox.cache import read_url from ox.cache import read_url

View file

@ -2,7 +2,6 @@ from __future__ import print_function
import json import json
import re import re
from six import text_type
from ox.cache import read_url from ox.cache import read_url
HEADERS = { HEADERS = {
@ -17,9 +16,9 @@ USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) '
USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3' USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3'
def get_movie_data(title, director): def get_movie_data(title, director):
if isinstance(title, text_type): if isinstance(title, str):
title = title.encode('utf-8') title = title.encode('utf-8')
if isinstance(director, text_type): if isinstance(director, str):
director = director.encode('utf-8') director = director.encode('utf-8')
data = {} data = {}
# itunes section (preferred source for link) # itunes section (preferred source for link)

View file

@ -3,8 +3,6 @@
from .. import cache from .. import cache
from ..utils import json from ..utils import json
from six import string_types
def get_id(url): def get_id(url):
return url.split("/")[-1] return url.split("/")[-1]
@ -21,7 +19,7 @@ def get_data(id):
data[key] = details['metadata'][key] data[key] = details['metadata'][key]
if isinstance(data[key], list): if isinstance(data[key], list):
data[key] = data[key][0] data[key] = data[key][0]
if isinstance(data[key], string_types): if isinstance(data[key], str):
data[key] = data[key].strip() data[key] = data[key].strip()
if data[key][0] == '[' and data[key][-1] == ']': if data[key][0] == '[' and data[key][-1] == ']':
data[key] = data[key][1:-1] data[key] = data[key][1:-1]

View file

@ -19,18 +19,18 @@ def get_data(id, language='en'):
if 'Willkommen in der Datenbank des Arsenal' in html: if 'Willkommen in der Datenbank des Arsenal' in html:
return None return None
data = {} data = {}
data[u'id'] = id data['id'] = id
data[u'url'] = url data['url'] = url
m = re.compile('<h1>(.*?)</h1>').findall(html) m = re.compile('<h1>(.*?)</h1>').findall(html)
if m: if m:
data[u'title'] = m[0] data['title'] = m[0]
m = re.compile("<b>Director: </b><a href='.*?'>(.*?)</a>").findall(html) m = re.compile("<b>Director: </b><a href='.*?'>(.*?)</a>").findall(html)
if m: if m:
data[u'director'] = m[0] data['director'] = m[0]
m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html) m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html)
if m: if m:
data[u'image'] = m[0] data['image'] = m[0]
units = re.compile("<div class='unit'>(.*?)</div>", re.DOTALL).findall(html) units = re.compile("<div class='unit'>(.*?)</div>", re.DOTALL).findall(html)
for x in map(re.compile('<b>(.*?)</b>: (.*)', re.DOTALL).findall, units): for x in map(re.compile('<b>(.*?)</b>: (.*)', re.DOTALL).findall, units):
@ -43,7 +43,7 @@ def get_data(id, language='en'):
else: else:
data[key] = strip_tags(data[key]) data[key] = strip_tags(data[key])
if "running time (minutes)" in data: if "running time (minutes)" in data:
data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60 data['runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60
for key in ('year', 'length in metres', 'forum participation year', 'number of reels'): for key in ('year', 'length in metres', 'forum participation year', 'number of reels'):
if key in data and data[key].isdigit(): if key in data and data[key].isdigit():
data[key] = int(data[key]) data[key] = int(data[key])

View file

@ -19,13 +19,13 @@ def get_url(id):
def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False): def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
''' '''
>>> get_data('1333').get('imdbId') >>> get_data('1333').get('imdbId')
u'0060304' '0060304'
>>> get_data('236')['posters'][0] >>> get_data('236')['posters'][0]
u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg' 'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg'
>>> get_data('786')['posters'][0] >>> get_data('786')['posters'][0]
u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg' 'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg'
''' '''
data = { data = {
"id": id, "id": id,
@ -39,12 +39,16 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
data["number"] = find_re(html, "<b>Spine #(\d+)") data["number"] = find_re(html, "<b>Spine #(\d+)")
data["title"] = decode_html(find_re(html, "<h1 class=\"header__primarytitle\".*?>(.*?)</h1>")) data["title"] = decode_html(find_re(html, "<h1 class=\"header__primarytitle\".*?>(.*?)</h1>"))
data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip() data["title"] = data["title"].split(' \u2014 The Television Version')[0].strip()
results = find_re(html, '<ul class="film-meta-list">(.*?)</ul>') results = find_re(html, '<ul class="film-meta-list">(.*?)</ul>')
info = re.compile('<li itemprop="(.*?)".*?>(.*?)</li>', re.DOTALL).findall(results) info = re.compile('<li itemprop="(.*?)".*?>(.*?)</li>', re.DOTALL).findall(results)
info = {k: strip_tags(v).strip() for k, v in info} info = {k: strip_tags(v).strip() for k, v in info}
meta = re.compile('<meta.*? name="(.*?)".*? content="(.*?)"', re.DOTALL).findall(html)
meta = {k: v.strip() for k, v in meta}
if 'director' in info: if 'director' in info:
data['director'] = info['director'] data['director'] = info['director']
elif 'director' in meta:
data['director'] = meta['director']
if 'countryOfOrigin' in info: if 'countryOfOrigin' in info:
data['country'] = [c.strip() for c in decode_html(info['countryOfOrigin']).split(', ')] data['country'] = [c.strip() for c in decode_html(info['countryOfOrigin']).split(', ')]
if 'inLanguage' in info: if 'inLanguage' in info:
@@ -80,7 +84,8 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False):
for poster in re.compile('<img src="(.*?)"').findall(posters): for poster in re.compile('<img src="(.*?)"').findall(posters):
data['posters'].append(poster) data['posters'].append(poster)
result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"") result = re.compile('<div class="gallery-item ">.*?src="(.*?)"', re.DOTALL).findall(html)
#result = find_re(html, "<img alt=\"Film Still\" height=\"252\" src=\"(.*?)\"")
if result: if result:
data["stills"] = [result] data["stills"] = [result]
data["trailers"] = [] data["trailers"] = []

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from six.moves.urllib.parse import unquote from urllib.parse import unquote
from ox.cache import read_url from ox.cache import read_url

@@ -2,7 +2,7 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from six.moves import urllib import urllib
import ox import ox
from ox import strip_tags, decode_html from ox import strip_tags, decode_html
from ox.cache import read_url from ox.cache import read_url

@@ -58,10 +58,10 @@ def get_data(id, timeout=-1):
def get_id(url=None, imdb=None): def get_id(url=None, imdb=None):
''' '''
>>> get_id(imdb='0133093') >>> get_id(imdb='0133093')
u'the-matrix' 'the-matrix'
#>>> get_id(imdb='0060304') #>>> get_id(imdb='0060304')
#u'2-or-3-things-i-know-about-her' #'2-or-3-things-i-know-about-her'
''' '''
if imdb: if imdb:
i = ImdbCombined(imdb) i = ImdbCombined(imdb)

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from six.moves import urllib import urllib
import ox import ox
from ox import strip_tags, decode_html from ox import strip_tags, decode_html
@@ -17,6 +17,31 @@ def quote_plus(s):
s = s.encode('utf-8') s = s.encode('utf-8')
return urllib.parse.quote_plus(s) return urllib.parse.quote_plus(s)
def infobox(query, timeout=DEFAULT_TIMEOUT):
import lxml.html
url = 'http://www.google.com/search?q=%s' % quote_plus(query)  # assumed: url is otherwise undefined in this hunk
data = read_url(url, timeout=timeout)
doc = lxml.html.document_fromstring(data)
k = 'kp-wholepage'
wholepage = doc.cssselect('.' + k)
infobox = {}
if wholepage:
page = wholepage[0]
for a in page.cssselect('a'):
if a.attrib.get('href', '').startswith('http'):
domain = '.'.join(a.attrib['href'].split('/')[2].split('.')[-2:])
infobox[domain] = a.attrib['href']
for e in page.cssselect('*[data-attrid]'):
key = e.attrib['data-attrid']
value = e.text_content()
if value and key not in (
'kc:/film/film:media_actions_wholepage',
'action:watch_film'
):
infobox[key] = value
return infobox
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT): def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
""" """
Return max_results tuples with title, url, description Return max_results tuples with title, url, description
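A minimal usage sketch for the new infobox() helper, assuming it lives in ox.web.google alongside find() and that Google returns a knowledge panel for the query; the keys and output shown are illustrative only.

    from ox.web import google

    # external links come back keyed by domain, other entries by Google's data-attrid
    info = google.infobox('The Matrix 1999 film')
    print(info.get('imdb.com'))
    for key, value in sorted(info.items()):
        print(key, value)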

@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function from collections import defaultdict
import json
import re import re
import time import time
import unicodedata import unicodedata
from six.moves.urllib.parse import urlencode from urllib.parse import urlencode
from six import text_type, string_types
from .. import find_re, strip_tags, decode_html from .. import find_re, strip_tags, decode_html
from .. import cache from .. import cache
@@ -16,13 +16,13 @@ from .. import cache
from . siteparser import SiteParser from . siteparser import SiteParser
from . import duckduckgo from . import duckduckgo
from ..utils import datetime from ..utils import datetime
from ..geo import normalize_country_name from ..geo import normalize_country_name, get_country_name
def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False): def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy() headers = headers.copy()
# https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau
headers['X-Forwarded-For'] = '72.21.206.80' #headers['X-Forwarded-For'] = '72.21.206.80'
headers['Accept-Language'] = 'en' headers['Accept-Language'] = 'en'
return url, data, headers, timeout, unicode return url, data, headers, timeout, unicode
@@ -106,6 +106,131 @@ def technical(label):
} }
def tech_spec(metadata):
tech = {}
for row in metadata['props']['pageProps']['contentData']['section']['items']:
title = {
'aspect ratio': 'aspectratio',
'sound mix': 'sound',
}.get(row['rowTitle'].lower(), row['rowTitle'].lower())
tech[title] = []
for content in row['listContent']:
value = content['text']
tech[title].append(value)
return tech
def movie_connections(metadata):
connections = {}
if 'props' not in metadata:
return connections
for row in metadata['props']['pageProps']['contentData']['categories']:
title = {
}.get(row['name'], row['name'])
if title not in connections:
connections[title] = []
for item in row['section']['items']:
item_ = {
'id': item['id'][2:],
}
item_['title'] = re.compile('<a.*?>(.*?)</a>').findall(item['listContent'][0]['html'])[0]
if len(item['listContent']) >=2:
item_['description'] = strip_tags(item['listContent'][1]['html'])
connections[title].append(item_)
return connections
def get_category_by_id(metadata, id):
for category in metadata['props']['pageProps']['contentData']['categories']:
if category['id'] == id:
return category
def get_release_date(metadata):
releases = get_category_by_id(metadata, 'releases')
def parse_date(d):
parsed = None
for fmt in (
'%B %d, %Y',
'%d %B %Y',
'%B %Y',
):
try:
parsed = datetime.strptime(d, fmt)
break
except:
pass
if not parsed:
return None
return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day)
dates = []
for item in releases['section']['items']:
content = item['listContent'][0]
date = parse_date(content['text'])
if date:
dates.append(date)
if dates:
return min(dates)
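parse_date() above normalizes the mixed date formats found on the releases page; worked examples (dates invented):

    # 'March 31, 1999' -> '1999-03-31'   (matches '%B %d, %Y')
    # '31 March 1999'  -> '1999-03-31'   (matches '%d %B %Y')
    # 'March 1999'     -> '1999-03-01'   (matches '%B %Y', day defaults to 1)

get_release_date() then returns the earliest of the normalized dates.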
def get_locations(metadata):
try:
locations = [
row['cardText']
for row in metadata['props']['pageProps']['contentData']['categories'][0]['section']['items']
]
except:
locations = []
return locations
def get_keywords(metadata):
try:
keywords = [
row['rowTitle']
for row in metadata['props']['pageProps']['contentData']['section']['items']
]
except:
keywords = []
return keywords
def get_entity_metadata(metadata):
data = {}
entity = metadata['props']['pageProps']['contentData']['entityMetadata']
data['title'] = entity['titleText']['text']
data['originalTitle'] = entity['originalTitleText']['text']
data['year'] = entity['releaseYear']['year']
data['plot'] = entity['plot']['plotText']['plainText']
data['country'] = [get_country_name(c['id']) for c in entity['countriesOfOrigin']['countries']]
data['poster'] = metadata['props']['pageProps']['contentData']['posterData']['image']['url']
return data
def alternative_titles(metadata):
titles = defaultdict(list)
akas = get_category_by_id(metadata, 'akas')
skip = [
metadata['props']['pageProps']['contentData']['entityMetadata']['titleText']['text'],
metadata['props']['pageProps']['contentData']['entityMetadata']['originalTitleText']['text']
]
for row in akas['section']['items']:
content = row['listContent'][0]
title = content['text']
country = row['rowTitle']
if title in skip:
continue
titles[title].append(country)
#if content.get('subText'):
# titles[-1]['subText'] = content['subText']
return [kv for kv in titles.items()]
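The helpers above (tech_spec, movie_connections, get_release_date, get_keywords, get_entity_metadata, alternative_titles) all walk the JSON payload IMDb embeds in its pages; a rough, heavily abbreviated sketch of the assumed shape, with illustrative values:

    metadata = {
        'props': {'pageProps': {'contentData': {
            'entityMetadata': {
                'titleText': {'text': 'The Matrix'},
                'originalTitleText': {'text': 'The Matrix'},
                'releaseYear': {'year': 1999},
                'plot': {'plotText': {'plainText': '...'}},
            },
            'categories': [
                {'id': 'releases', 'name': 'Releases',
                 'section': {'items': [
                     {'rowTitle': 'United States',
                      'listContent': [{'text': 'March 31, 1999'}]},
                 ]}},
            ],
            'section': {'items': [
                {'rowTitle': 'Aspect ratio',
                 'listContent': [{'text': '2.39 : 1'}]},
            ]},
        }}}
    }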
''' '''
'posterIds': { 'posterIds': {
'page': 'posters', 'page': 'posters',
@@ -116,18 +241,17 @@ def technical(label):
class Imdb(SiteParser): class Imdb(SiteParser):
''' '''
>>> Imdb('0068646')['title'] == text_type(u'The Godfather') >>> Imdb('0068646')['title'] == 'The Godfather'
True True
>>> Imdb('0133093')['title'] == text_type(u'The Matrix') >>> Imdb('0133093')['title'] == 'The Matrix'
True True
''' '''
regex = { regex = {
'alternativeTitles': { 'alternativeTitles': {
'page': 'releaseinfo', 'page': 'releaseinfo',
're': [ 're': [
'<h4[^>]*?id="akas"[^>]*?>(.*?)</table>', '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">([^>]+)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
"td[^>]*?>(.*?)</td>.*?<td[^>]*?>(.*?)</td>"
], ],
'type': 'list' 'type': 'list'
}, },
@@ -152,21 +276,7 @@ class Imdb(SiteParser):
'type': 'list' 'type': 'list'
}, },
'cinematographer': reference_section('cinematographers'), 'cinematographer': reference_section('cinematographers'),
'connections': {
'page': 'movieconnections',
're': '<h4 class="li_group">(.*?)</h4>(.*?)(<\/div>\n <a|<script)',
'type': 'list'
},
'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']), 'country': zebra_list('Country', more=['<a.*?>(.*?)</a>']),
'creator': {
'page': '',
're': [
'<div class="credit_summary_item">.*?<h4.*?>Creator.?:</h4>(.*?)</div>',
'<a href="/name/.*?>(.*?)</a>',
lambda ll: strip_tags(ll)
],
'type': 'list'
},
'director': reference_section('directors'), 'director': reference_section('directors'),
'editor': reference_section('editors'), 'editor': reference_section('editors'),
'composer': reference_section('composers'), 'composer': reference_section('composers'),
@@ -175,27 +285,14 @@ class Imdb(SiteParser):
're': '<h3 itemprop="name">(.*?)<', 're': '<h3 itemprop="name">(.*?)<',
'type': 'string' 'type': 'string'
}, },
'filmingLocations': {
'page': 'locations',
're': [
'<a href="/search/title\?locations=.*?".*?>(.*?)</a>',
lambda data: data.strip(),
],
'type': 'list'
},
'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']), 'genre': zebra_list('Genres', more=['<a.*?>(.*?)</a>']),
'gross': zebra_table('Cumulative Worldwide Gross', more=[ 'gross': zebra_table('Cumulative Worldwide Gross', more=[
lambda data: find_re(decode_html(data).replace(',', ''), '\d+') lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
], type='int'), ], type='int'),
'keyword': {
'page': 'keywords',
're': 'data-item-keyword="(.*?)"',
'type': 'list'
},
'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']), 'language': zebra_list('Language', more=['<a.*?>(.*?)</a>']),
'originalTitle': { 'originalTitle': {
'page': 'releaseinfo', 'page': 'releaseinfo',
're': '<td.*?>\s*?\(original title\)\s*?</td>\s*<td.*?>(.*?)</td>', 're': '<li role="presentation" class="ipc-metadata-list__item" data-testid="list-item"><button class="ipc-metadata-list-item__label" role="button" tabindex="0" aria-disabled="false">\(original title\)</button.*?<li role="presentation" class="ipc-inline-list__item"><label class="ipc-metadata-list-item__list-content-item"[^>]*?>([^<]+)</label>',
'type': 'string' 'type': 'string'
}, },
'summary': zebra_table('Plot Summary', more=[ 'summary': zebra_table('Plot Summary', more=[
@@ -228,14 +325,6 @@ class Imdb(SiteParser):
], ],
'type': 'float' 'type': 'float'
}, },
'releasedate': {
'page': 'releaseinfo',
're': [
'<td class="release-date-item__date".*?>(.*?)</td>',
strip_tags,
],
'type': 'list'
},
#FIXME using some /offsite/ redirect now #FIXME using some /offsite/ redirect now
#'reviews': { #'reviews': {
# 'page': 'externalreviews', # 'page': 'externalreviews',
@@ -251,11 +340,6 @@ class Imdb(SiteParser):
lambda r: r[0] if isinstance(r, list) else r, lambda r: r[0] if isinstance(r, list) else r,
strip_tags strip_tags
]), ]),
'sound': zebra_list('Sound Mix', more=[
'<a.*?>([^(<]+)',
lambda r: r[0] if isinstance(r, list) else r,
strip_tags
]),
'season': { 'season': {
'page': 'reference', 'page': 'reference',
're': [ 're': [
@@ -284,7 +368,7 @@ class Imdb(SiteParser):
}, },
'title': { 'title': {
'page': 'releaseinfo', 'page': 'releaseinfo',
're': 'h3 itemprop="name">.*?>(.*?)</a>', 're': '<h2.*?>(.*?)</h2>',
'type': 'string' 'type': 'string'
}, },
'trivia': { 'trivia': {
@@ -323,19 +407,34 @@ class Imdb(SiteParser):
}, },
'laboratory': technical('Laboratory'), 'laboratory': technical('Laboratory'),
'camera': technical('Camera'), 'camera': technical('Camera'),
'negative format': technical('Negative Format'),
'cinematographic process': technical('Cinematographic Process'),
'printed film format': technical('Printed Film Format'),
} }
def read_url(self, url, timeout): def read_url(self, url, timeout):
if self.debug:
print(url)
if url not in self._cache: if url not in self._cache:
self._cache[url] = read_url(url, timeout=timeout, unicode=True) self._cache[url] = read_url(url, timeout=timeout, unicode=True)
return self._cache[url] return self._cache[url]
def get_page_data(self, page, timeout=-1):
url = self.get_url(page)
data = self.read_url(url, timeout)
pdata = re.compile('<script id="__NEXT_DATA__" type="application/json">(.*?)</script>', re.DOTALL).findall(data)
if pdata:
pdata = pdata[0]
return json.loads(pdata)
return {}
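A hedged usage sketch tying get_page_data() to the module-level helpers above, assuming network access and that IMDb still ships the __NEXT_DATA__ blob on these subpages:

    parser = Imdb('0133093')
    tech = tech_spec(parser.get_page_data('technical'))               # e.g. {'aspectratio': [...], 'sound': [...]}
    released = get_release_date(parser.get_page_data('releaseinfo'))  # 'YYYY-MM-DD' or None
    keywords = get_keywords(parser.get_page_data('keywords'))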
def __init__(self, id, timeout=-1): def __init__(self, id, timeout=-1):
# http://www.imdb.com/help/show_leaf?titlelanguagedisplay # http://www.imdb.com/help/show_leaf?titlelanguagedisplay
self.baseUrl = "http://www.imdb.com/title/tt%s/" % id self.baseUrl = "http://www.imdb.com/title/tt%s/" % id
self._id = id
if timeout != 0:
self._cache = {}
url = self.baseUrl + 'releaseinfo'
page = self.read_url(url, timeout=-1)
if '<h2>See also</h2>' in page:
timeout = 0
super(Imdb, self).__init__(timeout) super(Imdb, self).__init__(timeout)
url = self.baseUrl + 'reference' url = self.baseUrl + 'reference'
@@ -349,7 +448,7 @@ class Imdb(SiteParser):
if 'alternativeTitles' in self: if 'alternativeTitles' in self:
if len(self['alternativeTitles']) == 2 and \ if len(self['alternativeTitles']) == 2 and \
isinstance(self['alternativeTitles'][0], string_types): isinstance(self['alternativeTitles'][0], str):
self['alternativeTitles'] = [self['alternativeTitles']] self['alternativeTitles'] = [self['alternativeTitles']]
for key in ('country', 'genre', 'language', 'sound', 'color'): for key in ('country', 'genre', 'language', 'sound', 'color'):
@@ -414,7 +513,7 @@ class Imdb(SiteParser):
self['sound'] = list(sorted(set(self['sound']))) self['sound'] = list(sorted(set(self['sound'])))
if 'cast' in self: if 'cast' in self:
if isinstance(self['cast'][0], string_types): if isinstance(self['cast'][0], str):
self['cast'] = [self['cast']] self['cast'] = [self['cast']]
self['actor'] = [c[0] for c in self['cast']] self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c): def cleanup_character(c):
@@ -424,26 +523,6 @@ class Imdb(SiteParser):
self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])} self['cast'] = [{'actor': x[0], 'character': cleanup_character(x[1])}
for x in self['cast']] for x in self['cast']]
if 'connections' in self:
cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']]
for rel, data, _ in self['connections']:
if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('<a href="/title/tt(\d+)/">(.*?)</a>').findall(data)
def get_conn(c):
r = {
'id': c[0],
'title': cleanup_title(c[1]),
}
description = c[2].split('<br />')
if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip()
return r
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d+)/?">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
if 'isSeries' in self: if 'isSeries' in self:
del self['isSeries'] del self['isSeries']
@@ -451,49 +530,6 @@ class Imdb(SiteParser):
if 'episodeTitle' in self: if 'episodeTitle' in self:
self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle']) self['episodeTitle'] = re.sub('Episode \#\d+\.\d+', '', self['episodeTitle'])
if 'series' in self:
series = Imdb(self['series'], timeout=timeout)
self['seriesTitle'] = series['title']
if 'episodeTitle' in self:
self['seriesTitle'] = series['title']
if 'season' in self and 'episode' in self:
self['title'] = "%s (S%02dE%02d) %s" % (
self['seriesTitle'], self['season'], self['episode'], self['episodeTitle'])
else:
self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle'])
self['season'] = 1
self['title'] = self['title'].strip()
if 'director' in self:
self['episodeDirector'] = self['director']
if 'creator' not in series and 'director' in series:
series['creator'] = series['director']
if len(series['creator']) > 10:
series['creator'] = series['director'][:1]
for key in ['creator', 'country']:
if key in series:
self[key] = series[key]
if 'year' in series:
self['seriesYear'] = series['year']
if 'year' not in self:
self['year'] = series['year']
if 'year' in self:
self['episodeYear'] = self['year']
if 'creator' in self:
self['seriesDirector'] = self['creator']
if 'originalTitle' in self:
del self['originalTitle']
else:
for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'):
if key in self:
del self[key]
if 'creator' in self:
if 'director' in self:
self['episodeDirector'] = self['director']
self['director'] = self['creator']
#make lists unique but keep order #make lists unique but keep order
for key in ('director', 'language'): for key in ('director', 'language'):
@@ -511,21 +547,20 @@ class Imdb(SiteParser):
if 'budget' in self and 'gross' in self: if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget'] self['profit'] = self['gross'] - self['budget']
if 'releasedate' in self: metadata = self.get_page_data('releaseinfo')
def parse_date(d): releasedate = get_release_date(metadata)
try: if releasedate:
d = datetime.strptime(d, '%d %B %Y') self['releasedate'] = releasedate
except:
try: metadata = self.get_page_data('keywords')
d = datetime.strptime(d, '%B %Y') keywords = get_keywords(metadata)
except: if keywords:
return 'x' self['keyword'] = keywords
return '%d-%02d-%02d' % (d.year, d.month, d.day)
self['releasedate'] = min([ metadata = self.get_page_data('locations')
parse_date(d) for d in self['releasedate'] locations = get_locations(metadata)
]) if locations:
if self['releasedate'] == 'x': self['filmingLocations'] = locations
del self['releasedate']
if 'summary' not in self and 'storyline' in self: if 'summary' not in self and 'storyline' in self:
self['summary'] = self.pop('storyline') self['summary'] = self.pop('storyline')
@@ -533,6 +568,22 @@ class Imdb(SiteParser):
if isinstance(self['summary'], list): if isinstance(self['summary'], list):
self['summary'] = self['summary'][0] self['summary'] = self['summary'][0]
self['summary'] = strip_tags(self['summary'].split('</p')[0]).split(' Written by\n')[0].strip() self['summary'] = strip_tags(self['summary'].split('</p')[0]).split(' Written by\n')[0].strip()
else:
try:
summary = metadata['props']['pageProps']['contentData']['entityMetadata']['plot']['plotText']['plainText']
self['summary'] = summary
except:
pass
#self['connections'] = movie_connections(self.get_page_data('movieconnections'))
self['connections'] = self._get_connections()
spec = tech_spec(self.get_page_data('technical'))
for key in spec:
if not self.get(key):
self[key] = spec[key]
if 'credits' in self: if 'credits' in self:
credits = [ credits = [
@@ -581,6 +632,115 @@ class Imdb(SiteParser):
series_credit = [c for c in self['credits'] if c.get('deparment') == deparment] series_credit = [c for c in self['credits'] if c.get('deparment') == deparment]
if series_credit: if series_credit:
self[key] = [c['name'] for c in series_credit] self[key] = [c['name'] for c in series_credit]
creator = []
for c in self.get('credits', []):
if '(created by)' in c['roles'] and c['name'] not in creator:
creator.append(c['name'])
if '(creator)' in c['roles'] and c['name'] not in creator:
creator.append(c['name'])
if creator:
self['creator'] = creator
if 'series' in self:
series = Imdb(self['series'], timeout=timeout)
self['seriesTitle'] = series['title']
if 'episodeTitle' in self:
self['seriesTitle'] = series['title']
if 'season' in self and 'episode' in self:
self['title'] = "%s (S%02dE%02d) %s" % (
self['seriesTitle'], self['season'], self['episode'], self['episodeTitle'])
else:
self['title'] = "%s (S01) %s" % (self['seriesTitle'], self['episodeTitle'])
self['season'] = 1
self['title'] = self['title'].strip()
if 'director' in self:
self['episodeDirector'] = self['director']
if 'creator' not in series and 'director' in series:
series['creator'] = series['director']
if len(series['creator']) > 10:
series['creator'] = series['director'][:1]
for key in ['creator', 'country']:
if key in series:
self[key] = series[key]
if 'year' in series:
self['seriesYear'] = series['year']
if 'year' not in self:
self['year'] = series['year']
if 'year' in self:
self['episodeYear'] = self['year']
if 'creator' in self:
self['seriesDirector'] = self['creator']
if 'originalTitle' in self:
del self['originalTitle']
else:
for key in ('seriesTitle', 'episodeTitle', 'season', 'episode'):
if key in self:
del self[key]
if 'creator' in self:
if 'director' in self:
self['episodeDirector'] = self['director']
self['director'] = self['creator']
def _get_connections(self):
query = '''query {
title(id: "tt%s") {
id
titleText {
text
}
connections(first: 5000) {
edges {
node {
associatedTitle {
id
titleText {
text
}
}
category {
text
}
text
}
}
}
}
}
''' % self._id
url = 'https://caching.graphql.imdb.com/'
headers = cache.DEFAULT_HEADERS.copy()
headers.update({
'Accept': 'application/graphql+json, application/json',
'Origin': 'https://www.imdb.com',
'Referer': 'https://www.imdb.com',
'x-imdb-user-country': 'US',
'x-imdb-user-language': 'en-US',
'content-type': 'application/json',
'Accept-Language': 'en,en-US;q=0.5'
})
#response = requests.post(url, json=
response = json.loads(read_url(url, data=json.dumps({
"query": query
}), headers=headers))
connections = {}
for c in response['data']['title']['connections']['edges']:
cat = c['node']['category']['text']
if cat not in connections:
connections[cat] = []
connection = {
'id': c['node']['associatedTitle']['id'][2:],
'title': c['node']['associatedTitle']['titleText']['text'],
}
description = c['node'].get('text', '')
if description:
connection['description'] = description
connections[cat].append(connection)
return connections
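The commented-out requests.post call above hints at the equivalent direct request; a standalone sketch of the same GraphQL lookup, assuming the requests package (now a hard dependency) and a trimmed-down query:

    import requests

    def fetch_connections(imdb_id):
        # mirrors Imdb._get_connections() without the ox cache layer -- illustrative only
        query = ('query { title(id: "tt%s") { connections(first: 100) { edges { node '
                 '{ associatedTitle { id titleText { text } } category { text } text } } } } }') % imdb_id
        r = requests.post(
            'https://caching.graphql.imdb.com/',
            json={'query': query},
            headers={'content-type': 'application/json'},
        )
        r.raise_for_status()
        return r.json()['data']['title']['connections']['edges']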
class ImdbCombined(Imdb): class ImdbCombined(Imdb):
def __init__(self, id, timeout=-1): def __init__(self, id, timeout=-1):

@@ -2,7 +2,7 @@
# encoding: utf-8 # encoding: utf-8
from __future__ import print_function from __future__ import print_function
import re import re
from six.moves.urllib.parse import urlencode from urllib.parse import urlencode
from ox.cache import read_url from ox.cache import read_url
from ox.html import decode_html, strip_tags from ox.html import decode_html, strip_tags

@@ -2,7 +2,7 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from six.moves.urllib.parse import quote from urllib.parse import quote
from lxml.html import document_fromstring from lxml.html import document_fromstring
from ox.cache import read_url from ox.cache import read_url

@@ -1,10 +1,9 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
import json
from multiprocessing.pool import ThreadPool from multiprocessing.pool import ThreadPool
from six import string_types
from ..cache import read_url from ..cache import read_url
from .. import decode_html from .. import decode_html
from ..utils import datetime from ..utils import datetime
@@ -12,15 +11,15 @@ from ..utils import datetime
def cleanup(key, data, data_type): def cleanup(key, data, data_type):
if data: if data:
if isinstance(data[0], string_types): if isinstance(data[0], str):
#FIXME: some types need strip_tags #FIXME: some types need strip_tags
#data = [strip_tags(decode_html(p)).strip() for p in data] #data = [strip_tags(decode_html(p)).strip() for p in data]
data = [decode_html(p).strip() for p in data] data = [decode_html(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple): elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data] data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1 and not isinstance(data, string_types): while len(data) == 1 and not isinstance(data, str):
data = data[0] data = data[0]
if data_type == 'list' and isinstance(data, string_types): if data_type == 'list' and isinstance(data, str):
data = [data, ] data = [data, ]
elif data_type != 'list': elif data_type != 'list':
data = '' data = ''
@@ -30,6 +29,7 @@ class SiteParser(dict):
baseUrl = '' baseUrl = ''
regex = {} regex = {}
pool = ThreadPool(8) pool = ThreadPool(8)
debug = False
def get_url(self, page): def get_url(self, page):
return "%s%s" % (self.baseUrl, page) return "%s%s" % (self.baseUrl, page)
@@ -47,7 +47,7 @@
for key in self.regex: for key in self.regex:
url = self.get_url(self.regex[key]['page']) url = self.get_url(self.regex[key]['page'])
data = self.read_url(url, timeout) data = self.read_url(url, timeout)
if isinstance(self.regex[key]['re'], string_types): if isinstance(self.regex[key]['re'], str):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data) data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type']) data = cleanup(key, data, self.regex[key]['type'])
elif callable(self.regex[key]['re']): elif callable(self.regex[key]['re']):
@@ -58,7 +58,7 @@
f = r f = r
else: else:
f = re.compile(r, re.DOTALL).findall f = re.compile(r, re.DOTALL).findall
if isinstance(data, string_types): if isinstance(data, str):
data = f(data) data = f(data)
else: else:
data = [f(d) for d in data] data = [f(d) for d in data]
@@ -76,6 +76,10 @@
elif self.regex[key]['type'] == 'date': elif self.regex[key]['type'] == 'date':
parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d') parse_date = lambda d: d and datetime.strptime('-'.join(d), '%m-%d-%Y').strftime('%Y-%m-%d')
data = apply_f(parse_date, data) data = apply_f(parse_date, data)
elif self.regex[key]['type'] == 'json':
if isinstance(data, list) and len(data) == 1:
data = data[0]
data = json.loads(data)
if data: if data:
self[key] = data self[key] = data
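The new 'json' type lets a rule hand back parsed JSON instead of a string; a hypothetical regex entry (key and page chosen for illustration, not part of this diff) would look like:

    regex = {
        'pageData': {
            'page': 'reference',
            're': '<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
            'type': 'json',   # a single match is unwrapped from the list and passed to json.loads()
        },
    }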

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from six.moves import urllib import urllib
import lxml.html import lxml.html
import ox import ox

@@ -3,7 +3,7 @@
from datetime import datetime from datetime import datetime
import re import re
from six.moves.urllib.parse import quote from urllib.parse import quote
from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normalize_newlines from ox import find_re, cache, strip_tags, decode_html, get_torrent_info, normalize_newlines
from ox.normalize import normalize_imdbid from ox.normalize import normalize_imdbid

@@ -2,7 +2,7 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from datetime import datetime from datetime import datetime
from six.moves.urllib.parse import quote from urllib.parse import quote
import lxml.html import lxml.html
import ox import ox

@@ -4,8 +4,7 @@ from __future__ import print_function
import re import re
from six.moves import urllib import urllib
from six import string_types
from ox.utils import json from ox.utils import json
from ox.cache import read_url from ox.cache import read_url
@@ -69,7 +68,7 @@ def get_movie_data(wikipedia_url):
value = value.split('<br>') value = value.split('<br>')
if value: if value:
if key in filmbox: if key in filmbox:
if isinstance(value, list) and isinstance(filmbox[key], string_types): if isinstance(value, list) and isinstance(filmbox[key], str):
filmbox[key] = [filmbox[key]] + value filmbox[key] = [filmbox[key]] + value
else: else:
filmbox[key] += value filmbox[key] += value

@@ -1,8 +1,8 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from six.moves.urllib.parse import quote, unquote_plus from urllib.parse import quote, unquote_plus
from six.moves import urllib import urllib
from six.moves import http_cookiejar as cookielib from http import cookiejar as cookielib
import re import re
from xml.dom.minidom import parseString from xml.dom.minidom import parseString
import json import json

@@ -1,3 +1,4 @@
chardet chardet
six>=1.5.2
lxml lxml
requests
Pillow>=10

@@ -2,14 +2,12 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
# encoding: utf-8 # encoding: utf-8
try: from setuptools import setup
from setuptools import setup
except:
from distutils.core import setup
def get_revision(): def get_git_version():
import subprocess import subprocess
return subprocess.check_output(['git', 'rev-list', 'HEAD', '--count']).decode().strip() version = subprocess.check_output(['git', 'describe', '--tags']).decode().strip().replace('-', '.')
return '.'.join((version.split('.') + ['0'])[:3])
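get_git_version() now derives the package version from the most recent tag; a worked example of the transformation (tag and commit hash invented):

    # `git describe --tags` -> '3.0.0-42-g1a2b3c4'
    version = '3.0.0-42-g1a2b3c4'.replace('-', '.')        # '3.0.0.42.g1a2b3c4'
    version = '.'.join((version.split('.') + ['0'])[:3])   # '3.0.0'
    # an exact tag like '3.0' is padded instead: '3.0' -> '3.0.0'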
def get_version(): def get_version():
import os import os
@@ -18,9 +16,8 @@ def get_version():
__version = os.path.join(os.path.dirname(__file__), 'ox/__version.py') __version = os.path.join(os.path.dirname(__file__), 'ox/__version.py')
changelog = os.path.join(os.path.dirname(__file__), 'debian/changelog') changelog = os.path.join(os.path.dirname(__file__), 'debian/changelog')
if os.path.exists(_git): if os.path.exists(_git):
rev = get_revision() version = get_git_version()
if rev: if version:
version = "2.3.%s" % rev
with open(__version, 'w') as fd: with open(__version, 'w') as fd:
fd.write('VERSION="%s"' % version) fd.write('VERSION="%s"' % version)
return version return version
@@ -37,8 +34,8 @@ def get_version():
f.close() f.close()
rev = re.compile('\d+\.\d+\.(\d+)').findall(head) rev = re.compile('\d+\.\d+\.(\d+)').findall(head)
if rev: if rev:
return '2.3.%s' % rev[0] return '3.0.%s' % rev[0]
return '2.3.x' return '3.0.x'
setup( setup(
@@ -50,17 +47,13 @@ setup(
url="https://code.0x2620.org/0x2620/python-ox", url="https://code.0x2620.org/0x2620/python-ox",
license="GPLv3", license="GPLv3",
packages=['ox', 'ox.torrent', 'ox.web'], packages=['ox', 'ox.torrent', 'ox.web'],
install_requires=['six>=1.5.2', 'chardet'], install_requires=['chardet', 'requests'],
keywords=[ keywords=[
], ],
classifiers=[ classifiers=[
'Operating System :: OS Independent', 'Operating System :: OS Independent',
'Programming Language :: Python', 'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Software Development :: Libraries :: Python Modules',
], ],
) )