diff --git a/ox/__init__.py b/ox/__init__.py index 98402fb..c90de79 100644 --- a/ox/__init__.py +++ b/ox/__init__.py @@ -5,7 +5,7 @@ try: from . import __version __version__ = __version.VERSION except: - __version__ = '2.3.x' + __version__ = '3.0.x' from . import cache from . import js @@ -17,7 +17,6 @@ from . import vtt from .api import * from .file import * -from .form import * from .format import * from .geo import * from .html import * diff --git a/ox/api.py b/ox/api.py index b784788..3e43493 100644 --- a/ox/api.py +++ b/ox/api.py @@ -4,19 +4,20 @@ from __future__ import print_function from types import MethodType import gzip +import mimetypes import os import shutil import sys import time -from six.moves import http_cookiejar as cookielib -from six import BytesIO, PY2 -from six.moves import urllib -from six.moves.urllib.parse import urlparse +from http import cookiejar as cookielib +from io import BytesIO +import urllib +from urllib.parse import urlparse +import requests from . import __version__ from .utils import json -from .form import MultiPartForm __all__ = ['getAPI', 'API'] @@ -37,12 +38,13 @@ class API(object): self._cj = cj else: self._cj = cookielib.CookieJar() - self._opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self._cj), - urllib.request.HTTPHandler(debuglevel=self.debuglevel)) - self._opener.addheaders = [ - ('User-Agent', '%s/%s' % (self.__name__, self.__version__)) - ] + self._requests_session = requests.Session() + self._requests_session.cookies = self._cj + self._requests_session.headers = { + 'User-Agent': '%s/%s' % (self.__name__, self.__version__), + 'Accept-Encoding': 'gzip, deflate', + } self.url = url r = self._request('api', {'docs': True}) self._properties = r['data']['actions'] @@ -53,10 +55,7 @@ class API(object): def _add_method(self, method, name): if name is None: name = method.func_name - if PY2: - setattr(self, name, MethodType(method, self, type(self))) - else: - setattr(self, name, MethodType(method, self)) + setattr(self, name, MethodType(method, self)) def _add_action(self, action): def method(self, *args, **kw): @@ -70,37 +69,20 @@ class API(object): return self._request(action, kw) if 'doc' in self._properties[action]: method.__doc__ = self._properties[action]['doc'] - if PY2: - method.func_name = str(action) - else: - method.func_name = action + method.func_name = action self._add_method(method, action) - def _json_request(self, url, form): + def _json_request(self, url, data, files=None): result = {} try: - body = form.body() - if PY2: - if not isinstance(url, bytes): - url = url.encode('utf-8') - request = urllib.request.Request(url) - request.add_data(body) - else: - request = urllib.request.Request(url, data=body, method='POST') - request.add_header('Content-Type', form.get_content_type()) - request.add_header('Content-Length', str(len(body))) - request.add_header('Accept-Encoding', 'gzip, deflate') - f = self._opener.open(request) - result = f.read() - if f.headers.get('content-encoding', None) == 'gzip': - result = gzip.GzipFile(fileobj=BytesIO(result)).read() - result = result.decode('utf-8') - return json.loads(result) + request = self._requests_session.post(url, data=data, files=files) + result = request.json() + return result except urllib.error.HTTPError as e: if self.DEBUG: import webbrowser if e.code >= 500: - with open('/tmp/error.html', 'w') as f: + with open('/tmp/error.html', 'wb') as f: f.write(e.read()) webbrowser.open_new_tab('/tmp/error.html') @@ -125,17 +107,15 @@ class API(object): raise def 
_request(self, action, data=None): - form = MultiPartForm() - form.add_field('action', action) + form = { + 'action': action + } if data: - form.add_field('data', json.dumps(data)) + form['data'] = json.dumps(data) return self._json_request(self.url, form) def get_url(self, url): - request = urllib.request.Request(url, method='GET') - f = self._opener.open(request) - result = f.read() - return result + return self._requests_session.get(url).content def save_url(self, url, filename, overwrite=False): chunk_size = 16 * 1024 @@ -143,21 +123,15 @@ class API(object): dirname = os.path.dirname(filename) if dirname and not os.path.exists(dirname): os.makedirs(dirname) - request = urllib.request.Request(url, method='GET') tmpname = filename + '.tmp' with open(tmpname, 'wb') as fd: - u = self._opener.open(request) - for chunk in iter(lambda: u.read(chunk_size), b''): + r = self._requests_session.get(url) + for chunk in iter(lambda: r.read(chunk_size), b''): fd.write(chunk) shutil.move(tmpname, filename) - def upload_chunks(self, url, filename, data=None, silent=False): - form = MultiPartForm() - if data: - for key in data: - form.add_field(key, data[key]) - data = self._json_request(url, form) + data = self._json_request(url, data) def full_url(path): if path.startswith('/'): @@ -178,16 +152,20 @@ class API(object): resume_offset = 0 chunk = f.read(CHUNK_SIZE) fname = os.path.basename(filename) + mime_type = mimetypes.guess_type(fname)[0] or 'application/octet-stream' if not isinstance(fname, bytes): fname = fname.encode('utf-8') while chunk: - form = MultiPartForm() - form.add_file('chunk', fname, chunk) + meta = { + 'offset': str(done) + } if len(chunk) < CHUNK_SIZE or f.tell() == fsize: - form.add_field('done', '1') - form.add_field('offset', str(done)) + meta['done'] = '1' + files = [ + ('chunk', (fname, chunk, mime_type)) + ] try: - data = self._json_request(uploadUrl, form) + data = self._json_request(uploadUrl, meta, files=files) except KeyboardInterrupt: if not slient: print("\ninterrupted by user.") diff --git a/ox/cache.py b/ox/cache.py index c359cbd..3954ea7 100644 --- a/ox/cache.py +++ b/ox/cache.py @@ -10,15 +10,11 @@ import sqlite3 import time import zlib -from six import BytesIO -from six.moves import urllib -from six import PY2 -try: - import requests - USE_REQUESTS = True - requests_session = requests.Session() -except: - USE_REQUESTS = False +from io import BytesIO +import urllib +import requests +from requests.structures import CaseInsensitiveDict + from .utils import json from .file import makedirs @@ -28,6 +24,7 @@ from .net import DEFAULT_HEADERS, detect_encoding cache_timeout = 30*24*60*60 # default is 30 days +requests_session = requests.Session() COMPRESS_TYPES = ( 'text/html', @@ -69,7 +66,7 @@ def get_headers(url, data=None, headers=None, timeout=cache_timeout): if not url_headers: url_headers = net.get_headers(url, data, headers) store.set(url, data, -1, url_headers) - return url_headers + return CaseInsensitiveDict(url_headers) def get_json(url, data=None, headers=None, timeout=cache_timeout): return json.loads(read_url(url, data, headers, timeout).decode('utf-8')) @@ -101,32 +98,20 @@ def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, un result = store.get(url, data, headers, timeout) url_headers = {} if not result: - if USE_REQUESTS: - if headers is None: - headers = DEFAULT_HEADERS.copy() - r = requests_session.get(url, headers=headers) - for key in r.headers: - url_headers[key.lower()] = r.headers[key] - result = r.content - 
url_headers['Status'] = "%s" % r.status_code - if not valid or valid(result, url_headers): - store.set(url, post_data=data, data=result, headers=url_headers) - else: - raise InvalidResult(result, url_headers) + if headers is None: + headers = DEFAULT_HEADERS.copy() + if data: + r = requests_session.post(url, data=data, headers=headers) else: - try: - url_headers, result = net.read_url(url, data, headers, return_headers=True) - except urllib.error.HTTPError as e: - e.headers['Status'] = "%s" % e.code - for key in e.headers: - url_headers[key.lower()] = e.headers[key] - result = e.read() - if url_headers.get('content-encoding', None) == 'gzip': - result = gzip.GzipFile(fileobj=BytesIO(result)).read() - if not valid or valid(result, url_headers): - store.set(url, post_data=data, data=result, headers=url_headers) - else: - raise InvalidResult(result, url_headers) + r = requests_session.get(url, headers=headers) + for key in r.headers: + url_headers[key.lower()] = r.headers[key] + result = r.content + url_headers['Status'] = "%s" % r.status_code + if not valid or valid(result, url_headers): + store.set(url, post_data=data, data=result, headers=url_headers) + else: + raise InvalidResult(result, url_headers) if unicode: ctype = url_headers.get('content-type', '').lower() if 'charset' in ctype: @@ -239,8 +224,6 @@ class SQLiteCache(Cache): elif value == 'data': if row[1] == 1: r = zlib.decompress(r) - elif PY2: - r = str(r) break c.close() @@ -279,6 +262,8 @@ class SQLiteCache(Cache): data = zlib.compress(data) else: compressed = 0 + if isinstance(data, str): + data = data.encode("utf-8") data = sqlite3.Binary(data) #fixme: this looks wrong diff --git a/ox/file.py b/ox/file.py index f12aee7..4b74b36 100644 --- a/ox/file.py +++ b/ox/file.py @@ -19,7 +19,8 @@ __all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs', 'iexists'] EXTENSIONS = { 'audio': [ 'aac', 'aif', 'aiff', 'amr', - 'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus' + 'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus', + 'ra', # Real Audio ], 'image': [ 'bmp', 'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp' @@ -29,11 +30,12 @@ EXTENSIONS = { ], 'video': [ '3gp', - 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm4v', 'mkv', 'mov', 'mp4', - 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf', + 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm2v', 'm4v', 'mkv', 'mov', 'mp4', + 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'vob', 'webm', 'wmv', 'asf', 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD 'mxf', 'ts', 'dat', # VOD files + 'rm', 'rmvb', # Real Media ], } @@ -214,12 +216,16 @@ def ffprobe(filename): ] for s in ffinfo['streams']: tags = s.pop('tags', {}) + side_data_list = s.pop('side_data_list', []) language = None for t in tags: if t == 'language': language = tags[t] else: info['metadata'][t] = tags[t] + for kv in side_data_list: + for k, v in kv.items(): + info['metadata'][k] = v if s.get('codec_type') in ('audio', 'video'): stream = {} if language and language != 'und': @@ -273,9 +279,15 @@ def ffprobe(filename): pass # print s for v in info['video']: - if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-180, -90, 90, 180): - v['width'], v['height'] = v['height'], v['width'] k = 'display_aspect_ratio' + if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-90, 90): + v['width'], v['height'] = v['height'], v['width'] + if k in v: + v[k] = ':'.join(reversed(v[k].split(':'))) + elif 'rotation' in info.get('metadata', {}) and 
int(info['metadata']['rotation']) in (-90, 90): + v['width'], v['height'] = v['height'], v['width'] + if k in v: + v[k] = ':'.join(reversed(v[k].split(':'))) if k not in v and 'width' in v \ or (k in v and v[k] == '0:1'): v[k] = '%d:%d' % (v['width'], v['height']) diff --git a/ox/fixunicode.py b/ox/fixunicode.py index d3a162d..e0386c6 100644 --- a/ox/fixunicode.py +++ b/ox/fixunicode.py @@ -6,7 +6,6 @@ from __future__ import print_function import unicodedata -from six import unichr, text_type __all__ = ['fix_bad_unicode'] @@ -151,7 +150,7 @@ def text_badness(text): - Improbable single-byte characters, such as ƒ or ¬ - Letters in somewhat rare scripts ''' - assert isinstance(text, text_type) + assert isinstance(text, str) errors = 0 very_weird_things = 0 weird_things = 0 @@ -289,7 +288,7 @@ SINGLE_BYTE_WEIRDNESS = ( # Pre-cache the Unicode data saying which of these first 256 characters are # letters. We'll need it often. SINGLE_BYTE_LETTERS = [ - unicodedata.category(unichr(i)).startswith('L') + unicodedata.category(chr(i)).startswith('L') for i in range(256) ] diff --git a/ox/form.py b/ox/form.py deleted file mode 100644 index faa1551..0000000 --- a/ox/form.py +++ /dev/null @@ -1,108 +0,0 @@ -# -*- coding: utf-8 -*- -# vi:si:et:sw=4:sts=4:ts=4 -# GPL 2014 -from __future__ import print_function - -import itertools -import mimetypes -import os -import hashlib -import sys - -from six import PY2 - - -__all__ = ['MultiPartForm'] - -# from /usr/lib/python3.4/email/generator.py -# Helper used by Generator._make_boundary -_width = len(repr(sys.maxsize-1)) -_fmt = '%%0%dd' % _width - -def _make_boundary(): - # Craft a random boundary. - boundary = ('=' * 15) + hashlib.sha1(os.urandom(32)).hexdigest() + '==' - return boundary - -class MultiPartForm(object): - """Accumulate the data to be used when posting a form.""" - - def __init__(self): - self.form_fields = [] - self.files = [] - self.boundary = _make_boundary() - return - - def get_content_type(self): - return 'multipart/form-data; boundary=%s' % self.boundary - - def add_field(self, name, value): - """Add a simple field to the form data.""" - if isinstance(name, bytes): - name = name.decode('utf-8') - if isinstance(value, bytes): - value = value.decode('utf-8') - self.form_fields.append((name, value)) - return - - def add_file(self, fieldname, filename, fileHandle, mimetype=None): - """Add a file to be uploaded.""" - if isinstance(fieldname, bytes): - fieldname = fieldname.decode('utf-8') - if isinstance(filename, bytes): - filename = filename.decode('utf-8') - - if hasattr(fileHandle, 'read'): - body = fileHandle.read() - else: - body = fileHandle - if mimetype is None: - mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream' - self.files.append((fieldname, filename, mimetype, body)) - return - - def __str__(self): - body = self.body() - if not PY2: - body = body.decode('utf-8') - return body - - def body(self): - """Return a byte string representing the form data, including attached files.""" - # Build a list of lists, each containing "lines" of the - # request. Each part is separated by a boundary string. - # Once the list is built, return a string where each - # line is separated by '\r\n'. 
- parts = [] - part_boundary = '--' + self.boundary - - # Add the form fields - parts.extend( - [ part_boundary, - 'Content-Disposition: form-data; name="%s"' % name, - '', - value, - ] - for name, value in self.form_fields - ) - - # Add the files to upload - parts.extend( - [ part_boundary, - 'Content-Disposition: file; name="%s"; filename="%s"' % \ - (field_name, filename), - 'Content-Type: %s' % content_type, - '', - body, - ] - for field_name, filename, content_type, body in self.files - ) - - # Flatten the list and add closing boundary marker, - # then return CR+LF separated data - flattened = list(itertools.chain(*parts)) - flattened.append('--' + self.boundary + '--') - flattened.append('') - flattened = [part if isinstance(part, bytes) else part.encode('utf-8') for part in flattened] - return b'\r\n'.join(flattened) - diff --git a/ox/format.py b/ox/format.py index ad18c31..83756c1 100644 --- a/ox/format.py +++ b/ox/format.py @@ -4,8 +4,6 @@ import math import re import string -from six import text_type - def toAZ(num): """ Converts an integer to bijective base 26 string using A-Z @@ -108,7 +106,7 @@ def to32(q): >>> to32(555306645) 'GHJKMN' - >>> to32(800197332334559L) + >>> to32(800197332334559) 'PQRSTVWXYZ' >>> to32(32) @@ -226,36 +224,36 @@ def to36(q): def from36(q): return int(q, 36) -def int_value(strValue, default=u''): +def int_value(strValue, default=''): """ >>> int_value('abc23') - u'23' + '23' >>> int_value(' abc23') - u'23' + '23' >>> int_value('ab') - u'' + '' """ try: - val = re.compile('(\d+)').findall(text_type(strValue).strip())[0] + val = re.compile('(\d+)').findall(str(strValue).strip())[0] except: val = default return val -def float_value(strValue, default=u''): +def float_value(strValue, default=''): """ >>> float_value('abc23.4') - u'23.4' + '23.4' >>> float_value(' abc23.4') - u'23.4' + '23.4' >>> float_value('ab') - u'' + '' """ try: - val = re.compile('([\d.]+)').findall(text_type(strValue).strip())[0] + val = re.compile('([\d.]+)').findall(str(strValue).strip())[0] except: val = default return val diff --git a/ox/html.py b/ox/html.py index 73234ea..8666713 100644 --- a/ox/html.py +++ b/ox/html.py @@ -3,8 +3,7 @@ # GPL 2008 import re import string -from six.moves.html_entities import name2codepoint -from six import unichr, PY2, string_types +from html.entities import name2codepoint letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' @@ -26,8 +25,7 @@ link_target_attribute_re = re.compile(r'(]*?)target=[^\s>]+') html_gunk_re = re.compile(r'(?:
<br clear="all">|<\/i>|<\/b>|<\/em>|<\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>
\s*)+\Z') -if PY2: - del x # Temporary variable + def escape(html): ''' @@ -36,7 +34,7 @@ def escape(html): >>> escape('html "test" & ') 'html "test" & <brothers>' ''' - if not isinstance(html, string_types): + if not isinstance(html, str): html = str(html) return html.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''') @@ -147,20 +145,20 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?') def decode_html(html): """ >>> decode_html('me & you and $&%') - u'me & you and $&%' + 'me & you and $&%' >>> decode_html('€') - u'\u20ac' + '\u20ac' >>> decode_html('Anniversary of Daoud's Republic') - u"Anniversary of Daoud's Republic" + "Anniversary of Daoud's Republic" """ if isinstance(html, bytes): html = html.decode('utf-8') - uchr = unichr + uchr = chr def entitydecode(match, uchr=uchr): entity = match.group(1) if entity == '#x80': - return u'€' + return '€' elif entity.startswith('#x'): return uchr(int(entity[2:], 16)) elif entity.startswith('#'): @@ -171,7 +169,7 @@ def decode_html(html): return "'" else: return match.group(0) - return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ') + return charrefpat.sub(entitydecode, html).replace('\xa0', ' ') def highlight(text, query, hlClass="hl"): """ @@ -189,51 +187,51 @@ def highlight(text, query, hlClass="hl"): def escape_html(value): ''' - >>> escape_html(u'') - u'<script>alert()</script>' + '<script>alert()</script>' >>> sanitize_html("'foo' < 'bar' && \\"foo\\" > \\"bar\\"") - u'\\'foo\\' < \\'bar\\' && "foo" > "bar"' + '\\'foo\\' < \\'bar\\' && "foo" > "bar"' >>> sanitize_html('foo') - u'foo' + 'foo' >>> sanitize_html('foo') - u'foo' + 'foo' >>> sanitize_html('Anniversary of Daoud's Republic') - u"Anniversary of Daoud's Republic" + "Anniversary of Daoud's Republic" >>> sanitize_html('') - u'' + '' >>> sanitize_html(' ') - u' ' - >>> sanitize_html(u' ') # canonicalised to a space: okay, I suppose - u' ' - >>> sanitize_html(u'\u00a0') # also nbsp - u' ' + ' ' + >>> sanitize_html(' ') # canonicalised to a space: okay, I suppose + ' ' + >>> sanitize_html('\u00a0') # also nbsp + ' ' ''' if not tags: valid_url = '^((https?:\/\/|\/|mailto:).*?)' @@ -414,24 +412,24 @@ def sanitize_fragment(html): are quoted, etc. Does not strip potentially-malicious HTML: use sanitize_html() for that. - >>> sanitize_fragment(u'') - u'' - >>> sanitize_fragment(u'') - u'' - >>> sanitize_fragment(u'


') - u'

' - >>> sanitize_fragment(u'
foo') - u'foo' - >>> sanitize_fragment(u'') - u'' - >>> sanitize_fragment(u' ') - u' ' - >>> sanitize_fragment(u' ') - u'\\xa0' - >>> sanitize_fragment(u'\\u00a0') # nbsp - u'\\xa0' - >>> sanitize_fragment(u'\\ufeff') # zero-width no-break space - u'\\ufeff' + >>> sanitize_fragment('') + '' + >>> sanitize_fragment('') + '' + >>> sanitize_fragment('


') + '

' + >>> sanitize_fragment('foo') + 'foo' + >>> sanitize_fragment('') + '' + >>> sanitize_fragment(' ') + ' ' + >>> sanitize_fragment(' ') + '\\xa0' + >>> sanitize_fragment('\\u00a0') # nbsp + '\\xa0' + >>> sanitize_fragment('\\ufeff') # zero-width no-break space + '\\ufeff' ''' ''' @@ -442,7 +440,12 @@ def sanitize_fragment(html): if not html.strip(): return html import lxml.html - body = lxml.html.document_fromstring(html).find('body') + try: + body = lxml.html.document_fromstring(html).find('body') + except lxml.etree.ParserError as e: + if e.args and e.args[0] == 'Document is empty': + return html + raise e html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8') if html.startswith('

<p>') and html.endswith('</p>

'): html = html[3:-4] diff --git a/ox/image.py b/ox/image.py index 0fad5eb..5b76042 100644 --- a/ox/image.py +++ b/ox/image.py @@ -22,11 +22,17 @@ ZONE_INDEX = [ ] ] + +def textsize(draw, text, font): + left, top, right, bottom = draw.textbbox((0, 0), text, font=font) + return (right, bottom) + + def drawText(image, position, text, font_file, font_size, color): draw = ImageDraw.Draw(image) font = ImageFont.truetype(font_file, font_size, encoding='unic') draw.text(position, text, fill=color, font=font) - size = draw.textsize(text, font=font) + size = textsize(draw, text, font) version = getattr(Image, 'PILLOW_VERSION', None) if version and version > '2.1.0' and version < '2.6.1': offset = font.getoffset(text) @@ -57,7 +63,7 @@ def getHSL(rgb): return tuple(hsl) def getImageHash(image_file, mode): - image = Image.open(image_file).convert('RGB').resize((8, 8), Image.ANTIALIAS) + image = Image.open(image_file).convert('RGB').resize((8, 8), Image.LANCZOS) image_hash = 0 if mode == 'color': # divide the image into 8 zones: @@ -99,7 +105,7 @@ def getImageHash(image_file, mode): return image_hash def getImageHeat(image_file): - image = Image.open(image_file).convert('RGB').resize((16, 16), Image.ANTIALIAS) + image = Image.open(image_file).convert('RGB').resize((16, 16), Image.LANCZOS) pixel = image.load() image_heat = 0 for y in range(image.size[1]): @@ -114,7 +120,7 @@ def getImageHeat(image_file): return image_heat / 256 def getImageHSL(image_file): - image = Image.open(image_file).convert('RGB').resize((1, 1), Image.ANTIALIAS) + image = Image.open(image_file).convert('RGB').resize((1, 1), Image.LANCZOS) return getHSL(image.getpixel((0, 0))) def getRGB(hsl): @@ -148,7 +154,7 @@ def getRGB(hsl): def getTextSize(image, text, font_file, font_size): draw = ImageDraw.Draw(image) font = ImageFont.truetype(font_file, font_size, encoding='unic') - size = draw.textsize(text, font=font) + size = textsize(draw, text, font) version = getattr(Image, 'PILLOW_VERSION', None) if version and version > '2.1.0' and version < '2.6.1': offset = font.getoffset(text) @@ -168,7 +174,7 @@ def wrapText(text, max_width, max_lines, font_file, font_size): return min_width def get_width(string): - return draw.textsize(string, font=font)[0] + return textsize(draw, string, font)[0] image = Image.new('RGB', (1, 1)) draw = ImageDraw.Draw(image) diff --git a/ox/js.py b/ox/js.py index 2f419bd..9e9f1cb 100644 --- a/ox/js.py +++ b/ox/js.py @@ -2,19 +2,12 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 -from six import PY2 from .utils import json def minify(source, comment=''): # see https://github.com/douglascrockford/JSMin/blob/master/README def get_next_non_whitespace_token(): pass - # python2 performance with unicode string is terrible - if PY2: - if isinstance(source, unicode): # pylint: disable=undefined-variable - source = source.encode('utf-8') - if isinstance(comment, unicode): # pylint: disable=undefined-variable - comment = comment.encode('utf-8') tokens = tokenize(source) length = len(tokens) minified = '/*' + comment + '*/' if comment else '' diff --git a/ox/movie.py b/ox/movie.py index cbf591d..54ede0c 100644 --- a/ox/movie.py +++ b/ox/movie.py @@ -206,7 +206,7 @@ def parse_path(path, directory_key='director'): string = re.sub(re.compile('(?<=\w)_(?=\w)', re.U), '/', string) string = re.sub(' _ ', ' / ', string) # 'foo_ ' is ':' - string = re.sub(re.compile('(?<=\w)_ ', re.U), ': ', string) + string = re.sub(re.compile('(?<=[\w\)\]])_ ', re.U), ': ', string) string = unicodedata.normalize('NFD', string) return 
string diff --git a/ox/net.py b/ox/net.py index 3a07d91..4d58bad 100644 --- a/ox/net.py +++ b/ox/net.py @@ -8,13 +8,10 @@ import os import re import struct -try: - import requests - USE_REQUESTS = True -except: - USE_REQUESTS = False -from six import BytesIO, PY2 -from six.moves import urllib +import requests + +from io import BytesIO +import urllib from chardet.universaldetector import UniversalDetector @@ -59,14 +56,10 @@ def get_json(url, data=None, headers=None): def open_url(url, data=None, headers=None): if headers is None: headers = DEFAULT_HEADERS.copy() - if PY2: - if not isinstance(url, bytes): - url = url.encode('utf-8') - else: - if isinstance(url, bytes): - url = url.decode('utf-8') + if isinstance(url, bytes): + url = url.decode('utf-8') url = url.replace(' ', '%20') - if data and not PY2 and not isinstance(data, bytes): + if data and not isinstance(data, bytes): data = data.encode('utf-8') req = urllib.request.Request(url, data, headers) return urllib.request.urlopen(req) @@ -123,16 +116,11 @@ def save_url(url, filename, overwrite=False): if dirname and not os.path.exists(dirname): os.makedirs(dirname) headers = DEFAULT_HEADERS.copy() - if USE_REQUESTS: - r = requests.get(url, headers=headers, stream=True) - with open(filename, 'wb') as f: - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - f.write(chunk) - else: - data = read_url(url) - with open(filename, 'wb') as f: - f.write(data) + r = requests.get(url, headers=headers, stream=True) + with open(filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + f.write(chunk) def _get_size(url): req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy()) diff --git a/ox/normalize.py b/ox/normalize.py index dea40ae..4ee9293 100644 --- a/ox/normalize.py +++ b/ox/normalize.py @@ -4,8 +4,6 @@ import re import unicodedata -from six import string_types - _articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el', "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de', @@ -103,7 +101,7 @@ def normalize_imdbid(imdbId): >>> normalize_imdbid('tt0159206') '0159206' """ - if isinstance(imdbId, string_types): + if isinstance(imdbId, str): imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId) elif isinstance(imdbId, int): imdbId = "%07d" % imdbId diff --git a/ox/srt.py b/ox/srt.py index c29ae8b..464c08e 100644 --- a/ox/srt.py +++ b/ox/srt.py @@ -5,7 +5,6 @@ import codecs import re import chardet -from six import PY2 import ox @@ -24,10 +23,7 @@ def _detect_encoding(fp): # go to beginning of file and get the first 4 bytes oldFP = fp.tell() fp.seek(0) - if PY2: - (byte1, byte2, byte3, byte4) = [ord(b) for b in fp.read(4)] - else: - (byte1, byte2, byte3, byte4) = fp.read(4) + (byte1, byte2, byte3, byte4) = fp.read(4) # try bom detection using 4 bytes, 3 bytes, or 2 bytes bomDetection = bomDict.get((byte1, byte2, byte3, byte4)) diff --git a/ox/text.py b/ox/text.py index 282afa2..d650262 100644 --- a/ox/text.py +++ b/ox/text.py @@ -1,11 +1,13 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 # GPL 2008 +import gzip import math import re import unicodedata +from io import BytesIO -from six.moves import reduce +from functools import reduce ARTICLES = list(set([ # def sg, def pl, indef sg, indef pl (each m/f/n) @@ -473,10 +475,10 @@ def wrap(text, width): def wrap_string(string, length=80, separator='\n', balance=False): ''' - >>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16) - u"Anticonstitution\\nellement, Paris 
\\ns'eveille" + >>> wrap_string("Anticonstitutionellement, Paris s'eveille", 16) + "Anticonstitution\\nellement, Paris \\ns'eveille" >>> wrap_string(u'All you can eat', 12, '\\n', True) - u'All you \\ncan eat' + 'All you \\ncan eat' ''' words = string.split(' ') if balance: @@ -491,20 +493,20 @@ def wrap_string(string, length=80, separator='\n', balance=False): break lines = [''] for word in words: - if len(lines[len(lines) - 1] + word + u' ') <= length + 1: + if len(lines[len(lines) - 1] + word + ' ') <= length + 1: # word fits in current line - lines[len(lines) - 1] += word + u' ' + lines[len(lines) - 1] += word + ' ' else: if len(word) <= length: # word fits in next line - lines.append(word + u' ') + lines.append(word + ' ') else: # word is longer than line position = length - len(lines[len(lines) - 1]) lines[len(lines) - 1] += word[0:position] for i in range(position, len(word), length): lines.append(word[i:i+length]) - lines[len(lines) - 1] += u' ' + lines[len(lines) - 1] += ' ' return separator.join(lines).strip() def truncate_string(string, length, padding='...', position='right'): @@ -576,14 +578,14 @@ def get_valid_filename(s): def get_text_list(list_, last_word='or'): """ - >>> get_text_list([u'a', u'b', u'c', u'd']) - u'a, b, c or d' - >>> get_text_list([u'a', u'b', u'c'], 'and') - u'a, b and c' - >>> get_text_list([u'a', u'b'], 'and') - u'a and b' - >>> get_text_list([u'a']) - u'a' + >>> get_text_list(['a', 'b', 'c', 'd']) + 'a, b, c or d' + >>> get_text_list(['a', 'b', 'c'], 'and') + 'a, b and c' + >>> get_text_list(['a', 'b'], 'and') + 'a and b' + >>> get_text_list(['a']) + 'a' >>> get_text_list([]) '' """ @@ -591,24 +593,24 @@ def get_text_list(list_, last_word='or'): return '' if len(list_) == 1: return list_[0] - return u'%s %s %s' % (u', '.join([i for i in list_][:-1]), last_word, list_[-1]) + return '%s %s %s' % (', '.join([i for i in list_][:-1]), last_word, list_[-1]) def get_list_text(text, last_word='or'): """ - >>> get_list_text(u'a, b, c or d') - [u'a', u'b', u'c', u'd'] - >>> get_list_text(u'a, b and c', u'and') - [u'a', u'b', u'c'] - >>> get_list_text(u'a and b', u'and') - [u'a', u'b'] - >>> get_list_text(u'a') - [u'a'] - >>> get_list_text(u'') + >>> get_list_text('a, b, c or d') + ['a', 'b', 'c', 'd'] + >>> get_list_text('a, b and c', 'and') + ['a', 'b', 'c'] + >>> get_list_text('a and b', 'and') + ['a', 'b'] + >>> get_list_text('a') + ['a'] + >>> get_list_text('') [] """ list_ = [] if text: - list_ = text.split(u', ') + list_ = text.split(', ') if list_: i = len(list_)-1 last = list_[i].split(last_word) @@ -646,8 +648,6 @@ def phone2numeric(phone): return letters.sub(char2number, phone) def compress_string(s): - import gzip - from six import BytesIO zbuf = BytesIO() zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) zfile.write(s) @@ -682,7 +682,7 @@ def words(text): return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text] def sort_string(string): - string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th') + string = string.replace('Æ', 'AE').replace('Ø', 'O').replace('Þ', 'Th') # pad numbered titles string = re.sub('(\d),(\d{3})', '\\1\\2', string) diff --git a/ox/torrent/__init__.py b/ox/torrent/__init__.py index a250215..9c399fe 100644 --- a/ox/torrent/__init__.py +++ b/ox/torrent/__init__.py @@ -5,12 +5,8 @@ from threading import Event from hashlib import sha1 import os -from six import PY2 -if PY2: - from .bencode import bencode, bdecode -else: - from .bencode3 import bencode, bdecode +from .bencode3 import bencode, bdecode 
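For context, with the PY2 branch removed ox.torrent relies on bencode3 alone. A minimal round-trip sketch, assuming bencode3 keeps the same bencode/bdecode pair and accepts plain dicts, ints and strings; the metainfo values below are made up for illustration:

```python
from ox.torrent.bencode3 import bencode, bdecode

# placeholder metainfo dict, not taken from any real torrent
meta = {'announce': 'http://tracker.example/announce',
        'info': {'name': 'clip.webm', 'length': 100000}}
encoded = bencode(meta)     # bencoded blob (bytes)
decoded = bdecode(encoded)  # should give back the same structure
```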
__all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size'] diff --git a/ox/torrent/bencode.py b/ox/torrent/bencode.py deleted file mode 100644 index b586001..0000000 --- a/ox/torrent/bencode.py +++ /dev/null @@ -1,321 +0,0 @@ -# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman -# see LICENSE.txt for license information -from __future__ import print_function - -from types import IntType, LongType, StringType, ListType, TupleType, DictType -try: - from types import BooleanType -except ImportError: - BooleanType = None -try: - from types import UnicodeType -except ImportError: - UnicodeType = None -from cStringIO import StringIO - -def decode_int(x, f): - f += 1 - newf = x.index('e', f) - try: - n = int(x[f:newf]) - except: - n = long(x[f:newf]) - if x[f] == '-': - if x[f + 1] == '0': - raise ValueError - elif x[f] == '0' and newf != f+1: - raise ValueError - return (n, newf+1) - -def decode_string(x, f): - colon = x.index(':', f) - try: - n = int(x[f:colon]) - except (OverflowError, ValueError): - n = long(x[f:colon]) - if x[f] == '0' and colon != f+1: - raise ValueError - colon += 1 - return (x[colon:colon+n], colon+n) - -def decode_unicode(x, f): - s, f = decode_string(x, f+1) - return (s.decode('UTF-8'),f) - -def decode_list(x, f): - r, f = [], f+1 - while x[f] != 'e': - v, f = decode_func[x[f]](x, f) - r.append(v) - return (r, f + 1) - -def decode_dict(x, f): - r, f = {}, f+1 - lastkey = None - while x[f] != 'e': - k, f = decode_string(x, f) - # why is this needed - # if lastkey >= k: - # raise ValueError - lastkey = k - r[k], f = decode_func[x[f]](x, f) - return (r, f + 1) - -decode_func = {} -decode_func['l'] = decode_list -decode_func['d'] = decode_dict -decode_func['i'] = decode_int -decode_func['0'] = decode_string -decode_func['1'] = decode_string -decode_func['2'] = decode_string -decode_func['3'] = decode_string -decode_func['4'] = decode_string -decode_func['5'] = decode_string -decode_func['6'] = decode_string -decode_func['7'] = decode_string -decode_func['8'] = decode_string -decode_func['9'] = decode_string -#decode_func['u'] = decode_unicode - -def bdecode(x, sloppy = 1): - try: - r, l = decode_func[x[0]](x, 0) -# except (IndexError, KeyError): - except (IndexError, KeyError, ValueError): - raise ValueError("bad bencoded data") - if not sloppy and l != len(x): - raise ValueError("bad bencoded data") - return r - -def test_bdecode(): - try: - bdecode('0:0:') - assert 0 - except ValueError: - pass - try: - bdecode('ie') - assert 0 - except ValueError: - pass - try: - bdecode('i341foo382e') - assert 0 - except ValueError: - pass - assert bdecode('i4e') == 4 - assert bdecode('i0e') == 0 - assert bdecode('i123456789e') == 123456789 - assert bdecode('i-10e') == -10 - try: - bdecode('i-0e') - assert 0 - except ValueError: - pass - try: - bdecode('i123') - assert 0 - except ValueError: - pass - try: - bdecode('') - assert 0 - except ValueError: - pass - try: - bdecode('i6easd') - assert 0 - except ValueError: - pass - try: - bdecode('35208734823ljdahflajhdf') - assert 0 - except ValueError: - pass - try: - bdecode('2:abfdjslhfld') - assert 0 - except ValueError: - pass - assert bdecode('0:') == '' - assert bdecode('3:abc') == 'abc' - assert bdecode('10:1234567890') == '1234567890' - try: - bdecode('02:xy') - assert 0 - except ValueError: - pass - try: - bdecode('l') - assert 0 - except ValueError: - pass - assert bdecode('le') == [] - try: - bdecode('leanfdldjfh') - assert 0 - except ValueError: - pass - assert 
bdecode('l0:0:0:e') == ['', '', ''] - try: - bdecode('relwjhrlewjh') - assert 0 - except ValueError: - pass - assert bdecode('li1ei2ei3ee') == [1, 2, 3] - assert bdecode('l3:asd2:xye') == ['asd', 'xy'] - assert bdecode('ll5:Alice3:Bobeli2ei3eee') == [['Alice', 'Bob'], [2, 3]] - try: - bdecode('d') - assert 0 - except ValueError: - pass - try: - bdecode('defoobar') - assert 0 - except ValueError: - pass - assert bdecode('de') == {} - assert bdecode('d3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': 'blue'} - assert bdecode('d8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': 'Alice', 'length': 100000}} - try: - bdecode('d3:fooe') - assert 0 - except ValueError: - pass - try: - bdecode('di1e0:e') - assert 0 - except ValueError: - pass - try: - bdecode('d1:b0:1:a0:e') - assert 0 - except ValueError: - pass - try: - bdecode('d1:a0:1:a0:e') - assert 0 - except ValueError: - pass - try: - bdecode('i03e') - assert 0 - except ValueError: - pass - try: - bdecode('l01:ae') - assert 0 - except ValueError: - pass - try: - bdecode('9999:x') - assert 0 - except ValueError: - pass - try: - bdecode('l0:') - assert 0 - except ValueError: - pass - try: - bdecode('d0:0:') - assert 0 - except ValueError: - pass - try: - bdecode('d0:') - assert 0 - except ValueError: - pass - -bencached_marker = [] - -class Bencached: - def __init__(self, s): - self.marker = bencached_marker - self.bencoded = s - -BencachedType = type(Bencached('')) # insufficient, but good as a filter - -def encode_bencached(x,r): - assert x.marker == bencached_marker - r.append(x.bencoded) - -def encode_int(x,r): - r.extend(('i',str(x),'e')) - -def encode_bool(x,r): - encode_int(int(x),r) - -def encode_string(x,r): - r.extend((str(len(x)),':',x)) - -def encode_unicode(x,r): - #r.append('u') - encode_string(x.encode('UTF-8'),r) - -def encode_list(x,r): - r.append('l') - for e in x: - encode_func[type(e)](e, r) - r.append('e') - -def encode_dict(x,r): - r.append('d') - ilist = x.items() - ilist.sort() - for k,v in ilist: - r.extend((str(len(k)),':',k)) - encode_func[type(v)](v, r) - r.append('e') - -encode_func = {} -encode_func[BencachedType] = encode_bencached -encode_func[IntType] = encode_int -encode_func[LongType] = encode_int -encode_func[StringType] = encode_string -encode_func[ListType] = encode_list -encode_func[TupleType] = encode_list -encode_func[DictType] = encode_dict -if BooleanType: - encode_func[BooleanType] = encode_bool -if UnicodeType: - encode_func[UnicodeType] = encode_unicode - -def bencode(x): - r = [] - try: - encode_func[type(x)](x, r) - except: - print("*** error *** could not encode type %s (value: %s)" % (type(x), x)) - assert 0 - return ''.join(r) - -def test_bencode(): - assert bencode(4) == 'i4e' - assert bencode(0) == 'i0e' - assert bencode(-10) == 'i-10e' - assert bencode(12345678901234567890) == 'i12345678901234567890e' - assert bencode('') == '0:' - assert bencode('abc') == '3:abc' - assert bencode('1234567890') == '10:1234567890' - assert bencode([]) == 'le' - assert bencode([1, 2, 3]) == 'li1ei2ei3ee' - assert bencode([['Alice', 'Bob'], [2, 3]]) == 'll5:Alice3:Bobeli2ei3eee' - assert bencode({}) == 'de' - assert bencode({'age': 25, 'eyes': 'blue'}) == 'd3:agei25e4:eyes4:bluee' - assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == 'd8:spam.mp3d6:author5:Alice6:lengthi100000eee' - try: - bencode({1: 'foo'}) - assert 0 - except AssertionError: - pass - - -try: - import psyco - psyco.bind(bdecode) - psyco.bind(bencode) -except ImportError: - pass diff --git 
a/ox/torrent/makemetafile.py b/ox/torrent/makemetafile.py index 31d6ebe..c2db27a 100644 --- a/ox/torrent/makemetafile.py +++ b/ox/torrent/makemetafile.py @@ -8,11 +8,7 @@ from hashlib import sha1 as sha from copy import copy import re -from six import PY2 -if PY2: - from .bencode import bencode -else: - from .bencode3 import bencode +from .bencode3 import bencode from threading import Event from time import time from traceback import print_exc diff --git a/ox/web/allmovie.py b/ox/web/allmovie.py index fdb7a46..c94c438 100644 --- a/ox/web/allmovie.py +++ b/ox/web/allmovie.py @@ -13,13 +13,13 @@ def get_id(url): def get_data(id): ''' >>> get_data('129689')['cast'][1][1] - u'Marianne' + 'Marianne' >>> get_data('129689')['credits'][0][0] - u'Jean-Luc Godard' + 'Jean-Luc Godard' >>> get_data('129689')['posters'][0] - u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg' + 'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg' >>> get_data('129689')['rating'] - u'4.5' + '4.5' ''' if id.startswith('http'): id = get_id(id) diff --git a/ox/web/amazon.py b/ox/web/amazon.py index 19a72c7..d721d5c 100644 --- a/ox/web/amazon.py +++ b/ox/web/amazon.py @@ -2,7 +2,7 @@ # vi:si:et:sw=4:sts=4:ts=4 from __future__ import print_function import re -from six.moves.urllib.parse import quote +from urllib.parse import quote from ox import find_re, strip_tags, decode_html from ox.cache import read_url diff --git a/ox/web/apple.py b/ox/web/apple.py index 099d6cb..84abba0 100644 --- a/ox/web/apple.py +++ b/ox/web/apple.py @@ -2,7 +2,6 @@ from __future__ import print_function import json import re -from six import text_type from ox.cache import read_url HEADERS = { @@ -17,9 +16,9 @@ USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) ' USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3' def get_movie_data(title, director): - if isinstance(title, text_type): + if isinstance(title, str): title = title.encode('utf-8') - if isinstance(director, text_type): + if isinstance(director, str): director = director.encode('utf-8') data = {} # itunes section (preferred source for link) diff --git a/ox/web/archive.py b/ox/web/archive.py index 0c733c3..3e7ab47 100644 --- a/ox/web/archive.py +++ b/ox/web/archive.py @@ -3,8 +3,6 @@ from .. import cache from ..utils import json -from six import string_types - def get_id(url): return url.split("/")[-1] @@ -21,7 +19,7 @@ def get_data(id): data[key] = details['metadata'][key] if isinstance(data[key], list): data[key] = data[key][0] - if isinstance(data[key], string_types): + if isinstance(data[key], str): data[key] = data[key].strip() if data[key][0] == '[' and data[key][-1] == ']': data[key] = data[key][1:-1] diff --git a/ox/web/arsenalberlin.py b/ox/web/arsenalberlin.py index e5a0dd2..ca77b5e 100644 --- a/ox/web/arsenalberlin.py +++ b/ox/web/arsenalberlin.py @@ -19,18 +19,18 @@ def get_data(id, language='en'): if 'Willkommen in der Datenbank des Arsenal' in html: return None data = {} - data[u'id'] = id - data[u'url'] = url + data['id'] = id + data['url'] = url m = re.compile('

(.*?)

').findall(html) if m: - data[u'title'] = m[0] + data['title'] = m[0] m = re.compile("Director: (.*?)").findall(html) if m: - data[u'director'] = m[0] + data['director'] = m[0] m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html) if m: - data[u'image'] = m[0] + data['image'] = m[0] units = re.compile("
(.*?)
", re.DOTALL).findall(html) for x in map(re.compile('(.*?): (.*)', re.DOTALL).findall, units): @@ -43,7 +43,7 @@ def get_data(id, language='en'): else: data[key] = strip_tags(data[key]) if "running time (minutes)" in data: - data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60 + data['runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60 for key in ('year', 'length in metres', 'forum participation year', 'number of reels'): if key in data and data[key].isdigit(): data[key] = int(data[key]) diff --git a/ox/web/criterion.py b/ox/web/criterion.py index 6cef01e..67d4a8a 100644 --- a/ox/web/criterion.py +++ b/ox/web/criterion.py @@ -19,13 +19,13 @@ def get_url(id): def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False): ''' >>> get_data('1333').get('imdbId') - u'0060304' + '0060304' >>> get_data('236')['posters'][0] - u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg' + 'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg' >>> get_data('786')['posters'][0] - u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg' + 'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg' ''' data = { "id": id, @@ -39,12 +39,16 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False): data["number"] = find_re(html, "Spine #(\d+)") data["title"] = decode_html(find_re(html, "

(.*?)

")) - data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip() + data["title"] = data["title"].split(' \u2014 The Television Version')[0].strip() results = find_re(html, '
    (.*?)
') info = re.compile('
  • (.*?)
  • ', re.DOTALL).findall(results) info = {k: strip_tags(v).strip() for k, v in info} + meta = re.compile('.*?src="(.*?)"', re.DOTALL).findall(html) + #result = find_re(html, "\"Film>> get_id(imdb='0133093') - u'the-matrix' + 'the-matrix' #>>> get_id(imdb='0060304') - #u'2-or-3-things-i-know-about-her' + #'2-or-3-things-i-know-about-her' ''' if imdb: i = ImdbCombined(imdb) diff --git a/ox/web/google.py b/ox/web/google.py index 72aa32f..01bb7ce 100644 --- a/ox/web/google.py +++ b/ox/web/google.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 import re -from six.moves import urllib +import urllib import ox from ox import strip_tags, decode_html @@ -17,6 +17,31 @@ def quote_plus(s): s = s.encode('utf-8') return urllib.parse.quote_plus(s) + +def infobox(query, timeout=DEFAULT_TIMEOUT): + import lxml.html + data = read_url(url, timeout=timeout) + doc = lxml.html.document_fromstring(data) + k = 'kp-wholepage' + wholepage = doc.cssselect('.' + k) + infobox = {} + if wholepage: + page = wholepage[0] + for a in page.cssselect('a'): + if a.attrib.get('href', '').startswith('http'): + domain = '.'.join(a.attrib['href'].split('/')[2].split('.')[-2:]) + infobox[domain] = a.attrib['href'] + for e in page.cssselect('*[data-attrid]'): + key = e.attrib['data-attrid'] + value = e.text_content() + if value and key not in ( + 'kc:/film/film:media_actions_wholepage', + 'action:watch_film' + ): + infobox[key] = value + return infobox + + def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT): """ Return max_results tuples with title, url, description diff --git a/ox/web/imdb.py b/ox/web/imdb.py index ac12c83..4b08cab 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 -from __future__ import print_function +from collections import defaultdict +import json import re import time import unicodedata -from six.moves.urllib.parse import urlencode -from six import text_type, string_types +from urllib.parse import urlencode from .. import find_re, strip_tags, decode_html from .. import cache @@ -16,13 +16,13 @@ from .. import cache from . siteparser import SiteParser from . 
import duckduckgo from ..utils import datetime -from ..geo import normalize_country_name +from ..geo import normalize_country_name, get_country_name def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False): headers = headers.copy() # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau - headers['X-Forwarded-For'] = '72.21.206.80' + #headers['X-Forwarded-For'] = '72.21.206.80' headers['Accept-Language'] = 'en' return url, data, headers, timeout, unicode @@ -106,6 +106,131 @@ def technical(label): } +def tech_spec(metadata): + tech = {} + for row in metadata['props']['pageProps']['contentData']['section']['items']: + title = { + 'aspect ratio': 'aspectratio', + 'sound mix': 'sound', + }.get(row['rowTitle'].lower(), row['rowTitle'].lower()) + tech[title] = [] + for content in row['listContent']: + value = content['text'] + tech[title].append(value) + return tech + + +def movie_connections(metadata): + + connections = {} + if 'props' not in metadata: + return connections + for row in metadata['props']['pageProps']['contentData']['categories']: + title = { + }.get(row['name'], row['name']) + if title not in connections: + connections[title] = [] + + for item in row['section']['items']: + item_ = { + 'id': item['id'][2:], + } + + item_['title'] = re.compile('(.*?)').findall(item['listContent'][0]['html'])[0] + if len(item['listContent']) >=2: + item_['description'] = strip_tags(item['listContent'][1]['html']) + connections[title].append(item_) + return connections + + +def get_category_by_id(metadata, id): + for category in metadata['props']['pageProps']['contentData']['categories']: + if category['id'] == id: + return category + + +def get_release_date(metadata): + releases = get_category_by_id(metadata, 'releases') + def parse_date(d): + parsed = None + for fmt in ( + '%B %d, %Y', + '%d %B %Y', + '%B %Y', + ): + try: + parsed = datetime.strptime(d, fmt) + break + except: + pass + if not parsed: + return None + return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day) + + dates = [] + for item in releases['section']['items']: + content = item['listContent'][0] + date = parse_date(content['text']) + if date: + dates.append(date) + + if dates: + return min(dates) + +def get_locations(metadata): + try: + locations = [ + row['cardText'] + for row in metadata['props']['pageProps']['contentData']['categories'][0]['section']['items'] + ] + except: + locations = [] + return locations + + +def get_keywords(metadata): + try: + keywords = [ + row['rowTitle'] + for row in metadata['props']['pageProps']['contentData']['section']['items'] + ] + except: + keywords = [] + return keywords + + +def get_entity_metadata(metadata): + data = {} + entity = metadata['props']['pageProps']['contentData']['entityMetadata'] + data['title'] = entity['titleText']['text'] + data['originalTitle'] = entity['originalTitleText']['text'] + data['year'] = entity['releaseYear']['year'] + data['plot'] = entity['plot']['plotText']['plainText'] + data['country'] = [get_country_name(c['id']) for c in entity['countriesOfOrigin']['countries']] + data['poster'] = metadata['props']['pageProps']['contentData']['posterData']['image']['url'] + return data + + +def alternative_titles(metadata): + titles = defaultdict(list) + akas = get_category_by_id(metadata, 'akas') + + skip = [ + metadata['props']['pageProps']['contentData']['entityMetadata']['titleText']['text'], + 
metadata['props']['pageProps']['contentData']['entityMetadata']['originalTitleText']['text'] + ] + for row in akas['section']['items']: + content = row['listContent'][0] + title = content['text'] + country = row['rowTitle'] + if title in skip: + continue + titles[title].append(country) + #if content.get('subText'): + # titles[-1]['subText'] = content['subText'] + return [kv for kv in titles.items()] + + ''' 'posterIds': { 'page': 'posters', @@ -116,18 +241,17 @@ def technical(label): class Imdb(SiteParser): ''' - >>> Imdb('0068646')['title'] == text_type(u'The Godfather') + >>> Imdb('0068646')['title'] == 'The Godfather' True - >>> Imdb('0133093')['title'] == text_type(u'The Matrix') + >>> Imdb('0133093')['title'] == 'The Matrix' True ''' regex = { 'alternativeTitles': { 'page': 'releaseinfo', 're': [ - ']*?id="akas"[^>]*?>(.*?)', - "td[^>]*?>(.*?).*?]*?>(.*?)" + '
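For context, upload_chunks in ox/api.py now hands requests a plain dict plus a files= list instead of a hand-built MultiPartForm body. A rough sketch of the request this shape produces; the URL and chunk bytes are placeholders, and only the data=/files= layout comes from the patch above:

```python
import mimetypes
import requests

session = requests.Session()
upload_url = 'https://example.com/api/upload/'             # placeholder endpoint
fname = 'clip.webm'
mime_type = mimetypes.guess_type(fname)[0] or 'application/octet-stream'

meta = {'offset': '0', 'done': '1'}                         # plain form fields
files = [('chunk', (fname, b'<chunk bytes>', mime_type))]   # one file part
# requests assembles the multipart/form-data body (boundary, per-part
# Content-Type headers) that MultiPartForm used to build by hand.
response = session.post(upload_url, data=meta, files=files)
result = response.json()
```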