diff --git a/ox/__init__.py b/ox/__init__.py index c90de79..98402fb 100644 --- a/ox/__init__.py +++ b/ox/__init__.py @@ -5,7 +5,7 @@ try: from . import __version __version__ = __version.VERSION except: - __version__ = '3.0.x' + __version__ = '2.3.x' from . import cache from . import js @@ -17,6 +17,7 @@ from . import vtt from .api import * from .file import * +from .form import * from .format import * from .geo import * from .html import * diff --git a/ox/api.py b/ox/api.py index 3e43493..b784788 100644 --- a/ox/api.py +++ b/ox/api.py @@ -4,20 +4,19 @@ from __future__ import print_function from types import MethodType import gzip -import mimetypes import os import shutil import sys import time -from http import cookiejar as cookielib -from io import BytesIO -import urllib -from urllib.parse import urlparse -import requests +from six.moves import http_cookiejar as cookielib +from six import BytesIO, PY2 +from six.moves import urllib +from six.moves.urllib.parse import urlparse from . import __version__ from .utils import json +from .form import MultiPartForm __all__ = ['getAPI', 'API'] @@ -38,13 +37,12 @@ class API(object): self._cj = cj else: self._cj = cookielib.CookieJar() + self._opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self._cj), + urllib.request.HTTPHandler(debuglevel=self.debuglevel)) + self._opener.addheaders = [ + ('User-Agent', '%s/%s' % (self.__name__, self.__version__)) + ] - self._requests_session = requests.Session() - self._requests_session.cookies = self._cj - self._requests_session.headers = { - 'User-Agent': '%s/%s' % (self.__name__, self.__version__), - 'Accept-Encoding': 'gzip, deflate', - } self.url = url r = self._request('api', {'docs': True}) self._properties = r['data']['actions'] @@ -55,7 +53,10 @@ class API(object): def _add_method(self, method, name): if name is None: name = method.func_name - setattr(self, name, MethodType(method, self)) + if PY2: + setattr(self, name, MethodType(method, self, type(self))) + else: + setattr(self, name, MethodType(method, self)) def _add_action(self, action): def method(self, *args, **kw): @@ -69,20 +70,37 @@ class API(object): return self._request(action, kw) if 'doc' in self._properties[action]: method.__doc__ = self._properties[action]['doc'] - method.func_name = action + if PY2: + method.func_name = str(action) + else: + method.func_name = action self._add_method(method, action) - def _json_request(self, url, data, files=None): + def _json_request(self, url, form): result = {} try: - request = self._requests_session.post(url, data=data, files=files) - result = request.json() - return result + body = form.body() + if PY2: + if not isinstance(url, bytes): + url = url.encode('utf-8') + request = urllib.request.Request(url) + request.add_data(body) + else: + request = urllib.request.Request(url, data=body, method='POST') + request.add_header('Content-Type', form.get_content_type()) + request.add_header('Content-Length', str(len(body))) + request.add_header('Accept-Encoding', 'gzip, deflate') + f = self._opener.open(request) + result = f.read() + if f.headers.get('content-encoding', None) == 'gzip': + result = gzip.GzipFile(fileobj=BytesIO(result)).read() + result = result.decode('utf-8') + return json.loads(result) except urllib.error.HTTPError as e: if self.DEBUG: import webbrowser if e.code >= 500: - with open('/tmp/error.html', 'wb') as f: + with open('/tmp/error.html', 'w') as f: f.write(e.read()) webbrowser.open_new_tab('/tmp/error.html') @@ -107,15 +125,17 @@ class API(object): raise def 
_request(self, action, data=None): - form = { - 'action': action - } + form = MultiPartForm() + form.add_field('action', action) if data: - form['data'] = json.dumps(data) + form.add_field('data', json.dumps(data)) return self._json_request(self.url, form) def get_url(self, url): - return self._requests_session.get(url).content + request = urllib.request.Request(url, method='GET') + f = self._opener.open(request) + result = f.read() + return result def save_url(self, url, filename, overwrite=False): chunk_size = 16 * 1024 @@ -123,15 +143,21 @@ class API(object): dirname = os.path.dirname(filename) if dirname and not os.path.exists(dirname): os.makedirs(dirname) + request = urllib.request.Request(url, method='GET') tmpname = filename + '.tmp' with open(tmpname, 'wb') as fd: - r = self._requests_session.get(url) - for chunk in iter(lambda: r.read(chunk_size), b''): + u = self._opener.open(request) + for chunk in iter(lambda: u.read(chunk_size), b''): fd.write(chunk) shutil.move(tmpname, filename) + def upload_chunks(self, url, filename, data=None, silent=False): - data = self._json_request(url, data) + form = MultiPartForm() + if data: + for key in data: + form.add_field(key, data[key]) + data = self._json_request(url, form) def full_url(path): if path.startswith('/'): @@ -152,20 +178,16 @@ class API(object): resume_offset = 0 chunk = f.read(CHUNK_SIZE) fname = os.path.basename(filename) - mime_type = mimetypes.guess_type(fname)[0] or 'application/octet-stream' if not isinstance(fname, bytes): fname = fname.encode('utf-8') while chunk: - meta = { - 'offset': str(done) - } + form = MultiPartForm() + form.add_file('chunk', fname, chunk) if len(chunk) < CHUNK_SIZE or f.tell() == fsize: - meta['done'] = '1' - files = [ - ('chunk', (fname, chunk, mime_type)) - ] + form.add_field('done', '1') + form.add_field('offset', str(done)) try: - data = self._json_request(uploadUrl, meta, files=files) + data = self._json_request(uploadUrl, form) except KeyboardInterrupt: if not slient: print("\ninterrupted by user.") diff --git a/ox/cache.py b/ox/cache.py index 3954ea7..c359cbd 100644 --- a/ox/cache.py +++ b/ox/cache.py @@ -10,11 +10,15 @@ import sqlite3 import time import zlib -from io import BytesIO -import urllib -import requests -from requests.structures import CaseInsensitiveDict - +from six import BytesIO +from six.moves import urllib +from six import PY2 +try: + import requests + USE_REQUESTS = True + requests_session = requests.Session() +except: + USE_REQUESTS = False from .utils import json from .file import makedirs @@ -24,7 +28,6 @@ from .net import DEFAULT_HEADERS, detect_encoding cache_timeout = 30*24*60*60 # default is 30 days -requests_session = requests.Session() COMPRESS_TYPES = ( 'text/html', @@ -66,7 +69,7 @@ def get_headers(url, data=None, headers=None, timeout=cache_timeout): if not url_headers: url_headers = net.get_headers(url, data, headers) store.set(url, data, -1, url_headers) - return CaseInsensitiveDict(url_headers) + return url_headers def get_json(url, data=None, headers=None, timeout=cache_timeout): return json.loads(read_url(url, data, headers, timeout).decode('utf-8')) @@ -98,20 +101,32 @@ def read_url(url, data=None, headers=None, timeout=cache_timeout, valid=None, un result = store.get(url, data, headers, timeout) url_headers = {} if not result: - if headers is None: - headers = DEFAULT_HEADERS.copy() - if data: - r = requests_session.post(url, data=data, headers=headers) - else: + if USE_REQUESTS: + if headers is None: + headers = DEFAULT_HEADERS.copy() r = 
requests_session.get(url, headers=headers) - for key in r.headers: - url_headers[key.lower()] = r.headers[key] - result = r.content - url_headers['Status'] = "%s" % r.status_code - if not valid or valid(result, url_headers): - store.set(url, post_data=data, data=result, headers=url_headers) + for key in r.headers: + url_headers[key.lower()] = r.headers[key] + result = r.content + url_headers['Status'] = "%s" % r.status_code + if not valid or valid(result, url_headers): + store.set(url, post_data=data, data=result, headers=url_headers) + else: + raise InvalidResult(result, url_headers) else: - raise InvalidResult(result, url_headers) + try: + url_headers, result = net.read_url(url, data, headers, return_headers=True) + except urllib.error.HTTPError as e: + e.headers['Status'] = "%s" % e.code + for key in e.headers: + url_headers[key.lower()] = e.headers[key] + result = e.read() + if url_headers.get('content-encoding', None) == 'gzip': + result = gzip.GzipFile(fileobj=BytesIO(result)).read() + if not valid or valid(result, url_headers): + store.set(url, post_data=data, data=result, headers=url_headers) + else: + raise InvalidResult(result, url_headers) if unicode: ctype = url_headers.get('content-type', '').lower() if 'charset' in ctype: @@ -224,6 +239,8 @@ class SQLiteCache(Cache): elif value == 'data': if row[1] == 1: r = zlib.decompress(r) + elif PY2: + r = str(r) break c.close() @@ -262,8 +279,6 @@ class SQLiteCache(Cache): data = zlib.compress(data) else: compressed = 0 - if isinstance(data, str): - data = data.encode("utf-8") data = sqlite3.Binary(data) #fixme: this looks wrong diff --git a/ox/file.py b/ox/file.py index 4b74b36..f12aee7 100644 --- a/ox/file.py +++ b/ox/file.py @@ -19,8 +19,7 @@ __all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs', 'iexists'] EXTENSIONS = { 'audio': [ 'aac', 'aif', 'aiff', 'amr', - 'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus', - 'ra', # Real Audio + 'flac', 'm4a', 'mp3', 'oga', 'ogg', 'wav', 'wma', 'opus' ], 'image': [ 'bmp', 'gif', 'jpeg', 'jpg', 'png', 'svg', 'webp' @@ -30,12 +29,11 @@ EXTENSIONS = { ], 'video': [ '3gp', - 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm2v', 'm4v', 'mkv', 'mov', 'mp4', - 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'vob', 'webm', 'wmv', 'asf', + 'avi', 'divx', 'dv', 'flv', 'm2t', 'm2ts', 'm4v', 'mkv', 'mov', 'mp4', + 'mpeg', 'mpg', 'mts', 'ogm', 'ogv', 'rm', 'rmvb', 'vob', 'webm', 'wmv', 'asf', 'mod', 'tod', # http://en.wikipedia.org/wiki/MOD_and_TOD 'mxf', 'ts', 'dat', # VOD files - 'rm', 'rmvb', # Real Media ], } @@ -216,16 +214,12 @@ def ffprobe(filename): ] for s in ffinfo['streams']: tags = s.pop('tags', {}) - side_data_list = s.pop('side_data_list', []) language = None for t in tags: if t == 'language': language = tags[t] else: info['metadata'][t] = tags[t] - for kv in side_data_list: - for k, v in kv.items(): - info['metadata'][k] = v if s.get('codec_type') in ('audio', 'video'): stream = {} if language and language != 'und': @@ -279,15 +273,9 @@ def ffprobe(filename): pass # print s for v in info['video']: + if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-180, -90, 90, 180): + v['width'], v['height'] = v['height'], v['width'] k = 'display_aspect_ratio' - if 'rotate' in info.get('metadata', {}) and int(info['metadata']['rotate']) in (-90, 90): - v['width'], v['height'] = v['height'], v['width'] - if k in v: - v[k] = ':'.join(reversed(v[k].split(':'))) - elif 'rotation' in info.get('metadata', {}) and int(info['metadata']['rotation']) in (-90, 90): - v['width'], v['height'] = 
v['height'], v['width'] - if k in v: - v[k] = ':'.join(reversed(v[k].split(':'))) if k not in v and 'width' in v \ or (k in v and v[k] == '0:1'): v[k] = '%d:%d' % (v['width'], v['height']) diff --git a/ox/fixunicode.py b/ox/fixunicode.py index e0386c6..d3a162d 100644 --- a/ox/fixunicode.py +++ b/ox/fixunicode.py @@ -6,6 +6,7 @@ from __future__ import print_function import unicodedata +from six import unichr, text_type __all__ = ['fix_bad_unicode'] @@ -150,7 +151,7 @@ def text_badness(text): - Improbable single-byte characters, such as ƒ or ¬ - Letters in somewhat rare scripts ''' - assert isinstance(text, str) + assert isinstance(text, text_type) errors = 0 very_weird_things = 0 weird_things = 0 @@ -288,7 +289,7 @@ SINGLE_BYTE_WEIRDNESS = ( # Pre-cache the Unicode data saying which of these first 256 characters are # letters. We'll need it often. SINGLE_BYTE_LETTERS = [ - unicodedata.category(chr(i)).startswith('L') + unicodedata.category(unichr(i)).startswith('L') for i in range(256) ] diff --git a/ox/form.py b/ox/form.py new file mode 100644 index 0000000..faa1551 --- /dev/null +++ b/ox/form.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- +# vi:si:et:sw=4:sts=4:ts=4 +# GPL 2014 +from __future__ import print_function + +import itertools +import mimetypes +import os +import hashlib +import sys + +from six import PY2 + + +__all__ = ['MultiPartForm'] + +# from /usr/lib/python3.4/email/generator.py +# Helper used by Generator._make_boundary +_width = len(repr(sys.maxsize-1)) +_fmt = '%%0%dd' % _width + +def _make_boundary(): + # Craft a random boundary. + boundary = ('=' * 15) + hashlib.sha1(os.urandom(32)).hexdigest() + '==' + return boundary + +class MultiPartForm(object): + """Accumulate the data to be used when posting a form.""" + + def __init__(self): + self.form_fields = [] + self.files = [] + self.boundary = _make_boundary() + return + + def get_content_type(self): + return 'multipart/form-data; boundary=%s' % self.boundary + + def add_field(self, name, value): + """Add a simple field to the form data.""" + if isinstance(name, bytes): + name = name.decode('utf-8') + if isinstance(value, bytes): + value = value.decode('utf-8') + self.form_fields.append((name, value)) + return + + def add_file(self, fieldname, filename, fileHandle, mimetype=None): + """Add a file to be uploaded.""" + if isinstance(fieldname, bytes): + fieldname = fieldname.decode('utf-8') + if isinstance(filename, bytes): + filename = filename.decode('utf-8') + + if hasattr(fileHandle, 'read'): + body = fileHandle.read() + else: + body = fileHandle + if mimetype is None: + mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream' + self.files.append((fieldname, filename, mimetype, body)) + return + + def __str__(self): + body = self.body() + if not PY2: + body = body.decode('utf-8') + return body + + def body(self): + """Return a byte string representing the form data, including attached files.""" + # Build a list of lists, each containing "lines" of the + # request. Each part is separated by a boundary string. + # Once the list is built, return a string where each + # line is separated by '\r\n'. 
+ parts = [] + part_boundary = '--' + self.boundary + + # Add the form fields + parts.extend( + [ part_boundary, + 'Content-Disposition: form-data; name="%s"' % name, + '', + value, + ] + for name, value in self.form_fields + ) + + # Add the files to upload + parts.extend( + [ part_boundary, + 'Content-Disposition: file; name="%s"; filename="%s"' % \ + (field_name, filename), + 'Content-Type: %s' % content_type, + '', + body, + ] + for field_name, filename, content_type, body in self.files + ) + + # Flatten the list and add closing boundary marker, + # then return CR+LF separated data + flattened = list(itertools.chain(*parts)) + flattened.append('--' + self.boundary + '--') + flattened.append('') + flattened = [part if isinstance(part, bytes) else part.encode('utf-8') for part in flattened] + return b'\r\n'.join(flattened) + diff --git a/ox/format.py b/ox/format.py index 83756c1..ad18c31 100644 --- a/ox/format.py +++ b/ox/format.py @@ -4,6 +4,8 @@ import math import re import string +from six import text_type + def toAZ(num): """ Converts an integer to bijective base 26 string using A-Z @@ -106,7 +108,7 @@ def to32(q): >>> to32(555306645) 'GHJKMN' - >>> to32(800197332334559) + >>> to32(800197332334559L) 'PQRSTVWXYZ' >>> to32(32) @@ -224,36 +226,36 @@ def to36(q): def from36(q): return int(q, 36) -def int_value(strValue, default=''): +def int_value(strValue, default=u''): """ >>> int_value('abc23') - '23' + u'23' >>> int_value(' abc23') - '23' + u'23' >>> int_value('ab') - '' + u'' """ try: - val = re.compile('(\d+)').findall(str(strValue).strip())[0] + val = re.compile('(\d+)').findall(text_type(strValue).strip())[0] except: val = default return val -def float_value(strValue, default=''): +def float_value(strValue, default=u''): """ >>> float_value('abc23.4') - '23.4' + u'23.4' >>> float_value(' abc23.4') - '23.4' + u'23.4' >>> float_value('ab') - '' + u'' """ try: - val = re.compile('([\d.]+)').findall(str(strValue).strip())[0] + val = re.compile('([\d.]+)').findall(text_type(strValue).strip())[0] except: val = default return val diff --git a/ox/html.py b/ox/html.py index 8666713..73234ea 100644 --- a/ox/html.py +++ b/ox/html.py @@ -3,7 +3,8 @@ # GPL 2008 import re import string -from html.entities import name2codepoint +from six.moves.html_entities import name2codepoint +from six import unichr, PY2, string_types letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' @@ -25,7 +26,8 @@ link_target_attribute_re = re.compile(r'(]*?)target=[^\s>]+') html_gunk_re = re.compile(r'(?:
<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>
\s*)+\Z') - +if PY2: + del x # Temporary variable def escape(html): ''' @@ -34,7 +36,7 @@ def escape(html): >>> escape('html "test" & ') 'html "test" & <brothers>' ''' - if not isinstance(html, str): + if not isinstance(html, string_types): html = str(html) return html.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''') @@ -145,20 +147,20 @@ charrefpat = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?') def decode_html(html): """ >>> decode_html('me & you and $&%') - 'me & you and $&%' + u'me & you and $&%' >>> decode_html('€') - '\u20ac' + u'\u20ac' >>> decode_html('Anniversary of Daoud's Republic') - "Anniversary of Daoud's Republic" + u"Anniversary of Daoud's Republic" """ if isinstance(html, bytes): html = html.decode('utf-8') - uchr = chr + uchr = unichr def entitydecode(match, uchr=uchr): entity = match.group(1) if entity == '#x80': - return '€' + return u'€' elif entity.startswith('#x'): return uchr(int(entity[2:], 16)) elif entity.startswith('#'): @@ -169,7 +171,7 @@ def decode_html(html): return "'" else: return match.group(0) - return charrefpat.sub(entitydecode, html).replace('\xa0', ' ') + return charrefpat.sub(entitydecode, html).replace(u'\xa0', ' ') def highlight(text, query, hlClass="hl"): """ @@ -187,51 +189,51 @@ def highlight(text, query, hlClass="hl"): def escape_html(value): ''' - >>> escape_html('') - '<script>alert()</script>' + u'<script>alert()</script>' >>> sanitize_html("'foo' < 'bar' && \\"foo\\" > \\"bar\\"") - '\\'foo\\' < \\'bar\\' && "foo" > "bar"' + u'\\'foo\\' < \\'bar\\' && "foo" > "bar"' >>> sanitize_html('foo') - 'foo' + u'foo' >>> sanitize_html('foo') - 'foo' + u'foo' >>> sanitize_html('Anniversary of Daoud's Republic') - "Anniversary of Daoud's Republic" + u"Anniversary of Daoud's Republic" >>> sanitize_html('') - '' + u'' >>> sanitize_html(' ') - ' ' - >>> sanitize_html(' ') # canonicalised to a space: okay, I suppose - ' ' - >>> sanitize_html('\u00a0') # also nbsp - ' ' + u' ' + >>> sanitize_html(u' ') # canonicalised to a space: okay, I suppose + u' ' + >>> sanitize_html(u'\u00a0') # also nbsp + u' ' ''' if not tags: valid_url = '^((https?:\/\/|\/|mailto:).*?)' @@ -412,24 +414,24 @@ def sanitize_fragment(html): are quoted, etc. Does not strip potentially-malicious HTML: use sanitize_html() for that. - >>> sanitize_fragment('') - '' - >>> sanitize_fragment('') - '' - >>> sanitize_fragment('


') - '

' - >>> sanitize_fragment('
foo') - 'foo' - >>> sanitize_fragment('') - '' - >>> sanitize_fragment(' ') - ' ' - >>> sanitize_fragment(' ') - '\\xa0' - >>> sanitize_fragment('\\u00a0') # nbsp - '\\xa0' - >>> sanitize_fragment('\\ufeff') # zero-width no-break space - '\\ufeff' + >>> sanitize_fragment(u'') + u'' + >>> sanitize_fragment(u'') + u'' + >>> sanitize_fragment(u'


') + u'

' + >>> sanitize_fragment(u'foo') + u'foo' + >>> sanitize_fragment(u'') + u'' + >>> sanitize_fragment(u' ') + u' ' + >>> sanitize_fragment(u' ') + u'\\xa0' + >>> sanitize_fragment(u'\\u00a0') # nbsp + u'\\xa0' + >>> sanitize_fragment(u'\\ufeff') # zero-width no-break space + u'\\ufeff' ''' ''' @@ -440,12 +442,7 @@ def sanitize_fragment(html): if not html.strip(): return html import lxml.html - try: - body = lxml.html.document_fromstring(html).find('body') - except lxml.etree.ParserError as e: - if e.args and e.args[0] == 'Document is empty': - return html - raise e + body = lxml.html.document_fromstring(html).find('body') html = lxml.html.tostring(body, encoding='utf-8')[6:-7].decode('utf-8') if html.startswith('

<p>') and html.endswith('</p>
'): html = html[3:-4] diff --git a/ox/image.py b/ox/image.py index 5b76042..0fad5eb 100644 --- a/ox/image.py +++ b/ox/image.py @@ -22,17 +22,11 @@ ZONE_INDEX = [ ] ] - -def textsize(draw, text, font): - left, top, right, bottom = draw.textbbox((0, 0), text, font=font) - return (right, bottom) - - def drawText(image, position, text, font_file, font_size, color): draw = ImageDraw.Draw(image) font = ImageFont.truetype(font_file, font_size, encoding='unic') draw.text(position, text, fill=color, font=font) - size = textsize(draw, text, font) + size = draw.textsize(text, font=font) version = getattr(Image, 'PILLOW_VERSION', None) if version and version > '2.1.0' and version < '2.6.1': offset = font.getoffset(text) @@ -63,7 +57,7 @@ def getHSL(rgb): return tuple(hsl) def getImageHash(image_file, mode): - image = Image.open(image_file).convert('RGB').resize((8, 8), Image.LANCZOS) + image = Image.open(image_file).convert('RGB').resize((8, 8), Image.ANTIALIAS) image_hash = 0 if mode == 'color': # divide the image into 8 zones: @@ -105,7 +99,7 @@ def getImageHash(image_file, mode): return image_hash def getImageHeat(image_file): - image = Image.open(image_file).convert('RGB').resize((16, 16), Image.LANCZOS) + image = Image.open(image_file).convert('RGB').resize((16, 16), Image.ANTIALIAS) pixel = image.load() image_heat = 0 for y in range(image.size[1]): @@ -120,7 +114,7 @@ def getImageHeat(image_file): return image_heat / 256 def getImageHSL(image_file): - image = Image.open(image_file).convert('RGB').resize((1, 1), Image.LANCZOS) + image = Image.open(image_file).convert('RGB').resize((1, 1), Image.ANTIALIAS) return getHSL(image.getpixel((0, 0))) def getRGB(hsl): @@ -154,7 +148,7 @@ def getRGB(hsl): def getTextSize(image, text, font_file, font_size): draw = ImageDraw.Draw(image) font = ImageFont.truetype(font_file, font_size, encoding='unic') - size = textsize(draw, text, font) + size = draw.textsize(text, font=font) version = getattr(Image, 'PILLOW_VERSION', None) if version and version > '2.1.0' and version < '2.6.1': offset = font.getoffset(text) @@ -174,7 +168,7 @@ def wrapText(text, max_width, max_lines, font_file, font_size): return min_width def get_width(string): - return textsize(draw, string, font)[0] + return draw.textsize(string, font=font)[0] image = Image.new('RGB', (1, 1)) draw = ImageDraw.Draw(image) diff --git a/ox/js.py b/ox/js.py index 9e9f1cb..2f419bd 100644 --- a/ox/js.py +++ b/ox/js.py @@ -2,12 +2,19 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 +from six import PY2 from .utils import json def minify(source, comment=''): # see https://github.com/douglascrockford/JSMin/blob/master/README def get_next_non_whitespace_token(): pass + # python2 performance with unicode string is terrible + if PY2: + if isinstance(source, unicode): # pylint: disable=undefined-variable + source = source.encode('utf-8') + if isinstance(comment, unicode): # pylint: disable=undefined-variable + comment = comment.encode('utf-8') tokens = tokenize(source) length = len(tokens) minified = '/*' + comment + '*/' if comment else '' diff --git a/ox/movie.py b/ox/movie.py index 54ede0c..cbf591d 100644 --- a/ox/movie.py +++ b/ox/movie.py @@ -206,7 +206,7 @@ def parse_path(path, directory_key='director'): string = re.sub(re.compile('(?<=\w)_(?=\w)', re.U), '/', string) string = re.sub(' _ ', ' / ', string) # 'foo_ ' is ':' - string = re.sub(re.compile('(?<=[\w\)\]])_ ', re.U), ': ', string) + string = re.sub(re.compile('(?<=\w)_ ', re.U), ': ', string) string = unicodedata.normalize('NFD', string) return 
string diff --git a/ox/net.py b/ox/net.py index 4d58bad..3a07d91 100644 --- a/ox/net.py +++ b/ox/net.py @@ -8,10 +8,13 @@ import os import re import struct -import requests - -from io import BytesIO -import urllib +try: + import requests + USE_REQUESTS = True +except: + USE_REQUESTS = False +from six import BytesIO, PY2 +from six.moves import urllib from chardet.universaldetector import UniversalDetector @@ -56,10 +59,14 @@ def get_json(url, data=None, headers=None): def open_url(url, data=None, headers=None): if headers is None: headers = DEFAULT_HEADERS.copy() - if isinstance(url, bytes): - url = url.decode('utf-8') + if PY2: + if not isinstance(url, bytes): + url = url.encode('utf-8') + else: + if isinstance(url, bytes): + url = url.decode('utf-8') url = url.replace(' ', '%20') - if data and not isinstance(data, bytes): + if data and not PY2 and not isinstance(data, bytes): data = data.encode('utf-8') req = urllib.request.Request(url, data, headers) return urllib.request.urlopen(req) @@ -116,11 +123,16 @@ def save_url(url, filename, overwrite=False): if dirname and not os.path.exists(dirname): os.makedirs(dirname) headers = DEFAULT_HEADERS.copy() - r = requests.get(url, headers=headers, stream=True) - with open(filename, 'wb') as f: - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - f.write(chunk) + if USE_REQUESTS: + r = requests.get(url, headers=headers, stream=True) + with open(filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + else: + data = read_url(url) + with open(filename, 'wb') as f: + f.write(data) def _get_size(url): req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy()) diff --git a/ox/normalize.py b/ox/normalize.py index 4ee9293..dea40ae 100644 --- a/ox/normalize.py +++ b/ox/normalize.py @@ -4,6 +4,8 @@ import re import unicodedata +from six import string_types + _articles = ('the', 'la', 'a', 'die', 'der', 'le', 'el', "l'", 'il', 'das', 'les', 'o', 'ein', 'i', 'un', 'los', 'de', @@ -101,7 +103,7 @@ def normalize_imdbid(imdbId): >>> normalize_imdbid('tt0159206') '0159206' """ - if isinstance(imdbId, str): + if isinstance(imdbId, string_types): imdbId = re.sub('.*(\d{7}).*', '\\1', imdbId) elif isinstance(imdbId, int): imdbId = "%07d" % imdbId diff --git a/ox/srt.py b/ox/srt.py index 464c08e..c29ae8b 100644 --- a/ox/srt.py +++ b/ox/srt.py @@ -5,6 +5,7 @@ import codecs import re import chardet +from six import PY2 import ox @@ -23,7 +24,10 @@ def _detect_encoding(fp): # go to beginning of file and get the first 4 bytes oldFP = fp.tell() fp.seek(0) - (byte1, byte2, byte3, byte4) = fp.read(4) + if PY2: + (byte1, byte2, byte3, byte4) = [ord(b) for b in fp.read(4)] + else: + (byte1, byte2, byte3, byte4) = fp.read(4) # try bom detection using 4 bytes, 3 bytes, or 2 bytes bomDetection = bomDict.get((byte1, byte2, byte3, byte4)) diff --git a/ox/text.py b/ox/text.py index d650262..282afa2 100644 --- a/ox/text.py +++ b/ox/text.py @@ -1,13 +1,11 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 # GPL 2008 -import gzip import math import re import unicodedata -from io import BytesIO -from functools import reduce +from six.moves import reduce ARTICLES = list(set([ # def sg, def pl, indef sg, indef pl (each m/f/n) @@ -475,10 +473,10 @@ def wrap(text, width): def wrap_string(string, length=80, separator='\n', balance=False): ''' - >>> wrap_string("Anticonstitutionellement, Paris s'eveille", 16) - "Anticonstitution\\nellement, Paris \\ns'eveille" 
+ >>> wrap_string(u"Anticonstitutionellement, Paris s'eveille", 16) + u"Anticonstitution\\nellement, Paris \\ns'eveille" >>> wrap_string(u'All you can eat', 12, '\\n', True) - 'All you \\ncan eat' + u'All you \\ncan eat' ''' words = string.split(' ') if balance: @@ -493,20 +491,20 @@ def wrap_string(string, length=80, separator='\n', balance=False): break lines = [''] for word in words: - if len(lines[len(lines) - 1] + word + ' ') <= length + 1: + if len(lines[len(lines) - 1] + word + u' ') <= length + 1: # word fits in current line - lines[len(lines) - 1] += word + ' ' + lines[len(lines) - 1] += word + u' ' else: if len(word) <= length: # word fits in next line - lines.append(word + ' ') + lines.append(word + u' ') else: # word is longer than line position = length - len(lines[len(lines) - 1]) lines[len(lines) - 1] += word[0:position] for i in range(position, len(word), length): lines.append(word[i:i+length]) - lines[len(lines) - 1] += ' ' + lines[len(lines) - 1] += u' ' return separator.join(lines).strip() def truncate_string(string, length, padding='...', position='right'): @@ -578,14 +576,14 @@ def get_valid_filename(s): def get_text_list(list_, last_word='or'): """ - >>> get_text_list(['a', 'b', 'c', 'd']) - 'a, b, c or d' - >>> get_text_list(['a', 'b', 'c'], 'and') - 'a, b and c' - >>> get_text_list(['a', 'b'], 'and') - 'a and b' - >>> get_text_list(['a']) - 'a' + >>> get_text_list([u'a', u'b', u'c', u'd']) + u'a, b, c or d' + >>> get_text_list([u'a', u'b', u'c'], 'and') + u'a, b and c' + >>> get_text_list([u'a', u'b'], 'and') + u'a and b' + >>> get_text_list([u'a']) + u'a' >>> get_text_list([]) '' """ @@ -593,24 +591,24 @@ def get_text_list(list_, last_word='or'): return '' if len(list_) == 1: return list_[0] - return '%s %s %s' % (', '.join([i for i in list_][:-1]), last_word, list_[-1]) + return u'%s %s %s' % (u', '.join([i for i in list_][:-1]), last_word, list_[-1]) def get_list_text(text, last_word='or'): """ - >>> get_list_text('a, b, c or d') - ['a', 'b', 'c', 'd'] - >>> get_list_text('a, b and c', 'and') - ['a', 'b', 'c'] - >>> get_list_text('a and b', 'and') - ['a', 'b'] - >>> get_list_text('a') - ['a'] - >>> get_list_text('') + >>> get_list_text(u'a, b, c or d') + [u'a', u'b', u'c', u'd'] + >>> get_list_text(u'a, b and c', u'and') + [u'a', u'b', u'c'] + >>> get_list_text(u'a and b', u'and') + [u'a', u'b'] + >>> get_list_text(u'a') + [u'a'] + >>> get_list_text(u'') [] """ list_ = [] if text: - list_ = text.split(', ') + list_ = text.split(u', ') if list_: i = len(list_)-1 last = list_[i].split(last_word) @@ -648,6 +646,8 @@ def phone2numeric(phone): return letters.sub(char2number, phone) def compress_string(s): + import gzip + from six import BytesIO zbuf = BytesIO() zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf) zfile.write(s) @@ -682,7 +682,7 @@ def words(text): return [re.sub("(([.!?:-_]|'s)$)", '', x) for x in text] def sort_string(string): - string = string.replace('Æ', 'AE').replace('Ø', 'O').replace('Þ', 'Th') + string = string.replace(u'Æ', 'AE').replace(u'Ø', 'O').replace(u'Þ', 'Th') # pad numbered titles string = re.sub('(\d),(\d{3})', '\\1\\2', string) diff --git a/ox/torrent/__init__.py b/ox/torrent/__init__.py index 9c399fe..a250215 100644 --- a/ox/torrent/__init__.py +++ b/ox/torrent/__init__.py @@ -5,8 +5,12 @@ from threading import Event from hashlib import sha1 import os +from six import PY2 -from .bencode3 import bencode, bdecode +if PY2: + from .bencode import bencode, bdecode +else: + from .bencode3 import bencode, bdecode __all__ = 
['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size'] diff --git a/ox/torrent/bencode.py b/ox/torrent/bencode.py new file mode 100644 index 0000000..b586001 --- /dev/null +++ b/ox/torrent/bencode.py @@ -0,0 +1,321 @@ +# Written by Petru Paler, Uoti Urpala, Ross Cohen and John Hoffman +# see LICENSE.txt for license information +from __future__ import print_function + +from types import IntType, LongType, StringType, ListType, TupleType, DictType +try: + from types import BooleanType +except ImportError: + BooleanType = None +try: + from types import UnicodeType +except ImportError: + UnicodeType = None +from cStringIO import StringIO + +def decode_int(x, f): + f += 1 + newf = x.index('e', f) + try: + n = int(x[f:newf]) + except: + n = long(x[f:newf]) + if x[f] == '-': + if x[f + 1] == '0': + raise ValueError + elif x[f] == '0' and newf != f+1: + raise ValueError + return (n, newf+1) + +def decode_string(x, f): + colon = x.index(':', f) + try: + n = int(x[f:colon]) + except (OverflowError, ValueError): + n = long(x[f:colon]) + if x[f] == '0' and colon != f+1: + raise ValueError + colon += 1 + return (x[colon:colon+n], colon+n) + +def decode_unicode(x, f): + s, f = decode_string(x, f+1) + return (s.decode('UTF-8'),f) + +def decode_list(x, f): + r, f = [], f+1 + while x[f] != 'e': + v, f = decode_func[x[f]](x, f) + r.append(v) + return (r, f + 1) + +def decode_dict(x, f): + r, f = {}, f+1 + lastkey = None + while x[f] != 'e': + k, f = decode_string(x, f) + # why is this needed + # if lastkey >= k: + # raise ValueError + lastkey = k + r[k], f = decode_func[x[f]](x, f) + return (r, f + 1) + +decode_func = {} +decode_func['l'] = decode_list +decode_func['d'] = decode_dict +decode_func['i'] = decode_int +decode_func['0'] = decode_string +decode_func['1'] = decode_string +decode_func['2'] = decode_string +decode_func['3'] = decode_string +decode_func['4'] = decode_string +decode_func['5'] = decode_string +decode_func['6'] = decode_string +decode_func['7'] = decode_string +decode_func['8'] = decode_string +decode_func['9'] = decode_string +#decode_func['u'] = decode_unicode + +def bdecode(x, sloppy = 1): + try: + r, l = decode_func[x[0]](x, 0) +# except (IndexError, KeyError): + except (IndexError, KeyError, ValueError): + raise ValueError("bad bencoded data") + if not sloppy and l != len(x): + raise ValueError("bad bencoded data") + return r + +def test_bdecode(): + try: + bdecode('0:0:') + assert 0 + except ValueError: + pass + try: + bdecode('ie') + assert 0 + except ValueError: + pass + try: + bdecode('i341foo382e') + assert 0 + except ValueError: + pass + assert bdecode('i4e') == 4 + assert bdecode('i0e') == 0 + assert bdecode('i123456789e') == 123456789 + assert bdecode('i-10e') == -10 + try: + bdecode('i-0e') + assert 0 + except ValueError: + pass + try: + bdecode('i123') + assert 0 + except ValueError: + pass + try: + bdecode('') + assert 0 + except ValueError: + pass + try: + bdecode('i6easd') + assert 0 + except ValueError: + pass + try: + bdecode('35208734823ljdahflajhdf') + assert 0 + except ValueError: + pass + try: + bdecode('2:abfdjslhfld') + assert 0 + except ValueError: + pass + assert bdecode('0:') == '' + assert bdecode('3:abc') == 'abc' + assert bdecode('10:1234567890') == '1234567890' + try: + bdecode('02:xy') + assert 0 + except ValueError: + pass + try: + bdecode('l') + assert 0 + except ValueError: + pass + assert bdecode('le') == [] + try: + bdecode('leanfdldjfh') + assert 0 + except ValueError: + pass + assert bdecode('l0:0:0:e') == ['', 
'', ''] + try: + bdecode('relwjhrlewjh') + assert 0 + except ValueError: + pass + assert bdecode('li1ei2ei3ee') == [1, 2, 3] + assert bdecode('l3:asd2:xye') == ['asd', 'xy'] + assert bdecode('ll5:Alice3:Bobeli2ei3eee') == [['Alice', 'Bob'], [2, 3]] + try: + bdecode('d') + assert 0 + except ValueError: + pass + try: + bdecode('defoobar') + assert 0 + except ValueError: + pass + assert bdecode('de') == {} + assert bdecode('d3:agei25e4:eyes4:bluee') == {'age': 25, 'eyes': 'blue'} + assert bdecode('d8:spam.mp3d6:author5:Alice6:lengthi100000eee') == {'spam.mp3': {'author': 'Alice', 'length': 100000}} + try: + bdecode('d3:fooe') + assert 0 + except ValueError: + pass + try: + bdecode('di1e0:e') + assert 0 + except ValueError: + pass + try: + bdecode('d1:b0:1:a0:e') + assert 0 + except ValueError: + pass + try: + bdecode('d1:a0:1:a0:e') + assert 0 + except ValueError: + pass + try: + bdecode('i03e') + assert 0 + except ValueError: + pass + try: + bdecode('l01:ae') + assert 0 + except ValueError: + pass + try: + bdecode('9999:x') + assert 0 + except ValueError: + pass + try: + bdecode('l0:') + assert 0 + except ValueError: + pass + try: + bdecode('d0:0:') + assert 0 + except ValueError: + pass + try: + bdecode('d0:') + assert 0 + except ValueError: + pass + +bencached_marker = [] + +class Bencached: + def __init__(self, s): + self.marker = bencached_marker + self.bencoded = s + +BencachedType = type(Bencached('')) # insufficient, but good as a filter + +def encode_bencached(x,r): + assert x.marker == bencached_marker + r.append(x.bencoded) + +def encode_int(x,r): + r.extend(('i',str(x),'e')) + +def encode_bool(x,r): + encode_int(int(x),r) + +def encode_string(x,r): + r.extend((str(len(x)),':',x)) + +def encode_unicode(x,r): + #r.append('u') + encode_string(x.encode('UTF-8'),r) + +def encode_list(x,r): + r.append('l') + for e in x: + encode_func[type(e)](e, r) + r.append('e') + +def encode_dict(x,r): + r.append('d') + ilist = x.items() + ilist.sort() + for k,v in ilist: + r.extend((str(len(k)),':',k)) + encode_func[type(v)](v, r) + r.append('e') + +encode_func = {} +encode_func[BencachedType] = encode_bencached +encode_func[IntType] = encode_int +encode_func[LongType] = encode_int +encode_func[StringType] = encode_string +encode_func[ListType] = encode_list +encode_func[TupleType] = encode_list +encode_func[DictType] = encode_dict +if BooleanType: + encode_func[BooleanType] = encode_bool +if UnicodeType: + encode_func[UnicodeType] = encode_unicode + +def bencode(x): + r = [] + try: + encode_func[type(x)](x, r) + except: + print("*** error *** could not encode type %s (value: %s)" % (type(x), x)) + assert 0 + return ''.join(r) + +def test_bencode(): + assert bencode(4) == 'i4e' + assert bencode(0) == 'i0e' + assert bencode(-10) == 'i-10e' + assert bencode(12345678901234567890) == 'i12345678901234567890e' + assert bencode('') == '0:' + assert bencode('abc') == '3:abc' + assert bencode('1234567890') == '10:1234567890' + assert bencode([]) == 'le' + assert bencode([1, 2, 3]) == 'li1ei2ei3ee' + assert bencode([['Alice', 'Bob'], [2, 3]]) == 'll5:Alice3:Bobeli2ei3eee' + assert bencode({}) == 'de' + assert bencode({'age': 25, 'eyes': 'blue'}) == 'd3:agei25e4:eyes4:bluee' + assert bencode({'spam.mp3': {'author': 'Alice', 'length': 100000}}) == 'd8:spam.mp3d6:author5:Alice6:lengthi100000eee' + try: + bencode({1: 'foo'}) + assert 0 + except AssertionError: + pass + + +try: + import psyco + psyco.bind(bdecode) + psyco.bind(bencode) +except ImportError: + pass diff --git a/ox/torrent/makemetafile.py 
b/ox/torrent/makemetafile.py index c2db27a..31d6ebe 100644 --- a/ox/torrent/makemetafile.py +++ b/ox/torrent/makemetafile.py @@ -8,7 +8,11 @@ from hashlib import sha1 as sha from copy import copy import re -from .bencode3 import bencode +from six import PY2 +if PY2: + from .bencode import bencode +else: + from .bencode3 import bencode from threading import Event from time import time from traceback import print_exc diff --git a/ox/web/allmovie.py b/ox/web/allmovie.py index c94c438..fdb7a46 100644 --- a/ox/web/allmovie.py +++ b/ox/web/allmovie.py @@ -13,13 +13,13 @@ def get_id(url): def get_data(id): ''' >>> get_data('129689')['cast'][1][1] - 'Marianne' + u'Marianne' >>> get_data('129689')['credits'][0][0] - 'Jean-Luc Godard' + u'Jean-Luc Godard' >>> get_data('129689')['posters'][0] - 'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg' + u'http://image.allmusic.com/00/adg/cov200/dru800/u812/u81260bbffr.jpg' >>> get_data('129689')['rating'] - '4.5' + u'4.5' ''' if id.startswith('http'): id = get_id(id) diff --git a/ox/web/amazon.py b/ox/web/amazon.py index d721d5c..19a72c7 100644 --- a/ox/web/amazon.py +++ b/ox/web/amazon.py @@ -2,7 +2,7 @@ # vi:si:et:sw=4:sts=4:ts=4 from __future__ import print_function import re -from urllib.parse import quote +from six.moves.urllib.parse import quote from ox import find_re, strip_tags, decode_html from ox.cache import read_url diff --git a/ox/web/apple.py b/ox/web/apple.py index 84abba0..099d6cb 100644 --- a/ox/web/apple.py +++ b/ox/web/apple.py @@ -2,6 +2,7 @@ from __future__ import print_function import json import re +from six import text_type from ox.cache import read_url HEADERS = { @@ -16,9 +17,9 @@ USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7) ' USER_AGENT += 'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 Safari/534.48.3' def get_movie_data(title, director): - if isinstance(title, str): + if isinstance(title, text_type): title = title.encode('utf-8') - if isinstance(director, str): + if isinstance(director, text_type): director = director.encode('utf-8') data = {} # itunes section (preferred source for link) diff --git a/ox/web/archive.py b/ox/web/archive.py index 3e7ab47..0c733c3 100644 --- a/ox/web/archive.py +++ b/ox/web/archive.py @@ -3,6 +3,8 @@ from .. import cache from ..utils import json +from six import string_types + def get_id(url): return url.split("/")[-1] @@ -19,7 +21,7 @@ def get_data(id): data[key] = details['metadata'][key] if isinstance(data[key], list): data[key] = data[key][0] - if isinstance(data[key], str): + if isinstance(data[key], string_types): data[key] = data[key].strip() if data[key][0] == '[' and data[key][-1] == ']': data[key] = data[key][1:-1] diff --git a/ox/web/arsenalberlin.py b/ox/web/arsenalberlin.py index ca77b5e..e5a0dd2 100644 --- a/ox/web/arsenalberlin.py +++ b/ox/web/arsenalberlin.py @@ -19,18 +19,18 @@ def get_data(id, language='en'): if 'Willkommen in der Datenbank des Arsenal' in html: return None data = {} - data['id'] = id - data['url'] = url + data[u'id'] = id + data[u'url'] = url m = re.compile('

(.*?)

').findall(html) if m: - data['title'] = m[0] + data[u'title'] = m[0] m = re.compile("Director: (.*?)").findall(html) if m: - data['director'] = m[0] + data[u'director'] = m[0] m = re.compile("caUI.initImageScroller\(\[\{url:'(.*?)'").findall(html) if m: - data['image'] = m[0] + data[u'image'] = m[0] units = re.compile("
(.*?)
", re.DOTALL).findall(html) for x in map(re.compile('(.*?): (.*)', re.DOTALL).findall, units): @@ -43,7 +43,7 @@ def get_data(id, language='en'): else: data[key] = strip_tags(data[key]) if "running time (minutes)" in data: - data['runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60 + data[u'runtime'] = float(data.pop("running time (minutes)").replace(',', '.')) * 60 for key in ('year', 'length in metres', 'forum participation year', 'number of reels'): if key in data and data[key].isdigit(): data[key] = int(data[key]) diff --git a/ox/web/criterion.py b/ox/web/criterion.py index 67d4a8a..6cef01e 100644 --- a/ox/web/criterion.py +++ b/ox/web/criterion.py @@ -19,13 +19,13 @@ def get_url(id): def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False): ''' >>> get_data('1333').get('imdbId') - '0060304' + u'0060304' >>> get_data('236')['posters'][0] - 'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg' + u'http://s3.amazonaws.com/criterion-production/release_images/1586/ThirdManReplace.jpg' >>> get_data('786')['posters'][0] - 'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg' + u'http://s3.amazonaws.com/criterion-production/product_images/185/343_box_348x490.jpg' ''' data = { "id": id, @@ -39,16 +39,12 @@ def get_data(id, timeout=ox.cache.cache_timeout, get_imdb=False): data["number"] = find_re(html, "Spine #(\d+)") data["title"] = decode_html(find_re(html, "

(.*?)

")) - data["title"] = data["title"].split(' \u2014 The Television Version')[0].strip() + data["title"] = data["title"].split(u' \u2014 The Television Version')[0].strip() results = find_re(html, '
    (.*?)
') info = re.compile('
  • (.*?)
  • ', re.DOTALL).findall(results) info = {k: strip_tags(v).strip() for k, v in info} - meta = re.compile('.*?src="(.*?)"', re.DOTALL).findall(html) - #result = find_re(html, "\"Film>> get_id(imdb='0133093') - 'the-matrix' + u'the-matrix' #>>> get_id(imdb='0060304') - #'2-or-3-things-i-know-about-her' + #u'2-or-3-things-i-know-about-her' ''' if imdb: i = ImdbCombined(imdb) diff --git a/ox/web/google.py b/ox/web/google.py index 01bb7ce..72aa32f 100644 --- a/ox/web/google.py +++ b/ox/web/google.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 import re -import urllib +from six.moves import urllib import ox from ox import strip_tags, decode_html @@ -17,31 +17,6 @@ def quote_plus(s): s = s.encode('utf-8') return urllib.parse.quote_plus(s) - -def infobox(query, timeout=DEFAULT_TIMEOUT): - import lxml.html - data = read_url(url, timeout=timeout) - doc = lxml.html.document_fromstring(data) - k = 'kp-wholepage' - wholepage = doc.cssselect('.' + k) - infobox = {} - if wholepage: - page = wholepage[0] - for a in page.cssselect('a'): - if a.attrib.get('href', '').startswith('http'): - domain = '.'.join(a.attrib['href'].split('/')[2].split('.')[-2:]) - infobox[domain] = a.attrib['href'] - for e in page.cssselect('*[data-attrid]'): - key = e.attrib['data-attrid'] - value = e.text_content() - if value and key not in ( - 'kc:/film/film:media_actions_wholepage', - 'action:watch_film' - ): - infobox[key] = value - return infobox - - def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT): """ Return max_results tuples with title, url, description diff --git a/ox/web/imdb.py b/ox/web/imdb.py index 4b08cab..ac12c83 100644 --- a/ox/web/imdb.py +++ b/ox/web/imdb.py @@ -1,13 +1,13 @@ # -*- coding: utf-8 -*- # vi:si:et:sw=4:sts=4:ts=4 -from collections import defaultdict +from __future__ import print_function -import json import re import time import unicodedata -from urllib.parse import urlencode +from six.moves.urllib.parse import urlencode +from six import text_type, string_types from .. import find_re, strip_tags, decode_html from .. import cache @@ -16,13 +16,13 @@ from .. import cache from . siteparser import SiteParser from . 
import duckduckgo from ..utils import datetime -from ..geo import normalize_country_name, get_country_name +from ..geo import normalize_country_name def prepare_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False): headers = headers.copy() # https://webapps.stackexchange.com/questions/11003/how-can-i-disable-reconfigure-imdbs-automatic-geo-location-so-it-does-not-defau - #headers['X-Forwarded-For'] = '72.21.206.80' + headers['X-Forwarded-For'] = '72.21.206.80' headers['Accept-Language'] = 'en' return url, data, headers, timeout, unicode @@ -106,131 +106,6 @@ def technical(label): } -def tech_spec(metadata): - tech = {} - for row in metadata['props']['pageProps']['contentData']['section']['items']: - title = { - 'aspect ratio': 'aspectratio', - 'sound mix': 'sound', - }.get(row['rowTitle'].lower(), row['rowTitle'].lower()) - tech[title] = [] - for content in row['listContent']: - value = content['text'] - tech[title].append(value) - return tech - - -def movie_connections(metadata): - - connections = {} - if 'props' not in metadata: - return connections - for row in metadata['props']['pageProps']['contentData']['categories']: - title = { - }.get(row['name'], row['name']) - if title not in connections: - connections[title] = [] - - for item in row['section']['items']: - item_ = { - 'id': item['id'][2:], - } - - item_['title'] = re.compile('(.*?)').findall(item['listContent'][0]['html'])[0] - if len(item['listContent']) >=2: - item_['description'] = strip_tags(item['listContent'][1]['html']) - connections[title].append(item_) - return connections - - -def get_category_by_id(metadata, id): - for category in metadata['props']['pageProps']['contentData']['categories']: - if category['id'] == id: - return category - - -def get_release_date(metadata): - releases = get_category_by_id(metadata, 'releases') - def parse_date(d): - parsed = None - for fmt in ( - '%B %d, %Y', - '%d %B %Y', - '%B %Y', - ): - try: - parsed = datetime.strptime(d, fmt) - break - except: - pass - if not parsed: - return None - return '%d-%02d-%02d' % (parsed.year, parsed.month, parsed.day) - - dates = [] - for item in releases['section']['items']: - content = item['listContent'][0] - date = parse_date(content['text']) - if date: - dates.append(date) - - if dates: - return min(dates) - -def get_locations(metadata): - try: - locations = [ - row['cardText'] - for row in metadata['props']['pageProps']['contentData']['categories'][0]['section']['items'] - ] - except: - locations = [] - return locations - - -def get_keywords(metadata): - try: - keywords = [ - row['rowTitle'] - for row in metadata['props']['pageProps']['contentData']['section']['items'] - ] - except: - keywords = [] - return keywords - - -def get_entity_metadata(metadata): - data = {} - entity = metadata['props']['pageProps']['contentData']['entityMetadata'] - data['title'] = entity['titleText']['text'] - data['originalTitle'] = entity['originalTitleText']['text'] - data['year'] = entity['releaseYear']['year'] - data['plot'] = entity['plot']['plotText']['plainText'] - data['country'] = [get_country_name(c['id']) for c in entity['countriesOfOrigin']['countries']] - data['poster'] = metadata['props']['pageProps']['contentData']['posterData']['image']['url'] - return data - - -def alternative_titles(metadata): - titles = defaultdict(list) - akas = get_category_by_id(metadata, 'akas') - - skip = [ - metadata['props']['pageProps']['contentData']['entityMetadata']['titleText']['text'], - 
metadata['props']['pageProps']['contentData']['entityMetadata']['originalTitleText']['text'] - ] - for row in akas['section']['items']: - content = row['listContent'][0] - title = content['text'] - country = row['rowTitle'] - if title in skip: - continue - titles[title].append(country) - #if content.get('subText'): - # titles[-1]['subText'] = content['subText'] - return [kv for kv in titles.items()] - - ''' 'posterIds': { 'page': 'posters', @@ -241,17 +116,18 @@ def alternative_titles(metadata): class Imdb(SiteParser): ''' - >>> Imdb('0068646')['title'] == 'The Godfather' + >>> Imdb('0068646')['title'] == text_type(u'The Godfather') True - >>> Imdb('0133093')['title'] == 'The Matrix' + >>> Imdb('0133093')['title'] == text_type(u'The Matrix') True ''' regex = { 'alternativeTitles': { 'page': 'releaseinfo', 're': [ - '