use six to support python 2 and 3

This commit is contained in:
j 2014-09-30 21:04:46 +02:00
parent 1b1dcf1c58
commit d4d09b56b6
28 changed files with 1730 additions and 1678 deletions
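
The port leans on a handful of recurring six idioms rather than per-file workarounds: relative imports inside the package, six.moves for relocated stdlib modules, BytesIO/StringIO from six, print_function, "except ... as e", and encoding text to bytes before hashing. A minimal sketch of the main patterns, assuming six is installed; the function names below are illustrative and not part of ox:

from __future__ import print_function
import hashlib
from six.moves import urllib

def fetch(url):
    # urllib2.Request / urlopen become urllib.request.* via six.moves
    req = urllib.request.Request(url)
    try:
        return urllib.request.urlopen(req).read()
    except urllib.error.HTTPError as e:   # the old "except HTTPError, e" syntax is gone
        print('request failed with', e.code)
        return b''

def cache_key(url):
    # hashlib.sha1 wants bytes on Python 3, so encode text URLs first
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    return hashlib.sha1(url).hexdigest()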

View file

@ -3,28 +3,32 @@
# GPL 2011
__version__ = '2.1.1'
import cache
import js
import jsonc
import net
import srt
import utils
from . import cache
from . import js
from . import jsonc
from . import net
from . import srt
from . import utils
from api import *
from file import *
from form import *
from format import *
from geo import *
from html import *
from .api import *
from .file import *
from .form import *
from .format import *
from .geo import *
from .html import *
#image depends on PIL, not easy enough to install on OS X
try:
from image import *
from .image import *
except:
pass
from location import *
from movie import *
from normalize import *
from oembed import *
from text import *
from torrent import *
from fixunicode import *
from .location import *
from .movie import *
from .normalize import *
from .oembed import *
from .text import *
#currently broken in python3
try:
from .torrent import *
except:
pass
from .fixunicode import *

View file

@ -3,10 +3,10 @@
# GPL 2011
from __future__ import with_statement
import cookielib
from six.moves import http_cookiejar as cookielib
import gzip
import StringIO
import urllib2
from six import StringIO
from six.moves import urllib
from types import MethodType
from . import __version__
@ -29,8 +29,8 @@ class API(object):
self._cj = cj
else:
self._cj = cookielib.CookieJar()
self._opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self._cj),
urllib2.HTTPHandler(debuglevel=self.debuglevel))
self._opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self._cj),
urllib.request.HTTPHandler(debuglevel=self.debuglevel))
self._opener.addheaders = [
('User-Agent', '%s/%s' % (self.__name__, self.__version__))
]
@ -64,7 +64,7 @@ class API(object):
result = {}
try:
body = str(form)
request = urllib2.Request(str(url))
request = urllib.request.Request(str(url))
request.add_header('Content-type', form.get_content_type())
request.add_header('Content-Length', str(len(body)))
request.add_header('Accept-Encoding', 'gzip, deflate')
@ -75,7 +75,7 @@ class API(object):
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
result = result.decode('utf-8')
return json.loads(result)
except urllib2.HTTPError, e:
except urllib.error.HTTPError as e:
if self.DEBUG:
import webbrowser
if e.code >= 500:
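
For the opener construction above, the same code runs on both interpreters once the names come from six.moves; a small standalone sketch (the User-Agent string and the commented-out endpoint are placeholders):

from six.moves import http_cookiejar as cookielib
from six.moves import urllib

cj = cookielib.CookieJar()
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cj),
    urllib.request.HTTPHandler(debuglevel=0),
)
opener.addheaders = [('User-Agent', 'ox-example/1.0')]
# response = opener.open('http://example.com/api')  # hypothetical endpoint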

View file

@ -1,24 +1,22 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2011
from __future__ import with_statement
from __future__ import with_statement, print_function
import gzip
import zlib
import hashlib
import os
import StringIO
from six import BytesIO
import time
import urlparse
import urllib2
from six.moves import urllib
import sqlite3
import chardet
from utils import json
from .utils import json
from .file import makedirs
import net
from net import DEFAULT_HEADERS, detect_encoding
from . import net
from .net import DEFAULT_HEADERS, detect_encoding
cache_timeout = 30*24*60*60 # default is 30 days
@ -69,7 +67,7 @@ class InvalidResult(Exception):
self.headers = headers
def _fix_unicode_url(url):
if isinstance(url, unicode):
if not isinstance(url, bytes):
url = url.encode('utf-8')
return url
@ -83,24 +81,30 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
if this function fails, InvalidResult will be raised; deal with it in your code
'''
if net.DEBUG:
print 'ox.cache.read_url', url
print('ox.cache.read_url', url)
#FIXME: send last-modified / etag from cache and only update if needed
url = _fix_unicode_url(url)
#url = _fix_unicode_url(url)
result = store.get(url, data, headers, timeout)
url_headers = {}
if not result:
try:
url_headers, result = net.read_url(url, data, headers, return_headers=True)
except urllib2.HTTPError, e:
except urllib.error.HTTPError as e:
e.headers['Status'] = "%s" % e.code
url_headers = dict(e.headers)
for key in e.headers:
url_headers[key.lower()] = e.headers[key]
result = e.read()
if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
if not valid or valid(result, url_headers):
store.set(url, post_data=data, data=result, headers=url_headers)
else:
raise InvalidResult(result, url_headers)
if unicode:
ctype = url_headers.get('content-type', '').lower()
if 'charset' in ctype:
encoding = ctype.split('charset=')[-1]
else:
encoding = detect_encoding(result)
if not encoding:
encoding = 'latin-1'
@ -143,9 +147,8 @@ class SQLiteCache(Cache):
self.create()
def connect(self):
conn = sqlite3.connect(self.db, timeout=10)
conn.text_factory = str
return conn
self.conn = sqlite3.connect(self.db, timeout=10)
return self.conn
def create(self):
conn = self.connect()
@ -177,9 +180,9 @@ class SQLiteCache(Cache):
if timeout == 0:
return r
if data:
url_hash = hashlib.sha1(url + '?' + data).hexdigest()
url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
conn = self.connect()
c = conn.cursor()
@ -210,11 +213,11 @@ class SQLiteCache(Cache):
def set(self, url, post_data, data, headers):
if post_data:
url_hash = hashlib.sha1(url + '?' + post_data).hexdigest()
url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
conn = self.connect()
c = conn.cursor()
@ -266,11 +269,11 @@ class FileCache(Cache):
return r
if data:
url_hash = hashlib.sha1(url + '?' + data).hexdigest()
url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
prefix, i, f = self.files(domain, url_hash)
if os.path.exists(i):
with open(i) as _i:
@ -295,11 +298,11 @@ class FileCache(Cache):
def set(self, url, post_data, data, headers):
if post_data:
url_hash = hashlib.sha1(url + '?' + post_data).hexdigest()
url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
prefix, i, f = self.files(domain, url_hash)
makedirs(prefix)
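
The repeated hashing and domain-extraction changes in this file reduce to two portable helpers; a condensed sketch with illustrative names (url_hash and url_domain are not ox functions):

from __future__ import print_function
import hashlib
from six.moves import urllib

def url_hash(url, post_data=None):
    # sha1 needs bytes on Python 3, so build the key as text and encode once
    key = url + '?' + post_data if post_data else url
    return hashlib.sha1(key.encode('utf-8')).hexdigest()

def url_domain(url):
    # urlparse.urlparse moves to urllib.parse.urlparse under six.moves
    host = urllib.parse.urlparse(url)[1]
    return '.'.join(host.split('.')[-2:])

print(url_hash('http://example.com/page'))
print(url_domain('http://www.example.com/page'))   # example.com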

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import division, with_statement
from __future__ import division, with_statement, print_function
import os
import hashlib
import re
@ -10,7 +10,7 @@ import struct
import subprocess
import sqlite3
from ox.utils import json
from .utils import json
__all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs']
@ -283,19 +283,19 @@ def makedirs(path):
if not os.path.exists(path):
try:
os.makedirs(path)
except OSError, e:
except OSError as e:
if e.errno != 17:
raise
def copy_file(source, target, verbose=False):
if verbose:
print 'copying', source, 'to', target
print('copying', source, 'to', target)
write_path(target)
shutil.copyfile(source, target)
def read_file(file, verbose=False):
if verbose:
print 'reading', file
print('reading', file)
f = open(file)
data = f.read()
f.close()
@ -303,14 +303,14 @@ def read_file(file, verbose=False):
def read_json(file, verbose=False):
if verbose:
print 'reading', file
print('reading', file)
with open(file) as fd:
data = json.load(fd)
return data
def write_file(file, data, verbose=False):
if verbose:
print 'writing', file
print('writing', file)
write_path(file)
f = open(file, 'w')
f.write(data)
@ -319,7 +319,7 @@ def write_file(file, data, verbose=False):
def write_image(file, image, verbose=False):
if verbose:
print 'writing', file
print('writing', file)
write_path(file)
image.save(file)
@ -329,7 +329,7 @@ def write_json(file, data, ensure_ascii=True, indent=0, sort_keys=False, verbose
def write_link(source, target, verbose=False):
if verbose:
print 'linking', source, 'to', target
print('linking', source, 'to', target)
write_path(target)
if os.path.exists(target):
os.unlink(target)

View file

@ -2,13 +2,16 @@
# -*- coding: utf-8 -*-
# from http://blog.lumino.so/2012/08/20/fix-unicode-mistakes-with-python/
# MIT
from __future__ import print_function
import unicodedata
from six import unichr
__all__ = ['fix_bad_unicode']
def fix_bad_unicode(text):
u"""
"""
Something you will find all over the place, in real-world text, is text
that's mistakenly encoded as utf-8, decoded in some ugly format like
latin-1 or even Windows codepage 1252, and encoded as utf-8 again.
@ -26,52 +29,53 @@ def fix_bad_unicode(text):
auto-decode bytes for you -- then it would just create the problems it's
supposed to fix.
>>> print fix_bad_unicode(u'único')
único
>>> fix_bad_unicode(u'único')
'único'
>>> fix_bad_unicode('This text is fine already :þ')
'This text is fine already :þ'
>>> print fix_bad_unicode(u'This text is fine already :þ')
This text is fine already :þ
Because these characters often come from Microsoft products, we allow
for the possibility that we get not just Unicode characters 128-255, but
also Windows's conflicting idea of what characters 128-160 are.
>>> print fix_bad_unicode(u'This — should be an em dash')
This should be an em dash
>>> fix_bad_unicode('This — should be an em dash')
'This — should be an em dash'
We might have to deal with both Windows characters and raw control
characters at the same time, especially when dealing with characters like
\x81 that have no mapping in Windows.
>>> print fix_bad_unicode(u'This text is sad .â\x81”.')
This text is sad ..
>>> fix_bad_unicode('This text is sad .â\x81”.')
'This text is sad .⁔.'
This function even fixes multiple levels of badness:
>>> wtf = u'\xc3\xa0\xc2\xb2\xc2\xa0_\xc3\xa0\xc2\xb2\xc2\xa0'
>>> print fix_bad_unicode(wtf)
ಠ_ಠ
>>> wtf = '\xc3\xa0\xc2\xb2\xc2\xa0_\xc3\xa0\xc2\xb2\xc2\xa0'
>>> fix_bad_unicode(wtf)
'ಠ_ಠ'
However, it has safeguards against fixing sequences of letters and
punctuation that can occur in valid text:
>>> print fix_bad_unicode(u'not such a fan of Charlotte Brontë…”')
not such a fan of Charlotte Brontë
>>> fix_bad_unicode('not such a fan of Charlotte Brontë…”')
'not such a fan of Charlotte Brontë…”'
Cases of genuine ambiguity can sometimes be addressed by finding other
characters that are not double-encoding, and expecting the encoding to
be consistent:
>>> print fix_bad_unicode(u'AHÅ™, the new sofa from IKEA®')
AHÅ, the new sofa from IKEA®
>>> fix_bad_unicode('AHÅ™, the new sofa from IKEA®')
'AHÅ™, the new sofa from IKEA®'
Finally, we handle the case where the text is in a single-byte encoding
that was intended as Windows-1252 all along but read as Latin-1:
>>> print fix_bad_unicode(u'This text was never Unicode at all\x85')
This text was never Unicode at all
>>> fix_bad_unicode('This text was never Unicode at all\x85')
'This text was never Unicode at all…'
"""
if not isinstance(text, unicode):
if not isinstance(text, str):
raise TypeError("This isn't even decoded into Unicode yet. "
"Decode it first.")
if len(text) == 0:
@ -118,7 +122,7 @@ def reinterpret_windows1252_as_utf8(wrongtext):
altered_bytes.append(char.encode('WINDOWS_1252'))
else:
altered_bytes.append(char.encode('latin-1', 'replace'))
return ''.join(altered_bytes).decode('utf-8', 'replace')
return b''.join(altered_bytes).decode('utf-8', 'replace')
def reinterpret_latin1_as_windows1252(wrongtext):
@ -130,7 +134,7 @@ def reinterpret_latin1_as_windows1252(wrongtext):
def text_badness(text):
u'''
'''
Look for red flags that text is encoded incorrectly:
Obvious problems:
@ -147,12 +151,12 @@ def text_badness(text):
- Improbable single-byte characters, such as ƒ or ¬
- Letters in somewhat rare scripts
'''
assert isinstance(text, unicode)
assert isinstance(text, str)
errors = 0
very_weird_things = 0
weird_things = 0
prev_letter_script = None
for pos in xrange(len(text)):
for pos in range(len(text)):
char = text[pos]
index = ord(char)
if index < 256:
@ -241,7 +245,7 @@ WINDOWS_1252_GREMLINS = [
]
# a list of Unicode characters that might appear in Windows-1252 text
WINDOWS_1252_CODEPOINTS = range(256) + WINDOWS_1252_GREMLINS
WINDOWS_1252_CODEPOINTS = list(range(256)) + WINDOWS_1252_GREMLINS
# Rank the characters typically represented by a single byte -- that is, in
# Latin-1 or Windows-1252 -- by how weird it would be to see them in running
@ -286,7 +290,7 @@ SINGLE_BYTE_WEIRDNESS = (
# letters. We'll need it often.
SINGLE_BYTE_LETTERS = [
unicodedata.category(unichr(i)).startswith('L')
for i in xrange(256)
for i in range(256)
]
# A table telling us how to interpret the first word of a letter's Unicode
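
The doctests are rewritten from print statements to plain expressions, which sidesteps the print-syntax difference between 2 and 3; assuming the module is importable as ox.fixunicode, they can still be run with the standard doctest runner:

import doctest
import ox.fixunicode

# runs the rewritten examples in the module docstrings
doctest.testmod(ox.fixunicode, verbose=False)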

View file

@ -1,17 +1,34 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2014
from __future__ import with_statement, print_function
import itertools
import mimetools
import mimetypes
import random
import sys
__all__ = ['MultiPartForm']
# from /usr/lib/python3.4/email/generator.py
# Helper used by Generator._make_boundary
_width = len(repr(sys.maxsize-1))
_fmt = '%%0%dd' % _width
def _make_boundary():
# Craft a random boundary.
token = random.randrange(sys.maxsize)
boundary = ('=' * 15) + (_fmt % token) + '=='
return boundary
class MultiPartForm(object):
"""Accumulate the data to be used when posting a form."""
def __init__(self):
self.form_fields = []
self.files = []
self.boundary = mimetools.choose_boundary()
self.boundary = _make_boundary()
return
def get_content_type(self):
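
mimetools is gone in Python 3, so the boundary now comes from a copy of the helper in the stdlib's email.generator. An illustrative use, assuming the module is importable as ox.form (the digits in the comment are just an example of the random token):

from ox.form import MultiPartForm

form = MultiPartForm()
# fifteen '=' signs, a zero-padded random integer, then '=='
print(form.boundary)            # e.g. '===============0123456789012345678=='
print(form.get_content_type())  # typically 'multipart/form-data; boundary=<boundary>'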

View file

@ -20,7 +20,7 @@ def toAZ(num):
>>> toAZ(1234567890)
'CYWOQVJ'
"""
if num < 1: raise ValueError, "must supply a positive integer"
if num < 1: raise ValueError("must supply a positive integer")
digits = string.ascii_uppercase
az = ''
while num != 0:
@ -62,7 +62,7 @@ def to26(q):
>>> to26(347485647)
'BDGKMAP'
"""
if q < 0: raise ValueError, "must supply a positive integer"
if q < 0: raise ValueError("must supply a positive integer")
base26 = string.ascii_uppercase
converted = []
while q != 0:
@ -119,7 +119,7 @@ def to32(q):
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError, "must supply a positive integer"
if q < 0: raise ValueError("must supply a positive integer")
letters = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
converted = []
while q != 0:
@ -206,7 +206,7 @@ def to36(q):
...
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError, "must supply a positive integer"
if q < 0: raise ValueError("must supply a positive integer")
letters = "0123456789abcdefghijklmnopqrstuvwxyz"
converted = []
while q != 0:

ox/geo.py (2957 changes)

File diff suppressed because it is too large.

View file

@ -1,9 +1,11 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import sys
import re
import string
from htmlentitydefs import name2codepoint
from six.moves.html_entities import name2codepoint
from six import unichr
# Configuration for add_links() function
@ -23,6 +25,7 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
if sys.version_info[0] == 2:
del x # Temporary variable
def escape(html):
@ -146,12 +149,9 @@ def decode_html(html):
>>> decode_html('Anniversary of Daoud&apos;s Republic')
u"Anniversary of Daoud's Republic"
"""
if type(html) != unicode:
html = unicode(html)[:]
if type(html) is unicode:
if isinstance(html, bytes):
html = html.decode('utf-8')
uchr = unichr
else:
uchr = lambda value: value > 255 and unichr(value) or chr(value)
def entitydecode(match, uchr=uchr):
entity = match.group(1)
if entity == '#x80':
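
htmlentitydefs is reachable on both versions as six.moves.html_entities, and six.unichr papers over the unichr/chr split; a minimal sketch of decoding one named entity, roughly what decode_html's regex callback does (decode_entity is a made-up name):

from six.moves.html_entities import name2codepoint
from six import unichr

def decode_entity(name):
    # e.g. 'eacute' maps to the code point for the accented e
    return unichr(name2codepoint[name])

print(decode_entity('amp'))  # '&'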

View file

@ -1,10 +1,10 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement
from __future__ import with_statement, print_function
from js import minify
from utils import json
from .js import minify
from .utils import json
def load(f):
@ -14,7 +14,7 @@ def loads(source):
try:
minified = minify(source)
return json.loads(minified)
except json.JSONDecodeError, e:
except json.JSONDecodeError as e:
s = minified.split('\n')
context = s[e.lineno-1][max(0, e.colno-1):e.colno+30]
msg = e.msg + ' at ' + context

View file

@ -9,9 +9,9 @@ import os
import re
import unicodedata
from normalize import normalize_name
from text import get_sort_name, find_re
from file import EXTENSIONS
from .normalize import normalize_name
from .text import get_sort_name, find_re
from .file import EXTENSIONS
__all__ = ['parse_movie_path', 'create_movie_path', 'get_oxid']

View file

@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import with_statement, print_function
import os
import gzip
import re
import StringIO
from six import BytesIO
import struct
import urllib
import urllib2
from six.moves import urllib
from chardet.universaldetector import UniversalDetector
@ -26,7 +26,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS):
try:
f = open_url(url, data, headers)
s = f.code
except urllib2.HTTPError, e:
except urllib.error.HTTPError as e:
s = e.code
return s
@ -42,46 +42,59 @@ def get_headers(url, data=None, headers=DEFAULT_HEADERS):
f.headers['Status'] = "%s" % f.code
headers = f.headers
f.close()
except urllib2.HTTPError, e:
except urllib.error.HTTPError as e:
e.headers['Status'] = "%s" % e.code
headers = e.headers
return dict(headers)
def open_url(url, data=None, headers=DEFAULT_HEADERS):
if isinstance(url, bytes):
url = url.decode('utf-8')
url = url.replace(' ', '%20')
req = urllib2.Request(url, data, headers)
return urllib2.urlopen(req)
req = urllib.request.Request(url, data, headers)
return urllib.request.urlopen(req)
def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
if DEBUG:
print 'ox.net.read_url', url
print('ox.net.read_url', url)
f = open_url(url, data, headers)
result = f.read()
f.close()
if f.headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
if unicode:
ctype = f.headers.get('content-type', '').lower()
if 'charset' in ctype:
encoding = ctype.split('charset=')[-1]
else:
encoding = detect_encoding(result)
if not encoding:
encoding = 'latin-1'
result = result.decode(encoding)
if return_headers:
f.headers['Status'] = "%s" % f.code
return dict(f.headers), result
headers = {}
for key in f.headers:
headers[key.lower()] = f.headers[key]
return headers, result
return result
def detect_encoding(data):
data_lower = data.lower()
charset = re.compile('content="text/html; charset=(.*?)"').findall(data)
data_lower = data.lower().decode('utf-8', 'ignore')
charset = re.compile('content="text/html; charset=(.*?)"').findall(data_lower)
if not charset:
charset = re.compile('meta charset="(.*?)"').findall(data)
charset = re.compile('meta charset="(.*?)"').findall(data_lower)
if charset:
return charset[0].lower()
detector = UniversalDetector()
for line in data.split('\n'):
detector.feed(line)
p = 0
l = len(data)
s = 1024
while p < l:
detector.feed(data[p:p+s])
if detector.done:
break
p += s
detector.close()
return detector.result['encoding']
@ -97,9 +110,9 @@ def save_url(url, filename, overwrite=False):
def oshash(url):
def get_size(url):
req = urllib2.Request(url, headers=DEFAULT_HEADERS.copy())
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
req.get_method = lambda : 'HEAD'
u = urllib2.urlopen(req)
u = urllib.request.urlopen(req)
if u.code != 200 or not 'Content-Length' in u.headers:
raise IOError
return int(u.headers['Content-Length'])
@ -107,8 +120,8 @@ def oshash(url):
def get_range(url, start, end):
headers = DEFAULT_HEADERS.copy()
headers['Range'] = 'bytes=%s-%s' % (start, end)
req = urllib2.Request(url, headers=headers)
u = urllib2.urlopen(req)
req = urllib.request.Request(url, headers=headers)
u = urllib.request.urlopen(req)
return u.read()
try:
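
detect_encoding now feeds chardet fixed-size byte slices instead of splitting on newlines, since the input is bytes; the loop in isolation, using the same 1024-byte chunk size as above:

from chardet.universaldetector import UniversalDetector

def detect_encoding(data, chunk=1024):
    detector = UniversalDetector()
    for pos in range(0, len(data), chunk):
        detector.feed(data[pos:pos + chunk])
        if detector.done:          # stop as soon as chardet is confident
            break
    detector.close()
    return detector.result['encoding']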

View file

@ -1,9 +1,10 @@
# -*- coding: utf-8 -*-
# ci:si:et:sw=4:sts=4:ts=4
import re
from text import find_re
import cache
from utils import json, ET
from . import cache
from .text import find_re
from .utils import json, ET
def get_embed_code(url, maxwidth=None, maxheight=None):
embed = {}

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement, division
from __future__ import with_statement, division, print_function
import chardet
import re
import codecs
@ -71,7 +71,7 @@ def load(filename, offset=0):
try:
data = unicode(data, 'latin-1')
except:
print "failed to detect encoding, giving up"
print("failed to detect encoding, giving up")
return srt
data = data.replace('\r\n', '\n')

View file

@ -6,7 +6,7 @@ from threading import Event
from hashlib import sha1
import os
from bencode import bencode, bdecode
from .bencode import bencode, bdecode
__all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size']
@ -24,9 +24,8 @@ def get_info_hash(torrentFile):
return sha1(bencode(info)).hexdigest()
def get_torrent_info(data=None, file=None):
from bencode import bencode
if file:
if isinstance(file, unicode):
if not isinstance(file, bytes):
file = file.encode('utf-8')
with open(file, 'rb') as f:
data = f.read()
@ -36,7 +35,7 @@ def get_torrent_info(data=None, file=None):
metainfo = bdecode(data)
info = metainfo['info']
piece_length = info['piece length']
if info.has_key('length'):
if 'length' in info:
# let's assume we just have one file
file_length = info['length']
else:

View file

@ -2,8 +2,8 @@
# encoding: utf-8
__version__ = '1.0.0'
import imdb
import wikipedia
import google
import piratecinema
import oxdb
from . import imdb
from . import wikipedia
from . import google
from . import piratecinema
from . import oxdb

View file

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time
from ox import strip_tags, find_re
from ox.cache import read_url

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import quote
from six.moves.urllib.parse import quote
from ox import find_re, strip_tags, decode_html
from ox.cache import read_url

View file

@ -1,14 +1,11 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
from urllib import urlencode
import json
import os
import re
from ox import find_re, strip_tags, decode_html
from ox import find_re, strip_tags
from ox.cache import read_url
from ox.net import open_url
def get_data(id, language='en'):
if language == 'de':
@ -57,7 +54,7 @@ def backup(filename):
data = json.load(f)
else:
data = {}
start = ids and max(map(int, data)) or 1
start = max(map(int, data)) if data else 1
for i in range(start, 11872):
info = get_data(i)
if info:

View file

@ -5,7 +5,7 @@ import re
import ox.cache
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import find_re, remove_special_characters
from ox.text import find_re
import imdb

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import unquote
from six.moves.urllib.parse import unquote
from ox.cache import read_url

View file

@ -1,17 +1,17 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import urllib
from six.moves import urllib
import ox
from ox import strip_tags, decode_html
from ox.utils import json
from ox.cache import read_url
def find(query, timeout=ox.cache.cache_timeout):
if isinstance(query, unicode):
if not isinstance(query, bytes):
query = query.encode('utf-8')
params = urllib.urlencode({'q': query})
params = urllib.parse.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params
data = read_url(url, timeout=timeout).decode('utf-8')
results = []

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import urllib
from six.moves import urllib
import ox
from ox import strip_tags, decode_html
@ -13,9 +13,9 @@ def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIM
return ox.cache.read_url(url, data, headers, timeout, unicode=True)
def quote_plus(s):
if not isinstance(s, str):
if not isinstance(s, bytes):
s = s.encode('utf-8')
return urllib.quote_plus(s)
return urllib.parse.quote_plus(s)
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
"""

View file

@ -1,23 +1,27 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib
from __future__ import print_function
import re
import time
import unicodedata
import ox
from ox import find_re, strip_tags
import ox.cache
from six.moves import urllib
from six import string_types
from siteparser import SiteParser
import duckduckgo
from .. import find_re, strip_tags, decode_html
from .. import cache
from . siteparser import SiteParser
from . import duckduckgo
from ..utils import datetime
from ..geo import normalize_country_name
def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
return cache.read_url(url, data, headers, timeout, unicode=unicode)
def get_url(id):
return "http://www.imdb.com/title/tt%s/" % id
@ -49,7 +53,7 @@ class Imdb(SiteParser):
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
],
'type': 'int'
},
@ -211,7 +215,7 @@ class Imdb(SiteParser):
'page': 'releaseinfo',
're': [
'<td class="release_date">(.*?)</td>',
ox.strip_tags,
strip_tags,
],
'type': 'list'
},
@ -326,7 +330,7 @@ class Imdb(SiteParser):
if 'alternativeTitles' in self:
if len(self['alternativeTitles']) == 2 and \
isinstance(self['alternativeTitles'][0], basestring):
isinstance(self['alternativeTitles'][0], string_types):
self['alternativeTitles'] = [self['alternativeTitles']]
#normalize country names
@ -472,7 +476,7 @@ class Imdb(SiteParser):
if c:
alt[title].append(c)
self['alternativeTitles'] = []
for t in sorted(alt, lambda a, b: cmp(sorted(alt[a]), sorted(alt[b]))):
for t in sorted(alt, key=lambda a: sorted(alt[a])):
if alt[t]:
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
self['alternativeTitles'].append((t, countries))
@ -492,7 +496,7 @@ class Imdb(SiteParser):
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'cast' in self:
if isinstance(self['cast'][0], basestring):
if isinstance(self['cast'][0], string_types):
self['cast'] = [self['cast']]
self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c):
@ -503,10 +507,12 @@ class Imdb(SiteParser):
if 'connections' in self:
cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], basestring):
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']]
for rel, data, _ in self['connections']:
#cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
def get_conn(c):
r = {
'id': c[0],
@ -516,14 +522,14 @@ class Imdb(SiteParser):
if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip()
return r
cc[unicode(rel)] = map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data))
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc
for key in ('country', 'genre'):
if key in self:
self[key] = filter(lambda x: x.lower() != 'home', self[key])
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999
if '_director' in self:
if 'series' in self or 'isSeries' in self:
@ -590,8 +596,8 @@ class Imdb(SiteParser):
if key in self:
if isinstance(self[key][0], list):
self[key] = [i[0] for i in self[key] if i]
self[key] = sorted(list(set(self[key])),
lambda a, b: self[key].index(a) - self[key].index(b))
self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget']
@ -655,7 +661,7 @@ def get_movie_by_title(title, timeout=-1):
u'0866567'
'''
params = {'s':'tt','q': title}
if isinstance(title, unicode):
if not isinstance(title, bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
@ -731,7 +737,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
if year:
params['q'] = u'"%s (%s)" %s' % (title, year, director)
google_query = "site:imdb.com %s" % params['q']
if isinstance(params['q'], unicode):
if not isinstance(params['q'], bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
@ -775,7 +781,7 @@ def get_movie_poster(imdbId):
info = ImdbCombined(imdbId)
if 'posterId' in info:
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url)
data = read_url(url).decode('utf-8', 'ignore')
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
return poster
elif 'series' in info:
@ -787,11 +793,11 @@ def get_episodes(imdbId, season=None):
url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
if season:
url += '?season=%d' % season
data = ox.cache.read_url(url)
data = cache.read_url(url)
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
else:
data = ox.cache.read_url(url)
data = cache.read_url(url)
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
if match:
for season in range(1, int(match[0]) + 1):
@ -800,7 +806,7 @@ def get_episodes(imdbId, season=None):
def max_votes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = ox.cache.read_url(url)
data = cache.read_url(url)
votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
return votes
@ -810,6 +816,6 @@ def guess(title, director='', timeout=-1):
if __name__ == "__main__":
import json
print json.dumps(Imdb('0306414'), indent=2)
print(json.dumps(Imdb('0306414'), indent=2))
#print json.dumps(Imdb('0133093'), indent=2)
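
Python 3 drops the cmp argument to sorted(), so the comparison-based sorts above become key-based; the same two patterns in isolation (alt and items are toy placeholders):

from __future__ import print_function

# sort titles by their sorted country lists (was: lambda a, b: cmp(...))
alt = {'Title A': ['FR', 'DE'], 'Title B': ['AR']}
titles = sorted(alt, key=lambda a: sorted(alt[a]))

# deduplicate while keeping first-seen order (was: cmp on list indices)
items = ['b', 'a', 'b', 'c']
unique = sorted(set(items), key=items.index)

print(titles)   # ['Title B', 'Title A']
print(unique)   # ['b', 'a', 'c']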

View file

@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
from ox.net import read_url
@ -13,5 +15,5 @@ def get_poster_url(id):
return ''
if __name__ == '__main__':
print get_poster_url('0749451')
print(get_poster_url('0749451'))

View file

@ -2,22 +2,24 @@
# vi:si:et:sw=4:sts=4:ts=4
import re
from six import string_types
from ..cache import read_url
from .. import strip_tags, decode_html
from .. import decode_html
from ..utils import datetime
def cleanup(key, data, data_type):
if data:
if isinstance(data[0], basestring):
if isinstance(data[0], string_types):
#FIXME: some types need strip_tags
#data = [strip_tags(decode_html(p)).strip() for p in data]
data = [decode_html(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1 and not isinstance(data, basestring):
while len(data) == 1 and not isinstance(data, string_types):
data = data[0]
if data_type == 'list' and isinstance(data, basestring):
if data_type == 'list' and isinstance(data, string_types):
data = [data, ]
elif data_type != 'list':
data = ''
@ -40,7 +42,7 @@ class SiteParser(dict):
for key in self.regex:
url = self.get_url(self.regex[key]['page'])
data = self.read_url(url, timeout)
if isinstance(self.regex[key]['re'], basestring):
if isinstance(self.regex[key]['re'], string_types):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type'])
elif callable(self.regex[key]['re']):
@ -51,7 +53,7 @@ class SiteParser(dict):
f = r
else:
f = re.compile(r, re.DOTALL).findall
if isinstance(data, basestring):
if isinstance(data, string_types):
data = f(data)
else:
data = [f(d) for d in data]
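
basestring does not exist on Python 3, so six.string_types (str on 3, basestring on 2) keeps these isinstance checks working; the pattern on its own, with a hypothetical helper name:

from six import string_types

def as_list(value):
    # a single string becomes a one-element list; real sequences pass through
    if isinstance(value, string_types):
        return [value]
    return list(value)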

View file

@ -1,11 +1,14 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re
from urllib import urlencode
from six.moves import urllib
from ox.utils import json
from ox.cache import read_url
from ox import find_re, decode_html
from ox import find_re
def get_id(url):
@ -138,11 +141,11 @@ def get_allmovie_id(wikipedia_url):
def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
data = read_url(url)
if not data:
data = read_url(url, timeout=0)
result = json.loads(data)
result = json.loads(data.decode('utf-8'))
results = []
if result and 'query' in result:
for r in result['query']['search']:

View file

@ -36,15 +36,16 @@ setup(
download_url="http://code.0x2620.org/python-ox/download",
license="GPLv3",
packages=['ox', 'ox.django', 'ox.django.api', 'ox.torrent', 'ox.web'],
install_requires=['chardet', 'feedparser'],
install_requires=['six', 'chardet', 'feedparser'],
keywords = [
],
classifiers = [
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Topic :: Software Development :: Libraries :: Python Modules',
],
)