use six to support python 2 and 3

Author: j
Date: 2014-09-30 21:04:46 +02:00
Parent: 1b1dcf1c58
Commit: d4d09b56b6

28 changed files with 1730 additions and 1678 deletions
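
A short, hedged sketch of the idioms the commit standardizes on: stdlib renames go through six.moves, byte buffers use six.BytesIO instead of StringIO, print becomes a function, and "except E, e" becomes "except E as e". The fetch() helper below is illustrative only, not part of the codebase; it assumes six is installed.

    from __future__ import print_function

    import gzip

    from six import BytesIO
    from six.moves import urllib


    def fetch(url):
        # urllib2 (Python 2) and urllib.request/urllib.error (Python 3) via one import
        req = urllib.request.Request(url, headers={'User-Agent': 'ox-sketch'})
        try:
            response = urllib.request.urlopen(req)
        except urllib.error.HTTPError as e:  # "as" syntax works on both versions
            print('HTTP error', e.code)
            return e.read()
        data = response.read()
        if response.headers.get('content-encoding') == 'gzip':
            # responses are bytes, so decompress through BytesIO, not StringIO
            data = gzip.GzipFile(fileobj=BytesIO(data)).read()
        return data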

View file

@ -3,28 +3,32 @@
# GPL 2011 # GPL 2011
__version__ = '2.1.1' __version__ = '2.1.1'
import cache from . import cache
import js from . import js
import jsonc from . import jsonc
import net from . import net
import srt from . import srt
import utils from . import utils
from api import * from .api import *
from file import * from .file import *
from form import * from .form import *
from format import * from .format import *
from geo import * from .geo import *
from html import * from .html import *
#image depends on PIL, not easy enough to install on osx #image depends on PIL, not easy enough to install on osx
try: try:
from image import * from .image import *
except: except:
pass pass
from location import * from .location import *
from movie import * from .movie import *
from normalize import * from .normalize import *
from oembed import * from .oembed import *
from text import * from .text import *
from torrent import * #currently broken in python3
from fixunicode import * try:
from .torrent import *
except:
pass
from .fixunicode import *

View file

@ -3,10 +3,10 @@
# GPL 2011 # GPL 2011
from __future__ import with_statement from __future__ import with_statement
import cookielib from six.moves import http_cookiejar as cookielib
import gzip import gzip
import StringIO from six import StringIO
import urllib2 from six.moves import urllib
from types import MethodType from types import MethodType
from . import __version__ from . import __version__
@ -29,8 +29,8 @@ class API(object):
self._cj = cj self._cj = cj
else: else:
self._cj = cookielib.CookieJar() self._cj = cookielib.CookieJar()
self._opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self._cj), self._opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self._cj),
urllib2.HTTPHandler(debuglevel=self.debuglevel)) urllib.request.HTTPHandler(debuglevel=self.debuglevel))
self._opener.addheaders = [ self._opener.addheaders = [
('User-Agent', '%s/%s' % (self.__name__, self.__version__)) ('User-Agent', '%s/%s' % (self.__name__, self.__version__))
] ]
@ -64,7 +64,7 @@ class API(object):
result = {} result = {}
try: try:
body = str(form) body = str(form)
request = urllib2.Request(str(url)) request = urllib.request.Request(str(url))
request.add_header('Content-type', form.get_content_type()) request.add_header('Content-type', form.get_content_type())
request.add_header('Content-Length', str(len(body))) request.add_header('Content-Length', str(len(body)))
request.add_header('Accept-Encoding', 'gzip, deflate') request.add_header('Accept-Encoding', 'gzip, deflate')
@ -75,7 +75,7 @@ class API(object):
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read() result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
result = result.decode('utf-8') result = result.decode('utf-8')
return json.loads(result) return json.loads(result)
except urllib2.HTTPError, e: except urllib.error.HTTPError as e:
if self.DEBUG: if self.DEBUG:
import webbrowser import webbrowser
if e.code >= 500: if e.code >= 500:

View file

@ -1,24 +1,22 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
# GPL 2011 # GPL 2011
from __future__ import with_statement from __future__ import with_statement, print_function
import gzip import gzip
import zlib import zlib
import hashlib import hashlib
import os import os
import StringIO from six import BytesIO
import time import time
import urlparse from six.moves import urllib
import urllib2
import sqlite3 import sqlite3
import chardet from .utils import json
from utils import json
from .file import makedirs from .file import makedirs
import net from . import net
from net import DEFAULT_HEADERS, detect_encoding from .net import DEFAULT_HEADERS, detect_encoding
cache_timeout = 30*24*60*60 # default is 30 days cache_timeout = 30*24*60*60 # default is 30 days
@ -69,7 +67,7 @@ class InvalidResult(Exception):
self.headers = headers self.headers = headers
def _fix_unicode_url(url): def _fix_unicode_url(url):
if isinstance(url, unicode): if not isinstance(url, bytes):
url = url.encode('utf-8') url = url.encode('utf-8')
return url return url
@ -83,24 +81,30 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
if this function fails, InvalidResult will be raised deal with it in your code if this function fails, InvalidResult will be raised deal with it in your code
''' '''
if net.DEBUG: if net.DEBUG:
print 'ox.cache.read_url', url print('ox.cache.read_url', url)
#FIXME: send last-modified / etag from cache and only update if needed #FIXME: send last-modified / etag from cache and only update if needed
url = _fix_unicode_url(url) #url = _fix_unicode_url(url)
result = store.get(url, data, headers, timeout) result = store.get(url, data, headers, timeout)
url_headers = {}
if not result: if not result:
try: try:
url_headers, result = net.read_url(url, data, headers, return_headers=True) url_headers, result = net.read_url(url, data, headers, return_headers=True)
except urllib2.HTTPError, e: except urllib.error.HTTPError as e:
e.headers['Status'] = "%s" % e.code e.headers['Status'] = "%s" % e.code
url_headers = dict(e.headers) for key in e.headers:
url_headers[key.lower()] = e.headers[key]
result = e.read() result = e.read()
if url_headers.get('content-encoding', None) == 'gzip': if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read() result = gzip.GzipFile(fileobj=BytesIO(result)).read()
if not valid or valid(result, url_headers): if not valid or valid(result, url_headers):
store.set(url, post_data=data, data=result, headers=url_headers) store.set(url, post_data=data, data=result, headers=url_headers)
else: else:
raise InvalidResult(result, url_headers) raise InvalidResult(result, url_headers)
if unicode: if unicode:
ctype = url_headers.get('content-type', '').lower()
if 'charset' in ctype:
encoding = ctype.split('charset=')[-1]
else:
encoding = detect_encoding(result) encoding = detect_encoding(result)
if not encoding: if not encoding:
encoding = 'latin-1' encoding = 'latin-1'
@ -143,9 +147,8 @@ class SQLiteCache(Cache):
self.create() self.create()
def connect(self): def connect(self):
conn = sqlite3.connect(self.db, timeout=10) self.conn = sqlite3.connect(self.db, timeout=10)
conn.text_factory = str return self.conn
return conn
def create(self): def create(self):
conn = self.connect() conn = self.connect()
@ -177,9 +180,9 @@ class SQLiteCache(Cache):
if timeout == 0: if timeout == 0:
return r return r
if data: if data:
url_hash = hashlib.sha1(url + '?' + data).hexdigest() url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()
else: else:
url_hash = hashlib.sha1(url).hexdigest() url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
conn = self.connect() conn = self.connect()
c = conn.cursor() c = conn.cursor()
@ -210,11 +213,11 @@ class SQLiteCache(Cache):
def set(self, url, post_data, data, headers): def set(self, url, post_data, data, headers):
if post_data: if post_data:
url_hash = hashlib.sha1(url + '?' + post_data).hexdigest() url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()
else: else:
url_hash = hashlib.sha1(url).hexdigest() url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:]) domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
conn = self.connect() conn = self.connect()
c = conn.cursor() c = conn.cursor()
@ -266,11 +269,11 @@ class FileCache(Cache):
return r return r
if data: if data:
url_hash = hashlib.sha1(url + '?' + data).hexdigest() url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()
else: else:
url_hash = hashlib.sha1(url).hexdigest() url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:]) domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
prefix, i, f = self.files(domain, url_hash) prefix, i, f = self.files(domain, url_hash)
if os.path.exists(i): if os.path.exists(i):
with open(i) as _i: with open(i) as _i:
@ -295,11 +298,11 @@ class FileCache(Cache):
def set(self, url, post_data, data, headers): def set(self, url, post_data, data, headers):
if post_data: if post_data:
url_hash = hashlib.sha1(url + '?' + post_data).hexdigest() url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()
else: else:
url_hash = hashlib.sha1(url).hexdigest() url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()
domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:]) domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
prefix, i, f = self.files(domain, url_hash) prefix, i, f = self.files(domain, url_hash)
makedirs(prefix) makedirs(prefix)
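
The cache changes above all reduce to one rule: hash the UTF-8 encoding of the URL (plus the POST data, when present), because hashlib only accepts bytes on Python 3. A minimal sketch of that key scheme; cache_key() is an illustrative name, not a function from the module.

    import hashlib


    def cache_key(url, post_data=None):
        # sha1 needs bytes on Python 3, so encode the text key first
        key = url if post_data is None else url + '?' + post_data
        return hashlib.sha1(key.encode('utf-8')).hexdigest()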

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
# GPL 2008 # GPL 2008
from __future__ import division, with_statement from __future__ import division, with_statement, print_function
import os import os
import hashlib import hashlib
import re import re
@ -10,7 +10,7 @@ import struct
import subprocess import subprocess
import sqlite3 import sqlite3
from ox.utils import json from .utils import json
__all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs'] __all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs']
@ -283,19 +283,19 @@ def makedirs(path):
if not os.path.exists(path): if not os.path.exists(path):
try: try:
os.makedirs(path) os.makedirs(path)
except OSError, e: except OSError as e:
if e.errno != 17: if e.errno != 17:
raise raise
def copy_file(source, target, verbose=False): def copy_file(source, target, verbose=False):
if verbose: if verbose:
print 'copying', source, 'to', target print('copying', source, 'to', target)
write_path(target) write_path(target)
shutil.copyfile(source, target) shutil.copyfile(source, target)
def read_file(file, verbose=False): def read_file(file, verbose=False):
if verbose: if verbose:
print 'reading', file print('reading', file)
f = open(file) f = open(file)
data = f.read() data = f.read()
f.close() f.close()
@ -303,14 +303,14 @@ def read_file(file, verbose=False):
def read_json(file, verbose=False): def read_json(file, verbose=False):
if verbose: if verbose:
print 'reading', file print('reading', file)
with open(file) as fd: with open(file) as fd:
data = json.load(fd) data = json.load(fd)
return data return data
def write_file(file, data, verbose=False): def write_file(file, data, verbose=False):
if verbose: if verbose:
print 'writing', file print('writing', file)
write_path(file) write_path(file)
f = open(file, 'w') f = open(file, 'w')
f.write(data) f.write(data)
@ -319,7 +319,7 @@ def write_file(file, data, verbose=False):
def write_image(file, image, verbose=False): def write_image(file, image, verbose=False):
if verbose: if verbose:
print 'writing', file print('writing', file)
write_path(file) write_path(file)
image.save(file) image.save(file)
@ -329,7 +329,7 @@ def write_json(file, data, ensure_ascii=True, indent=0, sort_keys=False, verbose
def write_link(source, target, verbose=False): def write_link(source, target, verbose=False):
if verbose: if verbose:
print 'linking', source, 'to', target print('linking', source, 'to', target)
write_path(target) write_path(target)
if os.path.exists(target): if os.path.exists(target):
os.unlink(target) os.unlink(target)

View file

@ -2,13 +2,16 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# from http://blog.lumino.so/2012/08/20/fix-unicode-mistakes-with-python/ # from http://blog.lumino.so/2012/08/20/fix-unicode-mistakes-with-python/
# MIT # MIT
from __future__ import print_function
import unicodedata import unicodedata
from six import unichr
__all__ = ['fix_bad_unicode'] __all__ = ['fix_bad_unicode']
def fix_bad_unicode(text): def fix_bad_unicode(text):
u""" """
Something you will find all over the place, in real-world text, is text Something you will find all over the place, in real-world text, is text
that's mistakenly encoded as utf-8, decoded in some ugly format like that's mistakenly encoded as utf-8, decoded in some ugly format like
latin-1 or even Windows codepage 1252, and encoded as utf-8 again. latin-1 or even Windows codepage 1252, and encoded as utf-8 again.
@ -26,52 +29,53 @@ def fix_bad_unicode(text):
auto-decode bytes for you -- then it would just create the problems it's auto-decode bytes for you -- then it would just create the problems it's
supposed to fix. supposed to fix.
>>> print fix_bad_unicode(u'único') >>> fix_bad_unicode(u'único')
único 'único'
>>> fix_bad_unicode('This text is fine already :þ')
'This text is fine already :þ'
>>> print fix_bad_unicode(u'This text is fine already :þ')
This text is fine already :þ
Because these characters often come from Microsoft products, we allow Because these characters often come from Microsoft products, we allow
for the possibility that we get not just Unicode characters 128-255, but for the possibility that we get not just Unicode characters 128-255, but
also Windows's conflicting idea of what characters 128-160 are. also Windows's conflicting idea of what characters 128-160 are.
>>> print fix_bad_unicode(u'This — should be an em dash') >>> fix_bad_unicode('This — should be an em dash')
This should be an em dash 'This — should be an em dash'
We might have to deal with both Windows characters and raw control We might have to deal with both Windows characters and raw control
characters at the same time, especially when dealing with characters like characters at the same time, especially when dealing with characters like
\x81 that have no mapping in Windows. \x81 that have no mapping in Windows.
>>> print fix_bad_unicode(u'This text is sad .â\x81”.') >>> fix_bad_unicode('This text is sad .â\x81”.')
This text is sad .. 'This text is sad .⁔.'
This function even fixes multiple levels of badness: This function even fixes multiple levels of badness:
>>> wtf = u'\xc3\xa0\xc2\xb2\xc2\xa0_\xc3\xa0\xc2\xb2\xc2\xa0' >>> wtf = '\xc3\xa0\xc2\xb2\xc2\xa0_\xc3\xa0\xc2\xb2\xc2\xa0'
>>> print fix_bad_unicode(wtf) >>> fix_bad_unicode(wtf)
ಠ_ಠ 'ಠ_ಠ'
However, it has safeguards against fixing sequences of letters and However, it has safeguards against fixing sequences of letters and
punctuation that can occur in valid text: punctuation that can occur in valid text:
>>> print fix_bad_unicode(u'not such a fan of Charlotte Brontë…”') >>> fix_bad_unicode('not such a fan of Charlotte Brontë…”')
not such a fan of Charlotte Brontë 'not such a fan of Charlotte Brontë…”'
Cases of genuine ambiguity can sometimes be addressed by finding other Cases of genuine ambiguity can sometimes be addressed by finding other
characters that are not double-encoding, and expecting the encoding to characters that are not double-encoding, and expecting the encoding to
be consistent: be consistent:
>>> print fix_bad_unicode(u'AHÅ™, the new sofa from IKEA®') >>> fix_bad_unicode('AHÅ™, the new sofa from IKEA®')
AHÅ, the new sofa from IKEA® 'AHÅ™, the new sofa from IKEA®'
Finally, we handle the case where the text is in a single-byte encoding Finally, we handle the case where the text is in a single-byte encoding
that was intended as Windows-1252 all along but read as Latin-1: that was intended as Windows-1252 all along but read as Latin-1:
>>> print fix_bad_unicode(u'This text was never Unicode at all\x85') >>> fix_bad_unicode('This text was never Unicode at all\x85')
This text was never Unicode at all 'This text was never Unicode at all…'
""" """
if not isinstance(text, unicode): if not isinstance(text, str):
raise TypeError("This isn't even decoded into Unicode yet. " raise TypeError("This isn't even decoded into Unicode yet. "
"Decode it first.") "Decode it first.")
if len(text) == 0: if len(text) == 0:
@ -118,7 +122,7 @@ def reinterpret_windows1252_as_utf8(wrongtext):
altered_bytes.append(char.encode('WINDOWS_1252')) altered_bytes.append(char.encode('WINDOWS_1252'))
else: else:
altered_bytes.append(char.encode('latin-1', 'replace')) altered_bytes.append(char.encode('latin-1', 'replace'))
return ''.join(altered_bytes).decode('utf-8', 'replace') return b''.join(altered_bytes).decode('utf-8', 'replace')
def reinterpret_latin1_as_windows1252(wrongtext): def reinterpret_latin1_as_windows1252(wrongtext):
@ -130,7 +134,7 @@ def reinterpret_latin1_as_windows1252(wrongtext):
def text_badness(text): def text_badness(text):
u''' '''
Look for red flags that text is encoded incorrectly: Look for red flags that text is encoded incorrectly:
Obvious problems: Obvious problems:
@ -147,12 +151,12 @@ def text_badness(text):
- Improbable single-byte characters, such as ƒ or ¬ - Improbable single-byte characters, such as ƒ or ¬
- Letters in somewhat rare scripts - Letters in somewhat rare scripts
''' '''
assert isinstance(text, unicode) assert isinstance(text, str)
errors = 0 errors = 0
very_weird_things = 0 very_weird_things = 0
weird_things = 0 weird_things = 0
prev_letter_script = None prev_letter_script = None
for pos in xrange(len(text)): for pos in range(len(text)):
char = text[pos] char = text[pos]
index = ord(char) index = ord(char)
if index < 256: if index < 256:
@ -241,7 +245,7 @@ WINDOWS_1252_GREMLINS = [
] ]
# a list of Unicode characters that might appear in Windows-1252 text # a list of Unicode characters that might appear in Windows-1252 text
WINDOWS_1252_CODEPOINTS = range(256) + WINDOWS_1252_GREMLINS WINDOWS_1252_CODEPOINTS = list(range(256)) + WINDOWS_1252_GREMLINS
# Rank the characters typically represented by a single byte -- that is, in # Rank the characters typically represented by a single byte -- that is, in
# Latin-1 or Windows-1252 -- by how weird it would be to see them in running # Latin-1 or Windows-1252 -- by how weird it would be to see them in running
@ -286,7 +290,7 @@ SINGLE_BYTE_WEIRDNESS = (
# letters. We'll need it often. # letters. We'll need it often.
SINGLE_BYTE_LETTERS = [ SINGLE_BYTE_LETTERS = [
unicodedata.category(unichr(i)).startswith('L') unicodedata.category(unichr(i)).startswith('L')
for i in xrange(256) for i in range(256)
] ]
# A table telling us how to interpret the first word of a letter's Unicode # A table telling us how to interpret the first word of a letter's Unicode

View file

@ -1,17 +1,34 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2014
from __future__ import with_statement, print_function
import itertools import itertools
import mimetools
import mimetypes import mimetypes
import random
import sys
__all__ = ['MultiPartForm'] __all__ = ['MultiPartForm']
# from /usr/lib/python3.4/email/generator.py
# Helper used by Generator._make_boundary
_width = len(repr(sys.maxsize-1))
_fmt = '%%0%dd' % _width
def _make_boundary():
# Craft a random boundary.
token = random.randrange(sys.maxsize)
boundary = ('=' * 15) + (_fmt % token) + '=='
return boundary
class MultiPartForm(object): class MultiPartForm(object):
"""Accumulate the data to be used when posting a form.""" """Accumulate the data to be used when posting a form."""
def __init__(self): def __init__(self):
self.form_fields = [] self.form_fields = []
self.files = [] self.files = []
self.boundary = mimetools.choose_boundary() self.boundary = _make_boundary()
return return
def get_content_type(self): def get_content_type(self):
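
The _make_boundary() helper above is copied from email.generator because mimetools.choose_boundary no longer exists on Python 3. As an aside, a shorter cross-version alternative (not what this commit does) would be to derive the boundary from a UUID:

    import uuid


    def make_boundary():
        # any token unlikely to occur in the payload works as a MIME boundary
        return '===============%s==' % uuid.uuid4().hex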

View file

@ -20,7 +20,7 @@ def toAZ(num):
>>> toAZ(1234567890) >>> toAZ(1234567890)
'CYWOQVJ' 'CYWOQVJ'
""" """
if num < 1: raise ValueError, "must supply a positive integer" if num < 1: raise ValueError("must supply a positive integer")
digits = string.ascii_uppercase digits = string.ascii_uppercase
az = '' az = ''
while num != 0: while num != 0:
@ -62,7 +62,7 @@ def to26(q):
>>> to26(347485647) >>> to26(347485647)
'BDGKMAP' 'BDGKMAP'
""" """
if q < 0: raise ValueError, "must supply a positive integer" if q < 0: raise ValueError("must supply a positive integer")
base26 = string.ascii_uppercase base26 = string.ascii_uppercase
converted = [] converted = []
while q != 0: while q != 0:
@ -119,7 +119,7 @@ def to32(q):
ValueError: must supply a positive integer ValueError: must supply a positive integer
""" """
if q < 0: raise ValueError, "must supply a positive integer" if q < 0: raise ValueError("must supply a positive integer")
letters = "0123456789ABCDEFGHJKMNPQRSTVWXYZ" letters = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
converted = [] converted = []
while q != 0: while q != 0:
@ -206,7 +206,7 @@ def to36(q):
... ...
ValueError: must supply a positive integer ValueError: must supply a positive integer
""" """
if q < 0: raise ValueError, "must supply a positive integer" if q < 0: raise ValueError("must supply a positive integer")
letters = "0123456789abcdefghijklmnopqrstuvwxyz" letters = "0123456789abcdefghijklmnopqrstuvwxyz"
converted = [] converted = []
while q != 0: while q != 0:

ox/geo.py (2957 changed lines)

File diff suppressed because it is too large

View file

@ -1,9 +1,11 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
# GPL 2008 # GPL 2008
import sys
import re import re
import string import string
from htmlentitydefs import name2codepoint from six.moves.html_entities import name2codepoint
from six import unichr
# Configuration for add_links() function # Configuration for add_links() function
@ -23,7 +25,8 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE) html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL) hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z') trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
del x # Temporary variable if sys.version_info[0] == 2:
del x # Temporary variable
def escape(html): def escape(html):
''' '''
@ -146,12 +149,9 @@ def decode_html(html):
>>> decode_html('Anniversary of Daoud&apos;s Republic') >>> decode_html('Anniversary of Daoud&apos;s Republic')
u"Anniversary of Daoud's Republic" u"Anniversary of Daoud's Republic"
""" """
if type(html) != unicode: if isinstance(html, bytes):
html = unicode(html)[:] html = html.decode('utf-8')
if type(html) is unicode:
uchr = unichr uchr = unichr
else:
uchr = lambda value: value > 255 and unichr(value) or chr(value)
def entitydecode(match, uchr=uchr): def entitydecode(match, uchr=uchr):
entity = match.group(1) entity = match.group(1)
if entity == '#x80': if entity == '#x80':

View file

@ -1,10 +1,10 @@
#!/usr/bin/python #!/usr/bin/python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement from __future__ import with_statement, print_function
from js import minify from .js import minify
from utils import json from .utils import json
def load(f): def load(f):
@ -14,7 +14,7 @@ def loads(source):
try: try:
minified = minify(source) minified = minify(source)
return json.loads(minified) return json.loads(minified)
except json.JSONDecodeError, e: except json.JSONDecodeError as e:
s = minified.split('\n') s = minified.split('\n')
context = s[e.lineno-1][max(0, e.colno-1):e.colno+30] context = s[e.lineno-1][max(0, e.colno-1):e.colno+30]
msg = e.msg + ' at ' + context msg = e.msg + ' at ' + context

View file

@ -9,9 +9,9 @@ import os
import re import re
import unicodedata import unicodedata
from normalize import normalize_name from .normalize import normalize_name
from text import get_sort_name, find_re from .text import get_sort_name, find_re
from file import EXTENSIONS from .file import EXTENSIONS
__all__ = ['parse_movie_path', 'create_movie_path', 'get_oxid'] __all__ = ['parse_movie_path', 'create_movie_path', 'get_oxid']

View file

@ -1,13 +1,13 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
# GPL 2008 # GPL 2008
from __future__ import with_statement, print_function
import os import os
import gzip import gzip
import re import re
import StringIO from six import BytesIO
import struct import struct
import urllib from six.moves import urllib
import urllib2
from chardet.universaldetector import UniversalDetector from chardet.universaldetector import UniversalDetector
@ -26,7 +26,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS):
try: try:
f = open_url(url, data, headers) f = open_url(url, data, headers)
s = f.code s = f.code
except urllib2.HTTPError, e: except urllib.error.HTTPError as e:
s = e.code s = e.code
return s return s
@ -42,46 +42,59 @@ def get_headers(url, data=None, headers=DEFAULT_HEADERS):
f.headers['Status'] = "%s" % f.code f.headers['Status'] = "%s" % f.code
headers = f.headers headers = f.headers
f.close() f.close()
except urllib2.HTTPError, e: except urllib.error.HTTPError as e:
e.headers['Status'] = "%s" % e.code e.headers['Status'] = "%s" % e.code
headers = e.headers headers = e.headers
return dict(headers) return dict(headers)
def open_url(url, data=None, headers=DEFAULT_HEADERS): def open_url(url, data=None, headers=DEFAULT_HEADERS):
if isinstance(url, bytes):
url = url.decode('utf-8')
url = url.replace(' ', '%20') url = url.replace(' ', '%20')
req = urllib2.Request(url, data, headers) req = urllib.request.Request(url, data, headers)
return urllib2.urlopen(req) return urllib.request.urlopen(req)
def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False): def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
if DEBUG: if DEBUG:
print 'ox.net.read_url', url print('ox.net.read_url', url)
f = open_url(url, data, headers) f = open_url(url, data, headers)
result = f.read() result = f.read()
f.close() f.close()
if f.headers.get('content-encoding', None) == 'gzip': if f.headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read() result = gzip.GzipFile(fileobj=BytesIO(result)).read()
if unicode: if unicode:
ctype = f.headers.get('content-type', '').lower()
if 'charset' in ctype:
encoding = ctype.split('charset=')[-1]
else:
encoding = detect_encoding(result) encoding = detect_encoding(result)
if not encoding: if not encoding:
encoding = 'latin-1' encoding = 'latin-1'
result = result.decode(encoding) result = result.decode(encoding)
if return_headers: if return_headers:
f.headers['Status'] = "%s" % f.code f.headers['Status'] = "%s" % f.code
return dict(f.headers), result headers = {}
for key in f.headers:
headers[key.lower()] = f.headers[key]
return headers, result
return result return result
def detect_encoding(data): def detect_encoding(data):
data_lower = data.lower() data_lower = data.lower().decode('utf-8', 'ignore')
charset = re.compile('content="text/html; charset=(.*?)"').findall(data) charset = re.compile('content="text/html; charset=(.*?)"').findall(data_lower)
if not charset: if not charset:
charset = re.compile('meta charset="(.*?)"').findall(data) charset = re.compile('meta charset="(.*?)"').findall(data_lower)
if charset: if charset:
return charset[0].lower() return charset[0].lower()
detector = UniversalDetector() detector = UniversalDetector()
for line in data.split('\n'): p = 0
detector.feed(line) l = len(data)
s = 1024
while p < l:
detector.feed(data[p:p+s])
if detector.done: if detector.done:
break break
p += s
detector.close() detector.close()
return detector.result['encoding'] return detector.result['encoding']
@ -97,9 +110,9 @@ def save_url(url, filename, overwrite=False):
def oshash(url): def oshash(url):
def get_size(url): def get_size(url):
req = urllib2.Request(url, headers=DEFAULT_HEADERS.copy()) req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
req.get_method = lambda : 'HEAD' req.get_method = lambda : 'HEAD'
u = urllib2.urlopen(req) u = urllib.request.urlopen(req)
if u.code != 200 or not 'Content-Length' in u.headers: if u.code != 200 or not 'Content-Length' in u.headers:
raise IOError raise IOError
return int(u.headers['Content-Length']) return int(u.headers['Content-Length'])
@ -107,8 +120,8 @@ def oshash(url):
def get_range(url, start, end): def get_range(url, start, end):
headers = DEFAULT_HEADERS.copy() headers = DEFAULT_HEADERS.copy()
headers['Range'] = 'bytes=%s-%s' % (start, end) headers['Range'] = 'bytes=%s-%s' % (start, end)
req = urllib2.Request(url, headers=headers) req = urllib.request.Request(url, headers=headers)
u = urllib2.urlopen(req) u = urllib.request.urlopen(req)
return u.read() return u.read()
try: try:
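
The rewritten detect_encoding() feeds chardet's UniversalDetector fixed-size byte slices instead of splitting the data on '\n', which would fail for bytes under Python 3. A standalone sketch of that loop, assuming chardet is installed (guess_encoding() is an illustrative name):

    from chardet.universaldetector import UniversalDetector


    def guess_encoding(data, chunk_size=1024):
        detector = UniversalDetector()
        for start in range(0, len(data), chunk_size):
            detector.feed(data[start:start + chunk_size])
            if detector.done:
                break
        detector.close()
        return detector.result['encoding']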

View file

@ -1,9 +1,10 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# ci:si:et:sw=4:sts=4:ts=4 # ci:si:et:sw=4:sts=4:ts=4
import re import re
from text import find_re
import cache from . import cache
from utils import json, ET from .text import find_re
from .utils import json, ET
def get_embed_code(url, maxwidth=None, maxheight=None): def get_embed_code(url, maxwidth=None, maxheight=None):
embed = {} embed = {}

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement, division from __future__ import with_statement, division, print_function
import chardet import chardet
import re import re
import codecs import codecs
@ -71,7 +71,7 @@ def load(filename, offset=0):
try: try:
data = unicode(data, 'latin-1') data = unicode(data, 'latin-1')
except: except:
print "failed to detect encoding, giving up" print("failed to detect encoding, giving up")
return srt return srt
data = data.replace('\r\n', '\n') data = data.replace('\r\n', '\n')

View file

@ -6,7 +6,7 @@ from threading import Event
from hashlib import sha1 from hashlib import sha1
import os import os
from bencode import bencode, bdecode from .bencode import bencode, bdecode
__all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size'] __all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size']
@ -24,9 +24,8 @@ def get_info_hash(torrentFile):
return sha1(bencode(info)).hexdigest() return sha1(bencode(info)).hexdigest()
def get_torrent_info(data=None, file=None): def get_torrent_info(data=None, file=None):
from bencode import bencode
if file: if file:
if isinstance(file, unicode): if not isinstance(file, bytes):
file = file.encode('utf-8') file = file.encode('utf-8')
with open(file, 'rb') as f: with open(file, 'rb') as f:
data = f.read() data = f.read()
@ -36,7 +35,7 @@ def get_torrent_info(data=None, file=None):
metainfo = bdecode(data) metainfo = bdecode(data)
info = metainfo['info'] info = metainfo['info']
piece_length = info['piece length'] piece_length = info['piece length']
if info.has_key('length'): if 'length' in info:
# let's assume we just have one file # let's assume we just have one file
file_length = info['length'] file_length = info['length']
else: else:

View file

@ -2,8 +2,8 @@
# encoding: utf-8 # encoding: utf-8
__version__ = '1.0.0' __version__ = '1.0.0'
import imdb from . import imdb
import wikipedia from . import wikipedia
import google from . import google
import piratecinema from . import piratecinema
import oxdb from . import oxdb

View file

@ -1,7 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
import time
from ox import strip_tags, find_re from ox import strip_tags, find_re
from ox.cache import read_url from ox.cache import read_url

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from urllib import quote from six.moves.urllib.parse import quote
from ox import find_re, strip_tags, decode_html from ox import find_re, strip_tags, decode_html
from ox.cache import read_url from ox.cache import read_url

View file

@ -1,14 +1,11 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
from urllib import urlencode
import json import json
import os import os
import re import re
from ox import find_re, strip_tags, decode_html from ox import find_re, strip_tags
from ox.cache import read_url from ox.cache import read_url
from ox.net import open_url
def get_data(id, language='en'): def get_data(id, language='en'):
if language == 'de': if language == 'de':
@ -57,7 +54,7 @@ def backup(filename):
data = json.load(f) data = json.load(f)
else: else:
data = {} data = {}
start = ids and max(map(int, data)) or 1 start = max(map(int, data)) or 1
for i in range(start, 11872): for i in range(start, 11872):
info = get_data(i) info = get_data(i)
if info: if info:

View file

@ -5,7 +5,7 @@ import re
import ox.cache import ox.cache
from ox.cache import read_url from ox.cache import read_url
from ox.html import strip_tags from ox.html import strip_tags
from ox.text import find_re, remove_special_characters from ox.text import find_re
import imdb import imdb

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from urllib import unquote from six.moves.urllib.parse import unquote
from ox.cache import read_url from ox.cache import read_url

View file

@ -1,17 +1,17 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
import urllib
from six.moves import urllib
import ox import ox
from ox import strip_tags, decode_html from ox import strip_tags, decode_html
from ox.utils import json
from ox.cache import read_url from ox.cache import read_url
def find(query, timeout=ox.cache.cache_timeout): def find(query, timeout=ox.cache.cache_timeout):
if isinstance(query, unicode): if not isinstance(query, bytes):
query = query.encode('utf-8') query = query.encode('utf-8')
params = urllib.urlencode({'q': query}) params = urllib.parse.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params url = 'http://duckduckgo.com/html/?' + params
data = read_url(url, timeout=timeout).decode('utf-8') data = read_url(url, timeout=timeout).decode('utf-8')
results = [] results = []

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
import urllib from six.moves import urllib
import ox import ox
from ox import strip_tags, decode_html from ox import strip_tags, decode_html
@ -13,9 +13,9 @@ def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIM
return ox.cache.read_url(url, data, headers, timeout, unicode=True) return ox.cache.read_url(url, data, headers, timeout, unicode=True)
def quote_plus(s): def quote_plus(s):
if not isinstance(s, str): if not isinstance(s, bytes):
s = s.encode('utf-8') s = s.encode('utf-8')
return urllib.quote_plus(s) return urllib.parse.quote_plus(s)
def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT): def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
""" """

View file

@ -1,23 +1,27 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import urllib from __future__ import print_function
import re import re
import time import time
import unicodedata import unicodedata
import ox from six.moves import urllib
from ox import find_re, strip_tags from six import string_types
import ox.cache
from siteparser import SiteParser
import duckduckgo
from .. import find_re, strip_tags, decode_html
from .. import cache
from . siteparser import SiteParser
from . import duckduckgo
from ..utils import datetime from ..utils import datetime
from ..geo import normalize_country_name from ..geo import normalize_country_name
def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False): def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy() headers = headers.copy()
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode) return cache.read_url(url, data, headers, timeout, unicode=unicode)
def get_url(id): def get_url(id):
return "http://www.imdb.com/title/tt%s/" % id return "http://www.imdb.com/title/tt%s/" % id
@ -49,7 +53,7 @@ class Imdb(SiteParser):
'page': 'business', 'page': 'business',
're': [ 're': [
'<h5>Budget</h5>\s*?\$(.*?)<br', '<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+') lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
], ],
'type': 'int' 'type': 'int'
}, },
@ -211,7 +215,7 @@ class Imdb(SiteParser):
'page': 'releaseinfo', 'page': 'releaseinfo',
're': [ 're': [
'<td class="release_date">(.*?)</td>', '<td class="release_date">(.*?)</td>',
ox.strip_tags, strip_tags,
], ],
'type': 'list' 'type': 'list'
}, },
@ -326,7 +330,7 @@ class Imdb(SiteParser):
if 'alternativeTitles' in self: if 'alternativeTitles' in self:
if len(self['alternativeTitles']) == 2 and \ if len(self['alternativeTitles']) == 2 and \
isinstance(self['alternativeTitles'][0], basestring): isinstance(self['alternativeTitles'][0], string_types):
self['alternativeTitles'] = [self['alternativeTitles']] self['alternativeTitles'] = [self['alternativeTitles']]
#normalize country names #normalize country names
@ -472,7 +476,7 @@ class Imdb(SiteParser):
if c: if c:
alt[title].append(c) alt[title].append(c)
self['alternativeTitles'] = [] self['alternativeTitles'] = []
for t in sorted(alt, lambda a, b: cmp(sorted(alt[a]), sorted(alt[b]))): for t in sorted(alt, key=lambda a: sorted(alt[a])):
if alt[t]: if alt[t]:
countries = sorted([normalize_country_name(c) or c for c in alt[t]]) countries = sorted([normalize_country_name(c) or c for c in alt[t]])
self['alternativeTitles'].append((t, countries)) self['alternativeTitles'].append((t, countries))
@ -492,7 +496,7 @@ class Imdb(SiteParser):
if 'votes' in self: self['votes'] = self['votes'].replace(',', '') if 'votes' in self: self['votes'] = self['votes'].replace(',', '')
if 'cast' in self: if 'cast' in self:
if isinstance(self['cast'][0], basestring): if isinstance(self['cast'][0], string_types):
self['cast'] = [self['cast']] self['cast'] = [self['cast']]
self['actor'] = [c[0] for c in self['cast']] self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c): def cleanup_character(c):
@ -503,10 +507,12 @@ class Imdb(SiteParser):
if 'connections' in self: if 'connections' in self:
cc={} cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], basestring): if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']] self['connections'] = [self['connections']]
for rel, data, _ in self['connections']: for rel, data, _ in self['connections']:
#cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data) if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
def get_conn(c): def get_conn(c):
r = { r = {
'id': c[0], 'id': c[0],
@ -516,14 +522,14 @@ class Imdb(SiteParser):
if len(description) == 2 and description[-1].strip() != '-': if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip() r['description'] = description[-1].strip()
return r return r
cc[unicode(rel)] = map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)) cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))
self['connections'] = cc self['connections'] = cc
for key in ('country', 'genre'): for key in ('country', 'genre'):
if key in self: if key in self:
self[key] = filter(lambda x: x.lower() != 'home', self[key]) self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999 #0092999
if '_director' in self: if '_director' in self:
if 'series' in self or 'isSeries' in self: if 'series' in self or 'isSeries' in self:
@ -590,8 +596,8 @@ class Imdb(SiteParser):
if key in self: if key in self:
if isinstance(self[key][0], list): if isinstance(self[key][0], list):
self[key] = [i[0] for i in self[key] if i] self[key] = [i[0] for i in self[key] if i]
self[key] = sorted(list(set(self[key])), self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))
lambda a, b: self[key].index(a) - self[key].index(b))
if 'budget' in self and 'gross' in self: if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget'] self['profit'] = self['gross'] - self['budget']
@ -655,7 +661,7 @@ def get_movie_by_title(title, timeout=-1):
u'0866567' u'0866567'
''' '''
params = {'s':'tt','q': title} params = {'s':'tt','q': title}
if isinstance(title, unicode): if not isinstance(title, bytes):
try: try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1') params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except: except:
@ -731,7 +737,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
if year: if year:
params['q'] = u'"%s (%s)" %s' % (title, year, director) params['q'] = u'"%s (%s)" %s' % (title, year, director)
google_query = "site:imdb.com %s" % params['q'] google_query = "site:imdb.com %s" % params['q']
if isinstance(params['q'], unicode): if not isinstance(params['q'], bytes):
try: try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1') params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except: except:
@ -775,7 +781,7 @@ def get_movie_poster(imdbId):
info = ImdbCombined(imdbId) info = ImdbCombined(imdbId)
if 'posterId' in info: if 'posterId' in info:
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId) url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url) data = read_url(url).decode('utf-8', 'ignore')
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"') poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
return poster return poster
elif 'series' in info: elif 'series' in info:
@ -787,11 +793,11 @@ def get_episodes(imdbId, season=None):
url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
if season: if season:
url += '?season=%d' % season url += '?season=%d' % season
data = ox.cache.read_url(url) data = cache.read_url(url)
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data): for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0] episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
else: else:
data = ox.cache.read_url(url) data = cache.read_url(url)
match = re.compile('<strong>Season (\d+)</strong>').findall(data) match = re.compile('<strong>Season (\d+)</strong>').findall(data)
if match: if match:
for season in range(1, int(match[0]) + 1): for season in range(1, int(match[0]) + 1):
@ -800,7 +806,7 @@ def get_episodes(imdbId, season=None):
def max_votes(): def max_votes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc' url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = ox.cache.read_url(url) data = cache.read_url(url)
votes = max([int(v.replace(',', '')) votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)]) for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
return votes return votes
@ -810,6 +816,6 @@ def guess(title, director='', timeout=-1):
if __name__ == "__main__": if __name__ == "__main__":
import json import json
print json.dumps(Imdb('0306414'), indent=2) print(json.dumps(Imdb('0306414'), indent=2))
#print json.dumps(Imdb('0133093'), indent=2) #print json.dumps(Imdb('0133093'), indent=2)

View file

@ -1,5 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re import re
from ox.net import read_url from ox.net import read_url
@ -13,5 +15,5 @@ def get_poster_url(id):
return '' return ''
if __name__ == '__main__': if __name__ == '__main__':
print get_poster_url('0749451') print(get_poster_url('0749451'))

View file

@ -2,22 +2,24 @@
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
import re import re
from six import string_types
from ..cache import read_url from ..cache import read_url
from .. import strip_tags, decode_html from .. import decode_html
from ..utils import datetime from ..utils import datetime
def cleanup(key, data, data_type): def cleanup(key, data, data_type):
if data: if data:
if isinstance(data[0], basestring): if isinstance(data[0], string_types):
#FIXME: some types need strip_tags #FIXME: some types need strip_tags
#data = [strip_tags(decode_html(p)).strip() for p in data] #data = [strip_tags(decode_html(p)).strip() for p in data]
data = [decode_html(p).strip() for p in data] data = [decode_html(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple): elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data] data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1 and not isinstance(data, basestring): while len(data) == 1 and not isinstance(data, string_types):
data = data[0] data = data[0]
if data_type == 'list' and isinstance(data, basestring): if data_type == 'list' and isinstance(data, string_types):
data = [data, ] data = [data, ]
elif data_type != 'list': elif data_type != 'list':
data = '' data = ''
@ -40,7 +42,7 @@ class SiteParser(dict):
for key in self.regex: for key in self.regex:
url = self.get_url(self.regex[key]['page']) url = self.get_url(self.regex[key]['page'])
data = self.read_url(url, timeout) data = self.read_url(url, timeout)
if isinstance(self.regex[key]['re'], basestring): if isinstance(self.regex[key]['re'], string_types):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data) data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type']) data = cleanup(key, data, self.regex[key]['type'])
elif callable(self.regex[key]['re']): elif callable(self.regex[key]['re']):
@ -51,7 +53,7 @@ class SiteParser(dict):
f = r f = r
else: else:
f = re.compile(r, re.DOTALL).findall f = re.compile(r, re.DOTALL).findall
if isinstance(data, basestring): if isinstance(data, string_types):
data = f(data) data = f(data)
else: else:
data = [f(d) for d in data] data = [f(d) for d in data]

View file

@ -1,11 +1,14 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4 # vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function
import re import re
from urllib import urlencode
from six.moves import urllib
from ox.utils import json from ox.utils import json
from ox.cache import read_url from ox.cache import read_url
from ox import find_re, decode_html from ox import find_re
def get_id(url): def get_id(url):
@ -138,11 +141,11 @@ def get_allmovie_id(wikipedia_url):
def find(query, max_results=10): def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json', query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')} 'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query) url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
data = read_url(url) data = read_url(url)
if not data: if not data:
data = read_url(url, timeout=0) data = read_url(url, timeout=0)
result = json.loads(data) result = json.loads(data.decode('utf-8'))
results = [] results = []
if result and 'query' in result: if result and 'query' in result:
for r in result['query']['search']: for r in result['query']['search']:

View file

@ -36,15 +36,16 @@ setup(
download_url="http://code.0x2620.org/python-ox/download", download_url="http://code.0x2620.org/python-ox/download",
license="GPLv3", license="GPLv3",
packages=['ox', 'ox.django', 'ox.django.api', 'ox.torrent', 'ox.web'], packages=['ox', 'ox.django', 'ox.django.api', 'ox.torrent', 'ox.web'],
install_requires=['chardet', 'feedparser'], install_requires=['six', 'chardet', 'feedparser'],
keywords = [ keywords = [
], ],
classifiers = [ classifiers = [
'Operating System :: OS Independent', 'Operating System :: OS Independent',
'Programming Language :: Python', 'Programming Language :: Python',
'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Software Development :: Libraries :: Python Modules',
], ],
) )