use six to support python 2 and 3
parent 1b1dcf1c58
commit d4d09b56b6
28 changed files with 1730 additions and 1678 deletions
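The diffs below repeat a small set of idioms that let one codebase run on Python 2 and 3: stdlib modules that moved are imported through six.moves, byte/text handling goes through six helpers, and Python-2-only syntax (print statements, "except E, e:", unicode, basestring) is replaced. A minimal sketch of the combined idiom, not taken from the diff itself (the fetch() helper and its names are illustrative):

    from __future__ import print_function

    from six.moves import urllib

    def fetch(url, headers=None):
        # urllib2 (Python 2) and urllib.request/urllib.error (Python 3)
        # are both reached through the same six.moves.urllib name
        req = urllib.request.Request(url, headers=headers or {})
        try:
            data = urllib.request.urlopen(req).read()
        except urllib.error.HTTPError as e:  # "except E, e:" is Python-2-only syntax
            print('HTTP error', e.code)
            data = e.read()
        return data
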

@@ -3,28 +3,32 @@
# GPL 2011
__version__ = '2.1.1'

import cache
import js
import jsonc
import net
import srt
import utils
from . import cache
from . import js
from . import jsonc
from . import net
from . import srt
from . import utils

from api import *
from file import *
from form import *
from format import *
from geo import *
from html import *
from .api import *
from .file import *
from .form import *
from .format import *
from .geo import *
from .html import *
#image depends on PIL, not easy enough to install on osx
try:
from image import *
from .image import *
except:
pass
from location import *
from movie import *
from normalize import *
from oembed import *
from text import *
from torrent import *
from fixunicode import *
from .location import *
from .movie import *
from .normalize import *
from .oembed import *
from .text import *
#currently broken in python3
try:
from .torrent import *
except:
pass
from .fixunicode import *

ox/api.py (14 changes)

@@ -3,10 +3,10 @@
# GPL 2011
from __future__ import with_statement

import cookielib
from six.moves import http_cookiejar as cookielib
import gzip
import StringIO
import urllib2
from six import StringIO
from six.moves import urllib
from types import MethodType

from . import __version__

@@ -29,8 +29,8 @@ class API(object):
self._cj = cj
else:
self._cj = cookielib.CookieJar()
self._opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self._cj),
urllib2.HTTPHandler(debuglevel=self.debuglevel))
self._opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self._cj),
urllib.request.HTTPHandler(debuglevel=self.debuglevel))
self._opener.addheaders = [
('User-Agent', '%s/%s' % (self.__name__, self.__version__))
]
@@ -64,7 +64,7 @@ class API(object):
result = {}
try:
body = str(form)
request = urllib2.Request(str(url))
request = urllib.request.Request(str(url))
request.add_header('Content-type', form.get_content_type())
request.add_header('Content-Length', str(len(body)))
request.add_header('Accept-Encoding', 'gzip, deflate')
@@ -75,7 +75,7 @@ class API(object):
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
result = result.decode('utf-8')
return json.loads(result)
except urllib2.HTTPError, e:
except urllib.error.HTTPError as e:
if self.DEBUG:
import webbrowser
if e.code >= 500:
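
One context line above still wraps the gzip body in StringIO.StringIO even though the module-level import was swapped for six's StringIO; elsewhere in this commit the same decompression is done with BytesIO, which is what a bytes response body needs on Python 3. A small sketch of that idiom (function name illustrative):

    import gzip

    from six import BytesIO

    def gunzip(body):
        # body is bytes on both Python versions; BytesIO wraps it for GzipFile
        return gzip.GzipFile(fileobj=BytesIO(body)).read()
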

ox/cache.py (59 changes)

@@ -1,24 +1,22 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2011
from __future__ import with_statement
from __future__ import with_statement, print_function

import gzip
import zlib
import hashlib
import os
import StringIO
from six import BytesIO
import time
import urlparse
import urllib2
from six.moves import urllib
import sqlite3

import chardet
from utils import json
from .utils import json
from .file import makedirs

import net
from net import DEFAULT_HEADERS, detect_encoding
from . import net
from .net import DEFAULT_HEADERS, detect_encoding

cache_timeout = 30*24*60*60 # default is 30 days

@@ -69,7 +67,7 @@ class InvalidResult(Exception):
self.headers = headers

def _fix_unicode_url(url):
if isinstance(url, unicode):
if not isinstance(url, bytes):
url = url.encode('utf-8')
return url

@@ -83,24 +81,30 @@ def read_url(url, data=None, headers=DEFAULT_HEADERS, timeout=cache_timeout, val
if this function fails, InvalidResult will be raised deal with it in your code
'''
if net.DEBUG:
print 'ox.cache.read_url', url
print('ox.cache.read_url', url)
#FIXME: send last-modified / etag from cache and only update if needed
url = _fix_unicode_url(url)
#url = _fix_unicode_url(url)
result = store.get(url, data, headers, timeout)
url_headers = {}
if not result:
try:
url_headers, result = net.read_url(url, data, headers, return_headers=True)
except urllib2.HTTPError, e:
except urllib.error.HTTPError as e:
e.headers['Status'] = "%s" % e.code
url_headers = dict(e.headers)
for key in e.headers:
url_headers[key.lower()] = e.headers[key]
result = e.read()
if url_headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
if not valid or valid(result, url_headers):
store.set(url, post_data=data, data=result, headers=url_headers)
else:
raise InvalidResult(result, url_headers)
if unicode:
ctype = url_headers.get('content-type', '').lower()
if 'charset' in ctype:
encoding = ctype.split('charset=')[-1]
else:
encoding = detect_encoding(result)
if not encoding:
encoding = 'latin-1'
@@ -143,9 +147,8 @@ class SQLiteCache(Cache):
self.create()

def connect(self):
conn = sqlite3.connect(self.db, timeout=10)
conn.text_factory = str
return conn
self.conn = sqlite3.connect(self.db, timeout=10)
return self.conn

def create(self):
conn = self.connect()
@@ -177,9 +180,9 @@ class SQLiteCache(Cache):
if timeout == 0:
return r
if data:
url_hash = hashlib.sha1(url + '?' + data).hexdigest()
url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()

conn = self.connect()
c = conn.cursor()
@@ -210,11 +213,11 @@ class SQLiteCache(Cache):

def set(self, url, post_data, data, headers):
if post_data:
url_hash = hashlib.sha1(url + '?' + post_data).hexdigest()
url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()

domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])

conn = self.connect()
c = conn.cursor()
@@ -266,11 +269,11 @@ class FileCache(Cache):
return r

if data:
url_hash = hashlib.sha1(url + '?' + data).hexdigest()
url_hash = hashlib.sha1((url + '?' + data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()

domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
prefix, i, f = self.files(domain, url_hash)
if os.path.exists(i):
with open(i) as _i:
@@ -295,11 +298,11 @@ class FileCache(Cache):

def set(self, url, post_data, data, headers):
if post_data:
url_hash = hashlib.sha1(url + '?' + post_data).hexdigest()
url_hash = hashlib.sha1((url + '?' + post_data).encode('utf-8')).hexdigest()
else:
url_hash = hashlib.sha1(url).hexdigest()
url_hash = hashlib.sha1(url.encode('utf-8')).hexdigest()

domain = ".".join(urlparse.urlparse(url)[1].split('.')[-2:])
domain = ".".join(urllib.parse.urlparse(url)[1].split('.')[-2:])
prefix, i, f = self.files(domain, url_hash)
makedirs(prefix)
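
The cache keys above are SHA-1 digests of the URL plus optional POST data; hashlib.sha1() only accepts bytes on Python 3, which is why every call gains an .encode('utf-8'). The shared pattern in isolation, as a sketch (helper name illustrative):

    import hashlib

    def cache_key(url, post_data=None):
        key = (url + '?' + post_data) if post_data else url
        # hashlib rejects str on Python 3, so encode before hashing
        return hashlib.sha1(key.encode('utf-8')).hexdigest()
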

ox/file.py (18 changes)

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import division, with_statement
from __future__ import division, with_statement, print_function
import os
import hashlib
import re
@@ -10,7 +10,7 @@ import struct
import subprocess
import sqlite3

from ox.utils import json
from .utils import json

__all__ = ['sha1sum', 'oshash', 'avinfo', 'makedirs']

@@ -283,19 +283,19 @@ def makedirs(path):
if not os.path.exists(path):
try:
os.makedirs(path)
except OSError, e:
except OSError as e:
if e.errno != 17:
raise

def copy_file(source, target, verbose=False):
if verbose:
print 'copying', source, 'to', target
print('copying', source, 'to', target)
write_path(target)
shutil.copyfile(source, target)

def read_file(file, verbose=False):
if verbose:
print 'reading', file
print('reading', file)
f = open(file)
data = f.read()
f.close()
@@ -303,14 +303,14 @@ def read_file(file, verbose=False):

def read_json(file, verbose=False):
if verbose:
print 'reading', file
print('reading', file)
with open(file) as fd:
data = json.load(fd)
return data

def write_file(file, data, verbose=False):
if verbose:
print 'writing', file
print('writing', file)
write_path(file)
f = open(file, 'w')
f.write(data)
@@ -319,7 +319,7 @@ def write_file(file, data, verbose=False):

def write_image(file, image, verbose=False):
if verbose:
print 'writing', file
print('writing', file)
write_path(file)
image.save(file)

@@ -329,7 +329,7 @@ def write_json(file, data, ensure_ascii=True, indent=0, sort_keys=False, verbose

def write_link(source, target, verbose=False):
if verbose:
print 'linking', source, 'to', target
print('linking', source, 'to', target)
write_path(target)
if os.path.exists(target):
os.unlink(target)
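
makedirs() above keeps the numeric check "e.errno != 17" after switching to "except OSError as e". The same guard can also be written against the named constant, which reads identically on both Python versions; a sketch, not part of the commit:

    import errno
    import os

    def makedirs(path):
        try:
            os.makedirs(path)
        except OSError as e:
            # only swallow "directory already exists" (errno 17 == errno.EEXIST)
            if e.errno != errno.EEXIST:
                raise
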

@@ -2,13 +2,16 @@
# -*- coding: utf-8 -*-
# from http://blog.lumino.so/2012/08/20/fix-unicode-mistakes-with-python/
# MIT
from __future__ import print_function

import unicodedata

from six import unichr

__all__ = ['fix_bad_unicode']

def fix_bad_unicode(text):
u"""
"""
Something you will find all over the place, in real-world text, is text
that's mistakenly encoded as utf-8, decoded in some ugly format like
latin-1 or even Windows codepage 1252, and encoded as utf-8 again.
@@ -26,52 +29,53 @@ def fix_bad_unicode(text):
auto-decode bytes for you -- then it would just create the problems it's
supposed to fix.

>>> print fix_bad_unicode(u'único')
único
>>> fix_bad_unicode(u'único')
'único'

>>> fix_bad_unicode('This text is fine already :þ')
'This text is fine already :þ'

>>> print fix_bad_unicode(u'This text is fine already :þ')
This text is fine already :þ

Because these characters often come from Microsoft products, we allow
for the possibility that we get not just Unicode characters 128-255, but
also Windows's conflicting idea of what characters 128-160 are.

>>> print fix_bad_unicode(u'This — should be an em dash')
This — should be an em dash
>>> fix_bad_unicode('This — should be an em dash')
'This — should be an em dash'

We might have to deal with both Windows characters and raw control
characters at the same time, especially when dealing with characters like
\x81 that have no mapping in Windows.

>>> print fix_bad_unicode(u'This text is sad .â\x81”.')
This text is sad .⁔.
>>> fix_bad_unicode('This text is sad .â\x81”.')
'This text is sad .⁔.'

This function even fixes multiple levels of badness:

>>> wtf = u'\xc3\xa0\xc2\xb2\xc2\xa0_\xc3\xa0\xc2\xb2\xc2\xa0'
>>> print fix_bad_unicode(wtf)
ಠ_ಠ
>>> wtf = '\xc3\xa0\xc2\xb2\xc2\xa0_\xc3\xa0\xc2\xb2\xc2\xa0'
>>> fix_bad_unicode(wtf)
'ಠ_ಠ'

However, it has safeguards against fixing sequences of letters and
punctuation that can occur in valid text:

>>> print fix_bad_unicode(u'not such a fan of Charlotte Brontë…”')
not such a fan of Charlotte Brontë…”
>>> fix_bad_unicode('not such a fan of Charlotte Brontë…”')
'not such a fan of Charlotte Brontë…”'

Cases of genuine ambiguity can sometimes be addressed by finding other
characters that are not double-encoding, and expecting the encoding to
be consistent:

>>> print fix_bad_unicode(u'AHÅ™, the new sofa from IKEA®')
AHÅ™, the new sofa from IKEA®
>>> fix_bad_unicode('AHÅ™, the new sofa from IKEA®')
'AHÅ™, the new sofa from IKEA®'

Finally, we handle the case where the text is in a single-byte encoding
that was intended as Windows-1252 all along but read as Latin-1:

>>> print fix_bad_unicode(u'This text was never Unicode at all\x85')
This text was never Unicode at all…
>>> fix_bad_unicode('This text was never Unicode at all\x85')
'This text was never Unicode at all…'
"""
if not isinstance(text, unicode):
if not isinstance(text, str):
raise TypeError("This isn't even decoded into Unicode yet. "
"Decode it first.")
if len(text) == 0:
@@ -118,7 +122,7 @@ def reinterpret_windows1252_as_utf8(wrongtext):
altered_bytes.append(char.encode('WINDOWS_1252'))
else:
altered_bytes.append(char.encode('latin-1', 'replace'))
return ''.join(altered_bytes).decode('utf-8', 'replace')
return b''.join(altered_bytes).decode('utf-8', 'replace')


def reinterpret_latin1_as_windows1252(wrongtext):
@@ -130,7 +134,7 @@ def reinterpret_latin1_as_windows1252(wrongtext):


def text_badness(text):
u'''
'''
Look for red flags that text is encoded incorrectly:

Obvious problems:

@@ -147,12 +151,12 @@ def text_badness(text):
- Improbable single-byte characters, such as ƒ or ¬
- Letters in somewhat rare scripts
'''
assert isinstance(text, unicode)
assert isinstance(text, str)
errors = 0
very_weird_things = 0
weird_things = 0
prev_letter_script = None
for pos in xrange(len(text)):
for pos in range(len(text)):
char = text[pos]
index = ord(char)
if index < 256:
@@ -241,7 +245,7 @@ WINDOWS_1252_GREMLINS = [
]

# a list of Unicode characters that might appear in Windows-1252 text
WINDOWS_1252_CODEPOINTS = range(256) + WINDOWS_1252_GREMLINS
WINDOWS_1252_CODEPOINTS = list(range(256)) + WINDOWS_1252_GREMLINS

# Rank the characters typically represented by a single byte -- that is, in
# Latin-1 or Windows-1252 -- by how weird it would be to see them in running
@@ -286,7 +290,7 @@ SINGLE_BYTE_WEIRDNESS = (
# letters. We'll need it often.
SINGLE_BYTE_LETTERS = [
unicodedata.category(unichr(i)).startswith('L')
for i in xrange(256)
for i in range(256)
]

# A table telling us how to interpret the first word of a letter's Unicode

ox/form.py (21 changes)

@@ -1,17 +1,34 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2014
from __future__ import with_statement, print_function

import itertools
import mimetools
import mimetypes
import random
import sys


__all__ = ['MultiPartForm']

# from /usr/lib/python3.4/email/generator.py
# Helper used by Generator._make_boundary
_width = len(repr(sys.maxsize-1))
_fmt = '%%0%dd' % _width

def _make_boundary():
# Craft a random boundary.
token = random.randrange(sys.maxsize)
boundary = ('=' * 15) + (_fmt % token) + '=='
return boundary

class MultiPartForm(object):
"""Accumulate the data to be used when posting a form."""

def __init__(self):
self.form_fields = []
self.files = []
self.boundary = mimetools.choose_boundary()
self.boundary = _make_boundary()
return

def get_content_type(self):
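
mimetools is gone in Python 3, so the form module above now crafts its own MIME boundary the way email.generator does. A usage sketch of how such a boundary ends up in a request header (names and the header value are illustrative, not from the diff):

    import random
    import sys

    _fmt = '%%0%dd' % len(repr(sys.maxsize - 1))

    def make_boundary():
        # same recipe as email.generator: 15 '=' signs, a zero-padded
        # random number, and a trailing '=='
        return ('=' * 15) + (_fmt % random.randrange(sys.maxsize)) + '=='

    boundary = make_boundary()
    content_type = 'multipart/form-data; boundary=%s' % boundary
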

@@ -20,7 +20,7 @@ def toAZ(num):
>>> toAZ(1234567890)
'CYWOQVJ'
"""
if num < 1: raise ValueError, "must supply a positive integer"
if num < 1: raise ValueError("must supply a positive integer")
digits = string.ascii_uppercase
az = ''
while num != 0:
@@ -62,7 +62,7 @@ def to26(q):
>>> to26(347485647)
'BDGKMAP'
"""
if q < 0: raise ValueError, "must supply a positive integer"
if q < 0: raise ValueError("must supply a positive integer")
base26 = string.ascii_uppercase
converted = []
while q != 0:
@@ -119,7 +119,7 @@ def to32(q):
ValueError: must supply a positive integer
"""

if q < 0: raise ValueError, "must supply a positive integer"
if q < 0: raise ValueError("must supply a positive integer")
letters = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
converted = []
while q != 0:
@@ -206,7 +206,7 @@ def to36(q):
...
ValueError: must supply a positive integer
"""
if q < 0: raise ValueError, "must supply a positive integer"
if q < 0: raise ValueError("must supply a positive integer")
letters = "0123456789abcdefghijklmnopqrstuvwxyz"
converted = []
while q != 0:

ox/html.py (12 changes)

@@ -1,9 +1,11 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
import sys
import re
import string
from htmlentitydefs import name2codepoint
from six.moves.html_entities import name2codepoint
from six import unichr


# Configuration for add_links() function
@@ -23,6 +25,7 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
if sys.version[0] == 2:
del x # Temporary variable

def escape(html):
@@ -146,12 +149,9 @@ def decode_html(html):
>>> decode_html('Anniversary of Daoud&#39;s Republic')
u"Anniversary of Daoud's Republic"
"""
if type(html) != unicode:
html = unicode(html)[:]
if type(html) is unicode:
if isinstance(html, bytes):
html = html.decode('utf-8')
uchr = unichr
else:
uchr = lambda value: value > 255 and unichr(value) or chr(value)
def entitydecode(match, uchr=uchr):
entity = match.group(1)
if entity == '#x80':
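
htmlentitydefs was renamed html.entities in Python 3; six.moves.html_entities papers over the difference, and six.unichr stands in for the unichr builtin that Python 3 dropped. A small sketch of the lookup the entity decoder relies on (function name illustrative):

    from six import unichr
    from six.moves.html_entities import name2codepoint

    def entity_to_char(name):
        # name2codepoint maps 'amp' -> 38, unichr turns that into '&'
        return unichr(name2codepoint[name])
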

@@ -1,10 +1,10 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement
from __future__ import with_statement, print_function

from js import minify
from utils import json
from .js import minify
from .utils import json


def load(f):
@@ -14,7 +14,7 @@ def loads(source):
try:
minified = minify(source)
return json.loads(minified)
except json.JSONDecodeError, e:
except json.JSONDecodeError as e:
s = minified.split('\n')
context = s[e.lineno-1][max(0, e.colno-1):e.colno+30]
msg = e.msg + ' at ' + context

@@ -9,9 +9,9 @@ import os
import re
import unicodedata

from normalize import normalize_name
from text import get_sort_name, find_re
from file import EXTENSIONS
from .normalize import normalize_name
from .text import get_sort_name, find_re
from .file import EXTENSIONS

__all__ = ['parse_movie_path', 'create_movie_path', 'get_oxid']

ox/net.py (51 changes)

@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
# GPL 2008
from __future__ import with_statement, print_function
import os
import gzip
import re
import StringIO
from six import BytesIO
import struct
import urllib
import urllib2
from six.moves import urllib

from chardet.universaldetector import UniversalDetector

@@ -26,7 +26,7 @@ def status(url, data=None, headers=DEFAULT_HEADERS):
try:
f = open_url(url, data, headers)
s = f.code
except urllib2.HTTPError, e:
except urllib.error.HTTPError as e:
s = e.code
return s

@@ -42,46 +42,59 @@ def get_headers(url, data=None, headers=DEFAULT_HEADERS):
f.headers['Status'] = "%s" % f.code
headers = f.headers
f.close()
except urllib2.HTTPError, e:
except urllib.error.HTTPError as e:
e.headers['Status'] = "%s" % e.code
headers = e.headers
return dict(headers)

def open_url(url, data=None, headers=DEFAULT_HEADERS):
if isinstance(url, bytes):
url = url.decode('utf-8')
url = url.replace(' ', '%20')
req = urllib2.Request(url, data, headers)
return urllib2.urlopen(req)
req = urllib.request.Request(url, data, headers)
return urllib.request.urlopen(req)

def read_url(url, data=None, headers=DEFAULT_HEADERS, return_headers=False, unicode=False):
if DEBUG:
print 'ox.net.read_url', url
print('ox.net.read_url', url)
f = open_url(url, data, headers)
result = f.read()
f.close()
if f.headers.get('content-encoding', None) == 'gzip':
result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read()
result = gzip.GzipFile(fileobj=BytesIO(result)).read()
if unicode:
ctype = f.headers.get('content-type', '').lower()
if 'charset' in ctype:
encoding = ctype.split('charset=')[-1]
else:
encoding = detect_encoding(result)
if not encoding:
encoding = 'latin-1'
result = result.decode(encoding)
if return_headers:
f.headers['Status'] = "%s" % f.code
return dict(f.headers), result
headers = {}
for key in f.headers:
headers[key.lower()] = f.headers[key]
return headers, result
return result

def detect_encoding(data):
data_lower = data.lower()
charset = re.compile('content="text/html; charset=(.*?)"').findall(data)
data_lower = data.lower().decode('utf-8', 'ignore')
charset = re.compile('content="text/html; charset=(.*?)"').findall(data_lower)
if not charset:
charset = re.compile('meta charset="(.*?)"').findall(data)
charset = re.compile('meta charset="(.*?)"').findall(data_lower)
if charset:
return charset[0].lower()
detector = UniversalDetector()
for line in data.split('\n'):
detector.feed(line)
p = 0
l = len(data)
s = 1024
while p < l:
detector.feed(data[p:p+s])
if detector.done:
break
p += s
detector.close()
return detector.result['encoding']

@@ -97,9 +110,9 @@ def save_url(url, filename, overwrite=False):

def oshash(url):
def get_size(url):
req = urllib2.Request(url, headers=DEFAULT_HEADERS.copy())
req = urllib.request.Request(url, headers=DEFAULT_HEADERS.copy())
req.get_method = lambda : 'HEAD'
u = urllib2.urlopen(req)
u = urllib.request.urlopen(req)
if u.code != 200 or not 'Content-Length' in u.headers:
raise IOError
return int(u.headers['Content-Length'])
@@ -107,8 +120,8 @@ def oshash(url):
def get_range(url, start, end):
headers = DEFAULT_HEADERS.copy()
headers['Range'] = 'bytes=%s-%s' % (start, end)
req = urllib2.Request(url, headers=headers)
u = urllib2.urlopen(req)
req = urllib.request.Request(url, headers=headers)
u = urllib.request.urlopen(req)
return u.read()

try:
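
detect_encoding() now feeds chardet's UniversalDetector fixed-size slices of the raw bytes instead of splitting them on newlines, which avoids treating undecoded bytes as text. The loop reduced to a sketch (parameter name illustrative):

    from chardet.universaldetector import UniversalDetector

    def detect_encoding(data, chunk_size=1024):
        detector = UniversalDetector()
        for pos in range(0, len(data), chunk_size):
            detector.feed(data[pos:pos + chunk_size])
            if detector.done:  # stop as soon as chardet is confident
                break
        detector.close()
        return detector.result['encoding']
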

@@ -1,9 +1,10 @@
# -*- coding: utf-8 -*-
# ci:si:et:sw=4:sts=4:ts=4
import re
from text import find_re
import cache
from utils import json, ET

from . import cache
from .text import find_re
from .utils import json, ET

def get_embed_code(url, maxwidth=None, maxheight=None):
embed = {}

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import with_statement, division
from __future__ import with_statement, division, print_function
import chardet
import re
import codecs
@@ -71,7 +71,7 @@ def load(filename, offset=0):
try:
data = unicode(data, 'latin-1')
except:
print "failed to detect encoding, giving up"
print("failed to detect encoding, giving up")
return srt

data = data.replace('\r\n', '\n')

@@ -6,7 +6,7 @@ from threading import Event
from hashlib import sha1
import os

from bencode import bencode, bdecode
from .bencode import bencode, bdecode

__all__ = ['create_torrent', 'get_info_hash', 'get_torrent_info', 'get_files', 'get_torrent_size']

@@ -24,9 +24,8 @@ def get_info_hash(torrentFile):
return sha1(bencode(info)).hexdigest()

def get_torrent_info(data=None, file=None):
from bencode import bencode
if file:
if isinstance(file, unicode):
if not isinstance(file, bytes):
file = file.encode('utf-8')
with open(file, 'rb') as f:
data = f.read()
@@ -36,7 +35,7 @@ def get_torrent_info(data=None, file=None):
metainfo = bdecode(data)
info = metainfo['info']
piece_length = info['piece length']
if info.has_key('length'):
if 'length' in info:
# let's assume we just have one file
file_length = info['length']
else:

@@ -2,8 +2,8 @@
# encoding: utf-8
__version__ = '1.0.0'

import imdb
import wikipedia
import google
import piratecinema
import oxdb
from . import imdb
from . import wikipedia
from . import google
from . import piratecinema
from . import oxdb

@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import time

from ox import strip_tags, find_re
from ox.cache import read_url

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import quote
from six.moves.urllib.parse import quote

from ox import find_re, strip_tags, decode_html
from ox.cache import read_url

@@ -1,14 +1,11 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from datetime import datetime
from urllib import urlencode
import json
import os
import re

from ox import find_re, strip_tags, decode_html
from ox import find_re, strip_tags
from ox.cache import read_url
from ox.net import open_url

def get_data(id, language='en'):
if language == 'de':
@@ -57,7 +54,7 @@ def backup(filename):
data = json.load(f)
else:
data = {}
start = ids and max(map(int, data)) or 1
start = max(map(int, data)) or 1
for i in range(start, 11872):
info = get_data(i)
if info:

@@ -5,7 +5,7 @@ import re
import ox.cache
from ox.cache import read_url
from ox.html import strip_tags
from ox.text import find_re, remove_special_characters
from ox.text import find_re

import imdb

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
from urllib import unquote
from six.moves.urllib.parse import unquote
from ox.cache import read_url

@@ -1,17 +1,17 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import urllib

from six.moves import urllib
import ox
from ox import strip_tags, decode_html
from ox.utils import json
from ox.cache import read_url


def find(query, timeout=ox.cache.cache_timeout):
if isinstance(query, unicode):
if not isinstance(query, bytes):
query = query.encode('utf-8')
params = urllib.urlencode({'q': query})
params = urllib.parse.urlencode({'q': query})
url = 'http://duckduckgo.com/html/?' + params
data = read_url(url, timeout=timeout).decode('utf-8')
results = []

@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import re
import urllib
from six.moves import urllib

import ox
from ox import strip_tags, decode_html
@@ -13,9 +13,9 @@ def read_url(url, data=None, headers=ox.net.DEFAULT_HEADERS, timeout=DEFAULT_TIM
return ox.cache.read_url(url, data, headers, timeout, unicode=True)

def quote_plus(s):
if not isinstance(s, str):
if not isinstance(s, bytes):
s = s.encode('utf-8')
return urllib.quote_plus(s)
return urllib.parse.quote_plus(s)

def find(query, max_results=DEFAULT_MAX_RESULTS, timeout=DEFAULT_TIMEOUT):
"""

@@ -1,23 +1,27 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
import urllib
from __future__ import print_function

import re
import time
import unicodedata

import ox
from ox import find_re, strip_tags
import ox.cache
from six.moves import urllib
from six import string_types

from siteparser import SiteParser
import duckduckgo

from .. import find_re, strip_tags, decode_html
from .. import cache


from . siteparser import SiteParser
from . import duckduckgo
from ..utils import datetime
from ..geo import normalize_country_name

def read_url(url, data=None, headers=ox.cache.DEFAULT_HEADERS, timeout=ox.cache.cache_timeout, valid=None, unicode=False):
def read_url(url, data=None, headers=cache.DEFAULT_HEADERS, timeout=cache.cache_timeout, valid=None, unicode=False):
headers = headers.copy()
return ox.cache.read_url(url, data, headers, timeout, unicode=unicode)
return cache.read_url(url, data, headers, timeout, unicode=unicode)

def get_url(id):
return "http://www.imdb.com/title/tt%s/" % id
@@ -49,7 +53,7 @@ class Imdb(SiteParser):
'page': 'business',
're': [
'<h5>Budget</h5>\s*?\$(.*?)<br',
lambda data: find_re(ox.decode_html(data).replace(',', ''), '\d+')
lambda data: find_re(decode_html(data).replace(',', ''), '\d+')
],
'type': 'int'
},
@@ -211,7 +215,7 @@ class Imdb(SiteParser):
'page': 'releaseinfo',
're': [
'<td class="release_date">(.*?)</td>',
ox.strip_tags,
strip_tags,
],
'type': 'list'
},
@@ -326,7 +330,7 @@ class Imdb(SiteParser):

if 'alternativeTitles' in self:
if len(self['alternativeTitles']) == 2 and \
isinstance(self['alternativeTitles'][0], basestring):
isinstance(self['alternativeTitles'][0], string_types):
self['alternativeTitles'] = [self['alternativeTitles']]

#normalize country names
@@ -472,7 +476,7 @@ class Imdb(SiteParser):
if c:
alt[title].append(c)
self['alternativeTitles'] = []
for t in sorted(alt, lambda a, b: cmp(sorted(alt[a]), sorted(alt[b]))):
for t in sorted(alt, key=lambda a: sorted(alt[a])):
if alt[t]:
countries = sorted([normalize_country_name(c) or c for c in alt[t]])
self['alternativeTitles'].append((t, countries))
@@ -492,7 +496,7 @@ class Imdb(SiteParser):
if 'votes' in self: self['votes'] = self['votes'].replace(',', '')

if 'cast' in self:
if isinstance(self['cast'][0], basestring):
if isinstance(self['cast'][0], string_types):
self['cast'] = [self['cast']]
self['actor'] = [c[0] for c in self['cast']]
def cleanup_character(c):
@@ -503,10 +507,12 @@ class Imdb(SiteParser):

if 'connections' in self:
cc={}
if len(self['connections']) == 3 and isinstance(self['connections'][0], basestring):
if len(self['connections']) == 3 and isinstance(self['connections'][0], string_types):
self['connections'] = [self['connections']]
for rel, data, _ in self['connections']:
#cc[unicode(rel)] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
if isinstance(rel, bytes):
rel = rel.decode('utf-8')
#cc[rel] = re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>').findall(data)
def get_conn(c):
r = {
'id': c[0],
@@ -516,14 +522,14 @@ class Imdb(SiteParser):
if len(description) == 2 and description[-1].strip() != '-':
r['description'] = description[-1].strip()
return r
cc[unicode(rel)] = map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data))
cc[rel] = list(map(get_conn, re.compile('<a href="/title/tt(\d{7})/">(.*?)</a>(.*?)<\/div', re.DOTALL).findall(data)))


self['connections'] = cc

for key in ('country', 'genre'):
if key in self:
self[key] = filter(lambda x: x.lower() != 'home', self[key])
self[key] = list(filter(lambda x: x.lower() != 'home', self[key]))
#0092999
if '_director' in self:
if 'series' in self or 'isSeries' in self:
@@ -590,8 +596,8 @@ class Imdb(SiteParser):
if key in self:
if isinstance(self[key][0], list):
self[key] = [i[0] for i in self[key] if i]
self[key] = sorted(list(set(self[key])),
lambda a, b: self[key].index(a) - self[key].index(b))
self[key] = sorted(list(set(self[key])), key=lambda a: self[key].index(a))


if 'budget' in self and 'gross' in self:
self['profit'] = self['gross'] - self['budget']
@@ -655,7 +661,7 @@ def get_movie_by_title(title, timeout=-1):
u'0866567'
'''
params = {'s':'tt','q': title}
if isinstance(title, unicode):
if not isinstance(title, bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
@@ -731,7 +737,7 @@ def get_movie_id(title, director='', year='', timeout=-1):
if year:
params['q'] = u'"%s (%s)" %s' % (title, year, director)
google_query = "site:imdb.com %s" % params['q']
if isinstance(params['q'], unicode):
if not isinstance(params['q'], bytes):
try:
params['q'] = unicodedata.normalize('NFKC', params['q']).encode('latin-1')
except:
@@ -775,7 +781,7 @@ def get_movie_poster(imdbId):
info = ImdbCombined(imdbId)
if 'posterId' in info:
url = "http://www.imdb.com/media/rm%s/tt%s" % (info['posterId'], imdbId)
data = read_url(url)
data = read_url(url).decode('utf-8', 'ignore')
poster = find_re(data, 'img.*?id="primary-img".*?src="(.*?)"')
return poster
elif 'series' in info:
@@ -787,11 +793,11 @@ def get_episodes(imdbId, season=None):
url = 'http://www.imdb.com/title/tt%s/episodes' % imdbId
if season:
url += '?season=%d' % season
data = ox.cache.read_url(url)
data = cache.read_url(url)
for e in re.compile('<div data-const="tt(\d{7})".*?>.*?<div>S(\d+), Ep(\d+)<\/div>\n<\/div>', re.DOTALL).findall(data):
episodes['S%02dE%02d' %(int(e[1]), int(e[2]))] = e[0]
else:
data = ox.cache.read_url(url)
data = cache.read_url(url)
match = re.compile('<strong>Season (\d+)</strong>').findall(data)
if match:
for season in range(1, int(match[0]) + 1):
@@ -800,7 +806,7 @@ def get_episodes(imdbId, season=None):

def max_votes():
url = 'http://www.imdb.com/search/title?num_votes=500000,&sort=num_votes,desc'
data = ox.cache.read_url(url)
data = cache.read_url(url)
votes = max([int(v.replace(',', ''))
for v in re.compile('<td class="sort_col">([\d,]+)</td>').findall(data)])
return votes
@@ -810,6 +816,6 @@ def guess(title, director='', timeout=-1):

if __name__ == "__main__":
import json
print json.dumps(Imdb('0306414'), indent=2)
print(json.dumps(Imdb('0306414'), indent=2))
#print json.dumps(Imdb('0133093'), indent=2)
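
Two sorts in the hunks above drop comparison functions, since cmp() and the cmp= argument no longer exist on Python 3, and express the same ordering with key functions. The transformation in isolation (data values are illustrative):

    # Python 2 only:  sorted(alt, lambda a, b: cmp(sorted(alt[a]), sorted(alt[b])))
    alt = {'Title A': ['de', 'fr'], 'Title B': ['at']}
    ordered = sorted(alt, key=lambda t: sorted(alt[t]))

    # Python 2 only:  sorted(values, lambda a, b: values.index(a) - values.index(b))
    values = ['b', 'a', 'b', 'c']
    deduped = sorted(set(values), key=lambda v: values.index(v))  # keeps first-seen order
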

@@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function

import re
from ox.net import read_url

@@ -13,5 +15,5 @@ def get_poster_url(id):
return ''

if __name__ == '__main__':
print get_poster_url('0749451')
print(get_poster_url('0749451'))

@@ -2,22 +2,24 @@
# vi:si:et:sw=4:sts=4:ts=4
import re

from six import string_types

from ..cache import read_url
from .. import strip_tags, decode_html
from .. import decode_html
from ..utils import datetime


def cleanup(key, data, data_type):
if data:
if isinstance(data[0], basestring):
if isinstance(data[0], string_types):
#FIXME: some types need strip_tags
#data = [strip_tags(decode_html(p)).strip() for p in data]
data = [decode_html(p).strip() for p in data]
elif isinstance(data[0], list) or isinstance(data[0], tuple):
data = [cleanup(key, p, data_type) for p in data]
while len(data) == 1 and not isinstance(data, basestring):
while len(data) == 1 and not isinstance(data, string_types):
data = data[0]
if data_type == 'list' and isinstance(data, basestring):
if data_type == 'list' and isinstance(data, string_types):
data = [data, ]
elif data_type != 'list':
data = ''
@@ -40,7 +42,7 @@ class SiteParser(dict):
for key in self.regex:
url = self.get_url(self.regex[key]['page'])
data = self.read_url(url, timeout)
if isinstance(self.regex[key]['re'], basestring):
if isinstance(self.regex[key]['re'], string_types):
data = re.compile(self.regex[key]['re'], re.DOTALL).findall(data)
data = cleanup(key, data, self.regex[key]['type'])
elif callable(self.regex[key]['re']):
@@ -51,7 +53,7 @@ class SiteParser(dict):
f = r
else:
f = re.compile(r, re.DOTALL).findall
if isinstance(data, basestring):
if isinstance(data, string_types):
data = f(data)
else:
data = [f(d) for d in data]
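
basestring does not exist on Python 3; six.string_types is the portable spelling used throughout the parser above. In isolation (function name illustrative):

    from six import string_types

    def is_text(value):
        # True for str and unicode on Python 2, for str on Python 3
        return isinstance(value, string_types)
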

@@ -1,11 +1,14 @@
# -*- coding: utf-8 -*-
# vi:si:et:sw=4:sts=4:ts=4
from __future__ import print_function

import re
from urllib import urlencode

from six.moves import urllib

from ox.utils import json
from ox.cache import read_url
from ox import find_re, decode_html
from ox import find_re


def get_id(url):
@@ -138,11 +141,11 @@ def get_allmovie_id(wikipedia_url):
def find(query, max_results=10):
query = {'action': 'query', 'list':'search', 'format': 'json',
'srlimit': max_results, 'srwhat': 'text', 'srsearch': query.encode('utf-8')}
url = "http://en.wikipedia.org/w/api.php?" + urlencode(query)
url = "http://en.wikipedia.org/w/api.php?" + urllib.parse.urlencode(query)
data = read_url(url)
if not data:
data = read_url(url, timeout=0)
result = json.loads(data)
result = json.loads(data.decode('utf-8'))
results = []
if result and 'query' in result:
for r in result['query']['search']:

setup.py (5 changes)

@@ -36,15 +36,16 @@ setup(
download_url="http://code.0x2620.org/python-ox/download",
license="GPLv3",
packages=['ox', 'ox.django', 'ox.django.api', 'ox.torrent', 'ox.web'],
install_requires=['chardet', 'feedparser'],
install_requires=['six', 'chardet', 'feedparser'],
keywords = [
],
classifiers = [
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Topic :: Software Development :: Libraries :: Python Modules',
],
)