This commit is contained in:
j 2016-01-14 17:09:10 +05:30
parent c49f663d54
commit 2681536b08
3 changed files with 11 additions and 11 deletions

View file

@@ -6,7 +6,7 @@ from __future__ import print_function
import unicodedata import unicodedata
from six import unichr, PY3 from six import unichr, PY2
__all__ = ['fix_bad_unicode'] __all__ = ['fix_bad_unicode']
@@ -151,10 +151,10 @@ def text_badness(text):
- Improbable single-byte characters, such as ƒ or ¬ - Improbable single-byte characters, such as ƒ or ¬
- Letters in somewhat rare scripts - Letters in somewhat rare scripts
''' '''
if PY3: if PY2:
assert isinstance(text, str)
else:
assert isinstance(text, unicode) assert isinstance(text, unicode)
else:
assert isinstance(text, str)
errors = 0 errors = 0
very_weird_things = 0 very_weird_things = 0
weird_things = 0 weird_things = 0

View file

@@ -117,7 +117,7 @@ def clean_html(text):
* Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the * Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
bottom of the text. bottom of the text.
""" """
from text import normalize_newlines from .text import normalize_newlines
text = normalize_newlines(text) text = normalize_newlines(text)
text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text) text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text) text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)

View file

@@ -8,7 +8,7 @@ import os
import re import re
import struct import struct
from six import BytesIO, PY3 from six import BytesIO, PY2
from six.moves import urllib from six.moves import urllib
from chardet.universaldetector import UniversalDetector from chardet.universaldetector import UniversalDetector
@@ -52,14 +52,14 @@ def get_json(url, data=None, headers=DEFAULT_HEADERS):
return json.loads(read_url(url, data, headers).decode('utf-8')) return json.loads(read_url(url, data, headers).decode('utf-8'))
def open_url(url, data=None, headers=DEFAULT_HEADERS): def open_url(url, data=None, headers=DEFAULT_HEADERS):
if PY3: if PY2:
if isinstance(url, bytes):
url = url.decode('utf-8')
else:
if not isinstance(url, bytes): if not isinstance(url, bytes):
url = url.encode('utf-8') url = url.encode('utf-8')
else:
if isinstance(url, bytes):
url = url.decode('utf-8')
url = url.replace(' ', '%20') url = url.replace(' ', '%20')
if data and PY3 and not isinstance(data, bytes): if data and not PY2 and not isinstance(data, bytes):
data = data.encode('utf-8') data = data.encode('utf-8')
req = urllib.request.Request(url, data, headers) req = urllib.request.Request(url, data, headers)
return urllib.request.urlopen(req) return urllib.request.urlopen(req)