This commit is contained in:
j 2016-01-14 17:09:10 +05:30
parent c49f663d54
commit 2681536b08
3 changed files with 11 additions and 11 deletions

View file

@ -6,7 +6,7 @@ from __future__ import print_function
import unicodedata
from six import unichr, PY3
from six import unichr, PY2
__all__ = ['fix_bad_unicode']
@ -151,10 +151,10 @@ def text_badness(text):
- Improbable single-byte characters, such as ƒ or ¬
- Letters in somewhat rare scripts
'''
if PY3:
assert isinstance(text, str)
else:
if PY2:
assert isinstance(text, unicode)
else:
assert isinstance(text, str)
errors = 0
very_weird_things = 0
weird_things = 0

View file

@ -117,7 +117,7 @@ def clean_html(text):
* Removes stuff like "<p>&nbsp;&nbsp;</p>", but only if it's at the
bottom of the text.
"""
from text import normalize_newlines
from .text import normalize_newlines
text = normalize_newlines(text)
text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text)
text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text)

View file

@ -8,7 +8,7 @@ import os
import re
import struct
from six import BytesIO, PY3
from six import BytesIO, PY2
from six.moves import urllib
from chardet.universaldetector import UniversalDetector
@ -52,14 +52,14 @@ def get_json(url, data=None, headers=DEFAULT_HEADERS):
return json.loads(read_url(url, data, headers).decode('utf-8'))
def open_url(url, data=None, headers=DEFAULT_HEADERS):
if PY3:
if isinstance(url, bytes):
url = url.decode('utf-8')
else:
if PY2:
if not isinstance(url, bytes):
url = url.encode('utf-8')
else:
if isinstance(url, bytes):
url = url.decode('utf-8')
url = url.replace(' ', '%20')
if data and PY3 and not isinstance(data, bytes):
if data and not PY2 and not isinstance(data, bytes):
data = data.encode('utf-8')
req = urllib.request.Request(url, data, headers)
return urllib.request.urlopen(req)