2013-10-11 17:28:32 +00:00
|
|
|
# vi:si:et:sw=4:sts=4:ts=4
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# from http://blog.lumino.so/2012/08/20/fix-unicode-mistakes-with-python/
|
|
|
|
# MIT
|
2014-09-30 16:15:32 +00:00
|
|
|
from __future__ import print_function
|
2013-10-11 17:28:32 +00:00
|
|
|
|
|
|
|
import unicodedata
|
|
|
|
|
2018-12-15 00:08:54 +00:00
|
|
|
from six import unichr, text_type
|
2014-09-30 16:15:32 +00:00
|
|
|
|
2013-10-11 17:28:32 +00:00
|
|
|
__all__ = ['fix_bad_unicode']
|
|
|
|
|
|
|
|
def fix_bad_unicode(text):
    """
    Repair text that was decoded with the wrong single-byte encoding.

    Something you will find all over the place, in real-world text, is text
    that's mistakenly encoded as utf-8, decoded in some ugly format like
    latin-1 or even Windows codepage 1252, and encoded as utf-8 again.

    This causes your perfectly good Unicode-aware code to end up with garbage
    text because someone else (or maybe "someone else") made a mistake.

    This function looks for the evidence of that having happened and fixes it.
    It determines whether it should replace nonsense sequences of single-byte
    characters that were really meant to be UTF-8 characters, and if so, turns
    them into the correctly-encoded Unicode character that they were meant to
    represent.

    The input to the function must be Unicode. It's not going to try to
    auto-decode bytes for you -- then it would just create the problems it's
    supposed to fix.

        >>> fix_bad_unicode('Ãºnico')
        'único'

        >>> fix_bad_unicode('This text is fine already :þ')
        'This text is fine already :þ'

    Because these characters often come from Microsoft products, we allow
    for the possibility that we get not just Unicode characters 128-255, but
    also Windows's conflicting idea of what characters 128-160 are.

        >>> fix_bad_unicode('This â€” should be an em dash')
        'This — should be an em dash'

    We might have to deal with both Windows characters and raw control
    characters at the same time, especially when dealing with characters like
    \x81 that have no mapping in Windows.

        >>> fix_bad_unicode('This text is sad .â\x81”.')
        'This text is sad .⁔.'

    This function even fixes multiple levels of badness:

        >>> wtf = '\xc3\xa0\xc2\xb2\xc2\xa0_\xc3\xa0\xc2\xb2\xc2\xa0'
        >>> fix_bad_unicode(wtf)
        'ಠ_ಠ'

    However, it has safeguards against fixing sequences of letters and
    punctuation that can occur in valid text:

        >>> fix_bad_unicode('not such a fan of Charlotte Brontë…”')
        'not such a fan of Charlotte Brontë…”'

    Cases of genuine ambiguity can sometimes be addressed by finding other
    characters that are not double-encoding, and expecting the encoding to
    be consistent:

        >>> fix_bad_unicode('AHÅ™, the new sofa from IKEA®')
        'AHÅ™, the new sofa from IKEA®'

    Finally, we handle the case where the text is in a single-byte encoding
    that was intended as Windows-1252 all along but read as Latin-1:

        >>> fix_bad_unicode('This text was never Unicode at all\x85')
        'This text was never Unicode at all…'

    The argument is the Unicode string to examine. The return value is the
    cheapest interpretation found according to text_cost(); when no
    reinterpretation is strictly cheaper, the input is returned unchanged.
    A TypeError is raised if `text` is a byte string.
    """
    if isinstance(text, bytes):
        raise TypeError("This isn't even decoded into Unicode yet. "
                        "Decode it first.")
    # len() rather than a truthiness test, so that non-string input such as
    # None keeps failing loudly instead of being returned as "fine".
    if len(text) == 0:
        return text

    # Each pass undoes at most one layer of mis-decoding. Keep going until no
    # reinterpretation is strictly cheaper than the text we already have.
    while True:
        maxord = max(ord(char) for char in text)
        if maxord < 128:
            # Hooray! It's ASCII!
            return text

        if maxord < 256:
            candidates = [
                reinterpret_latin1_as_utf8(text),
                reinterpret_latin1_as_windows1252(text),
            ]
        elif all(ord(char) in WINDOWS_1252_CODEPOINTS for char in text):
            candidates = [reinterpret_windows1252_as_utf8(text)]
        else:
            # We can't imagine how this would be anything but valid text.
            return text

        # The unchanged text is listed first: min() returns the earliest of
        # equally cheap entries, so a candidate only wins by being strictly
        # cheaper. That also guarantees the cost drops on every pass.
        best = min([text] + candidates, key=text_cost)
        if best == text:
            return text
        text = best
|
|
|
|
|
|
|
|
|
|
|
|
def reinterpret_latin1_as_utf8(wrongtext):
    """
    Undo one layer of "UTF-8 bytes that were read as Latin-1".

    The text is turned back into bytes using Latin-1 and those bytes are then
    decoded as UTF-8. Both steps use the 'replace' error handler, so neither
    raises: characters outside Latin-1 become '?', and byte sequences that
    are not valid UTF-8 become U+FFFD.
    """
    return wrongtext.encode('latin-1', 'replace').decode('utf-8', 'replace')
|
|
|
|
|
|
|
|
|
|
|
|
def reinterpret_windows1252_as_utf8(wrongtext):
    """
    Undo one layer of "UTF-8 bytes that were read as Windows-1252".

    Characters listed in WINDOWS_1252_GREMLINS are mapped back to their
    Windows-1252 byte. Every other character is encoded as Latin-1, with
    anything unencodable replaced by '?'. The resulting byte string is
    decoded as UTF-8, substituting U+FFFD for invalid sequences.
    """
    def original_byte(char):
        # Gremlins only exist in Windows-1252; the rest is plain Latin-1.
        if ord(char) in WINDOWS_1252_GREMLINS:
            return char.encode('WINDOWS_1252')
        return char.encode('latin-1', 'replace')

    raw_bytes = b''.join(original_byte(char) for char in wrongtext)
    return raw_bytes.decode('utf-8', 'replace')
|
2013-10-11 17:28:32 +00:00
|
|
|
|
|
|
|
|
|
|
|
def reinterpret_latin1_as_windows1252(wrongtext):
    """
    Maybe this was always meant to be in a single-byte encoding, and it
    makes the most sense in Windows-1252.

    The text is encoded strictly as Latin-1 (so a character above U+00FF
    raises UnicodeEncodeError) and the bytes are decoded as Windows-1252.
    Bytes that Windows-1252 leaves undefined come back as U+FFFD.
    """
    single_bytes = wrongtext.encode('latin-1')
    return single_bytes.decode('WINDOWS_1252', 'replace')
|
|
|
|
|
|
|
|
|
|
|
|
def text_badness(text):
    '''
    Look for red flags that text is encoded incorrectly:

    Obvious problems:
    - The replacement character \ufffd, indicating a decoding error
    - Private-use Unicode characters (general category 'Co'). Unassigned
      code points (category 'Cn') are not counted here.

    Very weird things:
    - Adjacent letters from two different scripts
    - Letters in scripts that are very rarely used on computers (and
      therefore, someone who is using them will probably get Unicode right)
    - Improbable control characters, such as 0x81

    Moderately weird things:
    - Improbable single-byte characters, such as ƒ or ¬
    - Letters in somewhat rare scripts

    Characters below U+0100 are scored purely from SINGLE_BYTE_WEIRDNESS.
    The script-based penalties are only applied to a letter at or above
    U+0100 that directly follows another letter.

    Returns 100 * errors + 10 * very_weird_things + weird_things, which may
    be a float because Windows-1252 gremlins get a 0.5 discount.
    '''
    assert isinstance(text, text_type)
    errors = 0
    very_weird_things = 0
    weird_things = 0
    prev_letter_script = None
    for char in text:
        index = ord(char)
        if index < 256:
            # Deal quickly with the first 256 characters.
            weird_things += SINGLE_BYTE_WEIRDNESS[index]
            if SINGLE_BYTE_LETTERS[index]:
                prev_letter_script = 'latin'
            else:
                prev_letter_script = None
            continue

        category = unicodedata.category(char)
        if category == 'Co':
            # Private use
            errors += 1
        elif index == 0xfffd:
            # Replacement character
            errors += 1
        elif index in WINDOWS_1252_GREMLINS:
            # Score the gremlin like the byte it stands for in Windows-1252,
            # minus a small discount.
            lowchar = char.encode('WINDOWS_1252').decode('latin-1')
            weird_things += SINGLE_BYTE_WEIRDNESS[ord(lowchar)] - 0.5

        if not category.startswith('L'):
            prev_letter_script = None
            continue

        # It's a letter. What kind of letter? This is typically found
        # in the first word of the letter's Unicode name.
        #
        # unicodedata.name() raises ValueError for characters that this
        # interpreter's Unicode database has no name for. Fall back to an
        # empty name, which misses SCRIPT_TABLE and is scored as an unknown
        # script.
        name = unicodedata.name(char, '')
        scriptname = name.split(' ', 1)[0]
        freq, script = SCRIPT_TABLE.get(scriptname, (0, 'other'))
        if prev_letter_script:
            if script != prev_letter_script:
                very_weird_things += 1
            if freq == 1:
                weird_things += 2
            elif freq == 0:
                very_weird_things += 1
        prev_letter_script = script

    return 100 * errors + 10 * very_weird_things + weird_things
|
|
|
|
|
|
|
|
|
|
|
|
def text_cost(text):
    """
    Score a string as its text_badness() penalty plus its length.

    Lower is better; the length term is part of the score, so a shorter
    string with the same badness costs less.
    """
    badness = text_badness(text)
    return badness + len(text)
|
|
|
|
|
|
|
|
#######################################################################
# The rest of this file is esoteric info about characters, scripts, and their
# frequencies.
#
# Start with an inventory of "gremlins", which are characters from all over
# Unicode that Windows has instead assigned to the control characters
# 0x80-0x9F. We might encounter them in their Unicode forms and have to figure
# out what they were originally.

WINDOWS_1252_GREMLINS = [
    # adapted from http://effbot.org/zone/unicode-gremlins.htm
    # Trailing "-> 0x.." is the Windows-1252 byte the character occupies.
    0x0152,  # LATIN CAPITAL LIGATURE OE                   -> 0x8C
    0x0153,  # LATIN SMALL LIGATURE OE                     -> 0x9C
    0x0160,  # LATIN CAPITAL LETTER S WITH CARON           -> 0x8A
    0x0161,  # LATIN SMALL LETTER S WITH CARON             -> 0x9A
    0x0178,  # LATIN CAPITAL LETTER Y WITH DIAERESIS       -> 0x9F
    0x017E,  # LATIN SMALL LETTER Z WITH CARON             -> 0x9E
    0x017D,  # LATIN CAPITAL LETTER Z WITH CARON           -> 0x8E
    0x0192,  # LATIN SMALL LETTER F WITH HOOK              -> 0x83
    0x02C6,  # MODIFIER LETTER CIRCUMFLEX ACCENT           -> 0x88
    0x02DC,  # SMALL TILDE                                 -> 0x98
    0x2013,  # EN DASH                                     -> 0x96
    0x2014,  # EM DASH                                     -> 0x97
    0x201A,  # SINGLE LOW-9 QUOTATION MARK                 -> 0x82
    0x201C,  # LEFT DOUBLE QUOTATION MARK                  -> 0x93
    0x201D,  # RIGHT DOUBLE QUOTATION MARK                 -> 0x94
    0x201E,  # DOUBLE LOW-9 QUOTATION MARK                 -> 0x84
    0x2018,  # LEFT SINGLE QUOTATION MARK                  -> 0x91
    0x2019,  # RIGHT SINGLE QUOTATION MARK                 -> 0x92
    0x2020,  # DAGGER                                      -> 0x86
    0x2021,  # DOUBLE DAGGER                               -> 0x87
    0x2022,  # BULLET                                      -> 0x95
    0x2026,  # HORIZONTAL ELLIPSIS                         -> 0x85
    0x2030,  # PER MILLE SIGN                              -> 0x89
    0x2039,  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK   -> 0x8B
    0x203A,  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK  -> 0x9B
    0x20AC,  # EURO SIGN                                   -> 0x80
    0x2122,  # TRADE MARK SIGN                             -> 0x99
]
|
|
|
|
|
|
|
|
# A list of the Unicode code points that might appear in Windows-1252 text:
# everything below 256, followed by the gremlins.
WINDOWS_1252_CODEPOINTS = list(range(256))
WINDOWS_1252_CODEPOINTS.extend(WINDOWS_1252_GREMLINS)
|
2013-10-11 17:28:32 +00:00
|
|
|
|
|
|
|
# Rank the characters typically represented by a single byte -- that is, in
# Latin-1 or Windows-1252 -- by how weird it would be to see them in running
# text.
#
#   0 = not weird at all
#   1 = rare punctuation or rare letter that someone could certainly
#       have a good reason to use. All Windows-1252 gremlins are at least
#       weirdness 1.
#   2 = things that probably don't appear next to letters or other
#       symbols, such as math or currency symbols
#   3 = obscure symbols that nobody would go out of their way to use
#       (includes symbols that were replaced in ISO-8859-15)
#   4 = why would you use this?
#   5 = unprintable control character
#
# The Portuguese letter Ã (0xc3) is marked as weird because it would usually
# appear in the middle of a word in actual Portuguese, and meanwhile it
# appears in the mis-encodings of many common characters.
#
# The table is indexed by code point; each row below covers 16 code points.

SINGLE_BYTE_WEIRDNESS = (
    # 0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
    5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 5, 5, 5,  # 0x00
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,  # 0x10
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x20
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x30
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x40
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x50
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x60
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5,  # 0x70
    2, 5, 1, 4, 1, 1, 3, 3, 4, 3, 1, 1, 1, 5, 1, 5,  # 0x80
    5, 1, 1, 1, 1, 3, 1, 1, 4, 1, 1, 1, 1, 5, 1, 1,  # 0x90
    1, 0, 2, 2, 3, 2, 4, 2, 4, 2, 2, 0, 3, 1, 1, 4,  # 0xa0
    2, 2, 3, 3, 4, 3, 3, 2, 4, 4, 4, 0, 3, 3, 3, 0,  # 0xb0
    0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xc0
    1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xd0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xe0
    1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xf0
)
|
|
|
|
|
|
|
|
# Pre-cache the Unicode data saying which of these first 256 characters are
# letters. We'll need it often.
#
# Entry N is True when the character with code point N has a general category
# beginning with 'L'.
SINGLE_BYTE_LETTERS = [
    unicodedata.category(char)[0] == 'L'
    for char in map(unichr, range(256))
]
|
|
|
|
|
|
|
|
# A table telling us how to interpret the first word of a letter's Unicode
# name. The number indicates how frequently we expect this script to be used
# on computers. Many scripts not included here are assumed to have a frequency
# of "0" -- if you're going to write in Linear B using Unicode, you're
# probably aware enough of encoding issues to get it right.
#
# The lowercase name is a general category -- for example, Han characters and
# Hiragana characters are very frequently adjacent in Japanese, so they all go
# into category 'cjk'. Letters of different categories are assumed not to
# appear next to each other often.
SCRIPT_TABLE = {
    # Frequency 3
    'LATIN': (3, 'latin'),

    # Frequency 2
    'CJK': (2, 'cjk'),
    'ARABIC': (2, 'arabic'),
    'CYRILLIC': (2, 'cyrillic'),
    'GREEK': (2, 'greek'),
    'HEBREW': (2, 'hebrew'),
    'KATAKANA': (2, 'cjk'),
    'HIRAGANA': (2, 'cjk'),
    'HIRAGANA-KATAKANA': (2, 'cjk'),
    'HANGUL': (2, 'cjk'),
    'DEVANAGARI': (2, 'devanagari'),
    'THAI': (2, 'thai'),
    'FULLWIDTH': (2, 'cjk'),
    'MODIFIER': (2, None),

    # Frequency 1
    'HALFWIDTH': (1, 'cjk'),
    'BENGALI': (1, 'bengali'),
    'LAO': (1, 'lao'),
    'KHMER': (1, 'khmer'),
    'TELUGU': (1, 'telugu'),
    'MALAYALAM': (1, 'malayalam'),
    'SINHALA': (1, 'sinhala'),
    'TAMIL': (1, 'tamil'),
    'GEORGIAN': (1, 'georgian'),
    'ARMENIAN': (1, 'armenian'),
    'KANNADA': (1, 'kannada'),  # mostly used for looks of disapproval
    'MASCULINE': (1, 'latin'),
    'FEMININE': (1, 'latin'),
}
|
|
|
|
|