From 2bfd63ff0f1f2128ceb67199bcc5752cc7f32058 Mon Sep 17 00:00:00 2001
From: j <0x006A@0x2620.org>
Date: Sun, 30 Dec 2012 16:16:23 +0100
Subject: [PATCH] add ox.fix_bad_unicode

---
 ox/__init__.py   |   1 +
 ox/fixunicode.py | 331 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 332 insertions(+)
 create mode 100644 ox/fixunicode.py

diff --git a/ox/__init__.py b/ox/__init__.py
index e667851..c767960 100644
--- a/ox/__init__.py
+++ b/ox/__init__.py
@@ -27,3 +27,4 @@ from normalize import *
 from oembed import *
 from text import *
 from torrent import *
+from fixunicode import *
diff --git a/ox/fixunicode.py b/ox/fixunicode.py
new file mode 100644
index 0000000..332684b
--- /dev/null
+++ b/ox/fixunicode.py
@@ -0,0 +1,331 @@
+# vi:si:et:sw=4:sts=4:ts=4
+# -*- coding: utf-8 -*-
+# from http://blog.lumino.so/2012/08/20/fix-unicode-mistakes-with-python/
+# MIT
+
+import unicodedata
+
+__all__ = ['fix_bad_unicode']
+
+def fix_bad_unicode(text):
+    u"""
+    Something you will find all over the place, in real-world text, is text
+    that's mistakenly encoded as utf-8, decoded in some ugly format like
+    latin-1 or even Windows codepage 1252, and encoded as utf-8 again.
+
+    This causes your perfectly good Unicode-aware code to end up with garbage
+    text because someone else (or maybe "someone else") made a mistake.
+
+    This function looks for the evidence of that having happened and fixes it.
+    It determines whether it should replace nonsense sequences of single-byte
+    characters that were really meant to be UTF-8 characters, and if so, turns
+    them into the correctly-encoded Unicode character that they were meant to
+    represent.
+
+    The input to the function must be Unicode. It's not going to try to
+    auto-decode bytes for you -- then it would just create the problems it's
+    supposed to fix.
+
+    >>> print fix_bad_unicode(u'único')
+    único
+
+    >>> print fix_bad_unicode(u'This text is fine already :þ')
+    This text is fine already :þ
+
+    Because these characters often come from Microsoft products, we allow
+    for the possibility that we get not just Unicode characters 128-255, but
+    also Windows's conflicting idea of what characters 128-160 are.
+
+    >>> print fix_bad_unicode(u'This — should be an em dash')
+    This — should be an em dash
+
+    We might have to deal with both Windows characters and raw control
+    characters at the same time, especially when dealing with characters like
+    \x81 that have no mapping in Windows.
+
+    >>> print fix_bad_unicode(u'This text is sad .â\x81”.')
+    This text is sad .⁔.
+
+    This function even fixes multiple levels of badness:
+
+    >>> wtf = u'\xc3\xa0\xc2\xb2\xc2\xa0_\xc3\xa0\xc2\xb2\xc2\xa0'
+    >>> print fix_bad_unicode(wtf)
+    ಠ_ಠ
+
+    However, it has safeguards against fixing sequences of letters and
+    punctuation that can occur in valid text:
+
+    >>> print fix_bad_unicode(u'not such a fan of Charlotte Brontë…”')
+    not such a fan of Charlotte Brontë…”
+
+    Cases of genuine ambiguity can sometimes be addressed by finding other
+    characters that are not double-encoding, and expecting the encoding to
+    be consistent:
+
+    >>> print fix_bad_unicode(u'AHÅ™, the new sofa from IKEA®')
+    AHÅ™, the new sofa from IKEA®
+
+    Finally, we handle the case where the text is in a single-byte encoding
+    that was intended as Windows-1252 all along but read as Latin-1:
+
+    >>> print fix_bad_unicode(u'This text was never Unicode at all\x85')
+    This text was never Unicode at all…
+    """
+    if not isinstance(text, unicode):
+        raise TypeError("This isn't even decoded into Unicode yet. "
+                        "Decode it first.")
+    if len(text) == 0:
+        return text
+
+    maxord = max(ord(char) for char in text)
+    tried_fixing = []
+    if maxord < 128:
+        # Hooray! It's ASCII!
+        return text
+    else:
+        attempts = [(text, text_badness(text) + len(text))]
+        if maxord < 256:
+            tried_fixing = reinterpret_latin1_as_utf8(text)
+            tried_fixing2 = reinterpret_latin1_as_windows1252(text)
+            attempts.append((tried_fixing, text_cost(tried_fixing)))
+            attempts.append((tried_fixing2, text_cost(tried_fixing2)))
+        elif all(ord(char) in WINDOWS_1252_CODEPOINTS for char in text):
+            tried_fixing = reinterpret_windows1252_as_utf8(text)
+            attempts.append((tried_fixing, text_cost(tried_fixing)))
+        else:
+            # We can't imagine how this would be anything but valid text.
+            return text
+
+        # Sort the results by badness
+        attempts.sort(key=lambda x: x[1])
+        #print attempts
+        goodtext = attempts[0][0]
+        if goodtext == text:
+            return goodtext
+        else:
+            return fix_bad_unicode(goodtext)
+
+
+def reinterpret_latin1_as_utf8(wrongtext):
+    newbytes = wrongtext.encode('latin-1', 'replace')
+    return newbytes.decode('utf-8', 'replace')
+
+
+def reinterpret_windows1252_as_utf8(wrongtext):
+    altered_bytes = []
+    for char in wrongtext:
+        if ord(char) in WINDOWS_1252_GREMLINS:
+            altered_bytes.append(char.encode('WINDOWS_1252'))
+        else:
+            altered_bytes.append(char.encode('latin-1', 'replace'))
+    return ''.join(altered_bytes).decode('utf-8', 'replace')
+
+
+def reinterpret_latin1_as_windows1252(wrongtext):
+    """
+    Maybe this was always meant to be in a single-byte encoding, and it
+    makes the most sense in Windows-1252.
+    """
+    return wrongtext.encode('latin-1').decode('WINDOWS_1252', 'replace')
+
+
+def text_badness(text):
+    u'''
+    Look for red flags that text is encoded incorrectly:
+
+    Obvious problems:
+    - The replacement character \ufffd, indicating a decoding error
+    - Unassigned or private-use Unicode characters
+
+    Very weird things:
+    - Adjacent letters from two different scripts
+    - Letters in scripts that are very rarely used on computers (and
+      therefore, someone who is using them will probably get Unicode right)
+    - Improbable control characters, such as 0x81
+
+    Moderately weird things:
+    - Improbable single-byte characters, such as ƒ or ¬
+    - Letters in somewhat rare scripts
+    '''
+    assert isinstance(text, unicode)
+    errors = 0
+    very_weird_things = 0
+    weird_things = 0
+    prev_letter_script = None
+    for pos in xrange(len(text)):
+        char = text[pos]
+        index = ord(char)
+        if index < 256:
+            # Deal quickly with the first 256 characters.
+            weird_things += SINGLE_BYTE_WEIRDNESS[index]
+            if SINGLE_BYTE_LETTERS[index]:
+                prev_letter_script = 'latin'
+            else:
+                prev_letter_script = None
+        else:
+            category = unicodedata.category(char)
+            if category == 'Co':
+                # Unassigned or private use
+                errors += 1
+            elif index == 0xfffd:
+                # Replacement character
+                errors += 1
+            elif index in WINDOWS_1252_GREMLINS:
+                lowchar = char.encode('WINDOWS_1252').decode('latin-1')
+                weird_things += SINGLE_BYTE_WEIRDNESS[ord(lowchar)] - 0.5
+
+            if category.startswith('L'):
+                # It's a letter. What kind of letter? This is typically found
+                # in the first word of the letter's Unicode name.
+                name = unicodedata.name(char)
+                scriptname = name.split()[0]
+                freq, script = SCRIPT_TABLE.get(scriptname, (0, 'other'))
+                if prev_letter_script:
+                    if script != prev_letter_script:
+                        very_weird_things += 1
+                    if freq == 1:
+                        weird_things += 2
+                    elif freq == 0:
+                        very_weird_things += 1
+                prev_letter_script = script
+            else:
+                prev_letter_script = None
+
+    return 100 * errors + 10 * very_weird_things + weird_things
+
+
+def text_cost(text):
+    """
+    Assign a cost function to the length plus weirdness of a text string.
+    """
+    return text_badness(text) + len(text)
+
+#######################################################################
+# The rest of this file is esoteric info about characters, scripts, and their
+# frequencies.
+#
+# Start with an inventory of "gremlins", which are characters from all over
+# Unicode that Windows has instead assigned to the control characters
+# 0x80-0x9F. We might encounter them in their Unicode forms and have to figure
+# out what they were originally.
+
+WINDOWS_1252_GREMLINS = [
+    # adapted from http://effbot.org/zone/unicode-gremlins.htm
+    0x0152, # LATIN CAPITAL LIGATURE OE
+    0x0153, # LATIN SMALL LIGATURE OE
+    0x0160, # LATIN CAPITAL LETTER S WITH CARON
+    0x0161, # LATIN SMALL LETTER S WITH CARON
+    0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
+    0x017E, # LATIN SMALL LETTER Z WITH CARON
+    0x017D, # LATIN CAPITAL LETTER Z WITH CARON
+    0x0192, # LATIN SMALL LETTER F WITH HOOK
+    0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
+    0x02DC, # SMALL TILDE
+    0x2013, # EN DASH
+    0x2014, # EM DASH
+    0x201A, # SINGLE LOW-9 QUOTATION MARK
+    0x201C, # LEFT DOUBLE QUOTATION MARK
+    0x201D, # RIGHT DOUBLE QUOTATION MARK
+    0x201E, # DOUBLE LOW-9 QUOTATION MARK
+    0x2018, # LEFT SINGLE QUOTATION MARK
+    0x2019, # RIGHT SINGLE QUOTATION MARK
+    0x2020, # DAGGER
+    0x2021, # DOUBLE DAGGER
+    0x2022, # BULLET
+    0x2026, # HORIZONTAL ELLIPSIS
+    0x2030, # PER MILLE SIGN
+    0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
+    0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
+    0x20AC, # EURO SIGN
+    0x2122, # TRADE MARK SIGN
+]
+
+# a list of Unicode characters that might appear in Windows-1252 text
+WINDOWS_1252_CODEPOINTS = range(256) + WINDOWS_1252_GREMLINS
+
+# Rank the characters typically represented by a single byte -- that is, in
+# Latin-1 or Windows-1252 -- by how weird it would be to see them in running
+# text.
+#
+#   0 = not weird at all
+#   1 = rare punctuation or rare letter that someone could certainly
+#       have a good reason to use. All Windows-1252 gremlins are at least
+#       weirdness 1.
+#   2 = things that probably don't appear next to letters or other
+#       symbols, such as math or currency symbols
+#   3 = obscure symbols that nobody would go out of their way to use
+#       (includes symbols that were replaced in ISO-8859-15)
+#   4 = why would you use this?
+#   5 = unprintable control character
+#
+# The Portuguese letter Ã (0xc3) is marked as weird because it would usually
+# appear in the middle of a word in actual Portuguese, and meanwhile it
+# appears in the mis-encodings of many common characters.
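+# For example, SINGLE_BYTE_WEIRDNESS[0x81] below is 5: \x81 is an unprintable
+# control character that even Windows-1252 leaves unassigned.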
+
+SINGLE_BYTE_WEIRDNESS = (
+#   0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 5, 5, 5, # 0x00
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, # 0x10
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x20
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x30
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x40
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x50
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x60
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, # 0x70
+    2, 5, 1, 4, 1, 1, 3, 3, 4, 3, 1, 1, 1, 5, 1, 5, # 0x80
+    5, 1, 1, 1, 1, 3, 1, 1, 4, 1, 1, 1, 1, 5, 1, 1, # 0x90
+    1, 0, 2, 2, 3, 2, 4, 2, 4, 2, 2, 0, 3, 1, 1, 4, # 0xa0
+    2, 2, 3, 3, 4, 3, 3, 2, 4, 4, 4, 0, 3, 3, 3, 0, # 0xb0
+    0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xc0
+    1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, # 0xd0
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xe0
+    1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, # 0xf0
+)
+
+# Pre-cache the Unicode data saying which of these first 256 characters are
+# letters. We'll need it often.
+SINGLE_BYTE_LETTERS = [
+    unicodedata.category(unichr(i)).startswith('L')
+    for i in xrange(256)
+]
+
+# A table telling us how to interpret the first word of a letter's Unicode
+# name. The number indicates how frequently we expect this script to be used
+# on computers. Many scripts not included here are assumed to have a frequency
+# of "0" -- if you're going to write in Linear B using Unicode, you're
+# probably aware enough of encoding issues to get it right.
+#
+# The lowercase name is a general category -- for example, Han characters and
+# Hiragana characters are very frequently adjacent in Japanese, so they all go
+# into category 'cjk'. Letters of different categories are assumed not to
+# appear next to each other often.
+SCRIPT_TABLE = {
+    'LATIN': (3, 'latin'),
+    'CJK': (2, 'cjk'),
+    'ARABIC': (2, 'arabic'),
+    'CYRILLIC': (2, 'cyrillic'),
+    'GREEK': (2, 'greek'),
+    'HEBREW': (2, 'hebrew'),
+    'KATAKANA': (2, 'cjk'),
+    'HIRAGANA': (2, 'cjk'),
+    'HIRAGANA-KATAKANA': (2, 'cjk'),
+    'HANGUL': (2, 'cjk'),
+    'DEVANAGARI': (2, 'devanagari'),
+    'THAI': (2, 'thai'),
+    'FULLWIDTH': (2, 'cjk'),
+    'MODIFIER': (2, None),
+    'HALFWIDTH': (1, 'cjk'),
+    'BENGALI': (1, 'bengali'),
+    'LAO': (1, 'lao'),
+    'KHMER': (1, 'khmer'),
+    'TELUGU': (1, 'telugu'),
+    'MALAYALAM': (1, 'malayalam'),
+    'SINHALA': (1, 'sinhala'),
+    'TAMIL': (1, 'tamil'),
+    'GEORGIAN': (1, 'georgian'),
+    'ARMENIAN': (1, 'armenian'),
+    'KANNADA': (1, 'kannada'), # mostly used for looks of disapproval
+    'MASCULINE': (1, 'latin'),
+    'FEMININE': (1, 'latin')
+}
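
Usage sketch (assuming the patch above is applied to the ox package; Python 2,
as in the module and its doctests):

    # -*- coding: utf-8 -*-
    import ox

    ox.fix_bad_unicode(u'único')                    # -> u'único'

    # The input must already be unicode; decode byte strings first.
    raw = 'This text was never Unicode at all\x85'
    ox.fix_bad_unicode(raw.decode('latin-1'))       # ends with u'…'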