update Shared
This commit is contained in:
parent
e7ebbedd38
commit
6881f3471a
184 changed files with 13080 additions and 13691 deletions
|
|
@ -15,7 +15,7 @@
|
|||
# 02110-1301 USA
|
||||
######################### END LICENSE BLOCK #########################
|
||||
|
||||
__version__ = "2.2.1"
|
||||
__version__ = "2.3.0"
|
||||
from sys import version_info
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -12,34 +12,68 @@ Example::
|
|||
If no paths are provided, it takes its input from stdin.
|
||||
|
||||
"""
|
||||
from io import open
|
||||
from sys import argv, stdin
|
||||
|
||||
from __future__ import absolute_import, print_function, unicode_literals
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from io import open
|
||||
|
||||
from chardet import __version__
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
|
||||
|
||||
def description_of(file, name='stdin'):
|
||||
"""Return a string describing the probable encoding of a file."""
|
||||
def description_of(lines, name='stdin'):
|
||||
"""
|
||||
Return a string describing the probable encoding of a file or
|
||||
list of strings.
|
||||
|
||||
:param lines: The lines to get the encoding of.
|
||||
:type lines: Iterable of bytes
|
||||
:param name: Name of file or collection of lines
|
||||
:type name: str
|
||||
"""
|
||||
u = UniversalDetector()
|
||||
for line in file:
|
||||
for line in lines:
|
||||
u.feed(line)
|
||||
u.close()
|
||||
result = u.result
|
||||
if result['encoding']:
|
||||
return '%s: %s with confidence %s' % (name,
|
||||
result['encoding'],
|
||||
result['confidence'])
|
||||
return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
|
||||
result['confidence'])
|
||||
else:
|
||||
return '%s: no result' % name
|
||||
return '{0}: no result'.format(name)
|
||||
|
||||
|
||||
def main():
|
||||
if len(argv) <= 1:
|
||||
print(description_of(stdin))
|
||||
else:
|
||||
for path in argv[1:]:
|
||||
with open(path, 'rb') as f:
|
||||
print(description_of(f, path))
|
||||
def main(argv=None):
|
||||
'''
|
||||
Handles command line arguments and gets things started.
|
||||
|
||||
:param argv: List of arguments, as if specified on the command-line.
|
||||
If None, ``sys.argv[1:]`` is used instead.
|
||||
:type argv: list of str
|
||||
'''
|
||||
# Get command line arguments
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Takes one or more file paths and reports their detected \
|
||||
encodings",
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
||||
conflict_handler='resolve')
|
||||
parser.add_argument('input',
|
||||
help='File whose encoding we would like to determine.',
|
||||
type=argparse.FileType('rb'), nargs='*',
|
||||
default=[sys.stdin])
|
||||
parser.add_argument('--version', action='version',
|
||||
version='%(prog)s {0}'.format(__version__))
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
for f in args.input:
|
||||
if f.isatty():
|
||||
print("You are running chardetect interactively. Press " +
|
||||
"CTRL-D twice at the start of a blank line to signal the " +
|
||||
"end of your input. If you want help, run chardetect " +
|
||||
"--help\n", file=sys.stderr)
|
||||
print(description_of(f, f.name))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
|||
|
|
@ -177,6 +177,12 @@ class JapaneseContextAnalysis:
|
|||
return -1, 1
|
||||
|
||||
class SJISContextAnalysis(JapaneseContextAnalysis):
|
||||
def __init__(self):
|
||||
self.charset_name = "SHIFT_JIS"
|
||||
|
||||
def get_charset_name(self):
|
||||
return self.charset_name
|
||||
|
||||
def get_order(self, aBuf):
|
||||
if not aBuf:
|
||||
return -1, 1
|
||||
|
|
@ -184,6 +190,8 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
|
|||
first_char = wrap_ord(aBuf[0])
|
||||
if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
|
||||
charLen = 2
|
||||
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
|
||||
self.charset_name = "CP932"
|
||||
else:
|
||||
charLen = 1
|
||||
|
||||
|
|
|
|||
|
|
@ -129,11 +129,11 @@ class Latin1Prober(CharSetProber):
|
|||
if total < 0.01:
|
||||
confidence = 0.0
|
||||
else:
|
||||
confidence = ((self._mFreqCounter[3] / total)
|
||||
- (self._mFreqCounter[1] * 20.0 / total))
|
||||
confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
|
||||
/ total)
|
||||
if confidence < 0.0:
|
||||
confidence = 0.0
|
||||
# lower the confidence of latin1 so that other more accurate
|
||||
# detector can take priority.
|
||||
confidence = confidence * 0.5
|
||||
confidence = confidence * 0.73
|
||||
return confidence
|
||||
|
|
|
|||
|
|
@ -353,7 +353,7 @@ SJIS_cls = (
|
|||
2,2,2,2,2,2,2,2, # 68 - 6f
|
||||
2,2,2,2,2,2,2,2, # 70 - 77
|
||||
2,2,2,2,2,2,2,1, # 78 - 7f
|
||||
3,3,3,3,3,3,3,3, # 80 - 87
|
||||
3,3,3,3,3,2,2,3, # 80 - 87
|
||||
3,3,3,3,3,3,3,3, # 88 - 8f
|
||||
3,3,3,3,3,3,3,3, # 90 - 97
|
||||
3,3,3,3,3,3,3,3, # 98 - 9f
|
||||
|
|
@ -369,9 +369,8 @@ SJIS_cls = (
|
|||
2,2,2,2,2,2,2,2, # d8 - df
|
||||
3,3,3,3,3,3,3,3, # e0 - e7
|
||||
3,3,3,3,3,4,4,4, # e8 - ef
|
||||
4,4,4,4,4,4,4,4, # f0 - f7
|
||||
4,4,4,4,4,0,0,0 # f8 - ff
|
||||
)
|
||||
3,3,3,3,3,3,3,3, # f0 - f7
|
||||
3,3,3,3,3,0,0,0) # f8 - ff
|
||||
|
||||
|
||||
SJIS_st = (
|
||||
|
|
@ -571,5 +570,3 @@ UTF8SMModel = {'classTable': UTF8_cls,
|
|||
'stateTable': UTF8_st,
|
||||
'charLenTable': UTF8CharLenTable,
|
||||
'name': 'UTF-8'}
|
||||
|
||||
# flake8: noqa
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ class SJISProber(MultiByteCharSetProber):
|
|||
self._mContextAnalyzer.reset()
|
||||
|
||||
def get_charset_name(self):
|
||||
return "SHIFT_JIS"
|
||||
return self._mContextAnalyzer.get_charset_name()
|
||||
|
||||
def feed(self, aBuf):
|
||||
aLen = len(aBuf)
|
||||
|
|
|
|||
|
|
@ -71,9 +71,9 @@ class UniversalDetector:
|
|||
|
||||
if not self._mGotData:
|
||||
# If the data starts with BOM, we know it is UTF
|
||||
if aBuf[:3] == codecs.BOM:
|
||||
if aBuf[:3] == codecs.BOM_UTF8:
|
||||
# EF BB BF UTF-8 with BOM
|
||||
self.result = {'encoding': "UTF-8", 'confidence': 1.0}
|
||||
self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
|
||||
elif aBuf[:4] == codecs.BOM_UTF32_LE:
|
||||
# FF FE 00 00 UTF-32, little-endian BOM
|
||||
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue