2013-10-11 17:28:32 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
"""
|
|
|
|
Script which takes one or more file paths and reports on their detected
|
|
|
|
encodings
|
|
|
|
|
|
|
|
Example::
|
|
|
|
|
|
|
|
% chardetect somefile someotherfile
|
|
|
|
somefile: windows-1252 with confidence 0.5
|
|
|
|
someotherfile: ascii with confidence 1.0
|
|
|
|
|
|
|
|
If no paths are provided, it takes its input from stdin.
|
|
|
|
|
|
|
|
"""
|
2015-11-04 12:01:55 +00:00
|
|
|
|
|
|
|
from __future__ import absolute_import, print_function, unicode_literals
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import sys
|
2013-10-11 17:28:32 +00:00
|
|
|
from io import open
|
|
|
|
|
2015-11-04 12:01:55 +00:00
|
|
|
from chardet import __version__
|
2013-10-11 17:28:32 +00:00
|
|
|
from chardet.universaldetector import UniversalDetector
|
|
|
|
|
|
|
|
|
2015-11-04 12:01:55 +00:00
|
|
|
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    Each line is fed to a ``UniversalDetector``. Feeding stops as soon as the
    detector reports that it is done, so the remainder of a large input does
    not have to be read.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    :returns: ``'<name>: <encoding> with confidence <confidence>'`` when an
              encoding was detected, otherwise ``'<name>: no result'``.
    :rtype: str
    """
    detector = UniversalDetector()
    for line in lines:
        detector.feed(line)
        # Once the detector has made up its mind, further input cannot change
        # the result, so stop consuming the iterable.
        if detector.done:
            break
    detector.close()

    result = detector.result
    if result['encoding']:
        return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
                                                     result['confidence'])
    return '{0}: no result'.format(name)
|
2013-10-11 17:28:32 +00:00
|
|
|
|
|
|
|
|
2015-11-04 12:01:55 +00:00
|
|
|
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    For every input file, prints one line describing its detected encoding
    to stdout. When no paths are given, standard input is read instead.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    # description_of() expects bytes. On Python 3 ``sys.stdin`` is a text
    # stream, so use its underlying binary buffer when one is available
    # (Python 2's ``sys.stdin`` has no ``buffer`` and already yields bytes).
    binary_stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes one or more file paths and reports their detected "
                    "encodings",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('input',
                        help='File whose encoding we would like to determine.',
                        type=argparse.FileType('rb'), nargs='*',
                        default=[binary_stdin])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    for f in args.input:
        if f.isatty():
            print("You are running chardetect interactively. Press "
                  "CTRL-D twice at the start of a blank line to signal the "
                  "end of your input. If you want help, run chardetect "
                  "--help\n", file=sys.stderr)
        try:
            print(description_of(f, f.name))
        finally:
            # argparse.FileType opened these handles and nothing else will
            # close them; leave the process-wide stdin streams untouched.
            if f is not sys.stdin and f is not binary_stdin:
                f.close()
|
2013-10-11 17:28:32 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()
|