#! /usr/bin/env python3
# -*- coding: utf-8 -*-

# flo-detect-non-ascii-chars-in-files --- Detect non-ASCII characters in a
#                                         set of files
# Copyright (C) 2013  Florent Rougon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys, os, locale, argparse, codecs, collections

# Width, in columns, used both for wrapping messages (see 'tw' below) and for
# the delimiter lines printed around each text-mode report.
line_width = 80
import textwrap
from textwrap import dedent
# Shared wrapper for the messages printed by this program; words and
# hyphenated compounds are never split across lines.
tw = textwrap.TextWrapper(width=line_width, break_long_words=False,
                          break_on_hyphens=False)

# Name under which the program was invoked; used as a prefix in warnings and
# in the --version output.
progname = os.path.basename(sys.argv[0])
progversion = "0.1"
# Text appended to the program name and version by the --version option.
version_blurb = """Written by Florent Rougon.

Copyright (c) 2013  Florent Rougon
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."""


class NonASCIICharsDetector:
    """Report non-ASCII content found in files.

    Two reporting modes are available: a text mode (process_file()) that
    decodes a file and prints the lines containing non-ASCII characters
    together with some context, and a bytes mode
    (process_file_in_bytes_mode()) that lists the offsets of every byte
    greater than 127.

    Several methods read their options ('context', 'files_with_matches',
    'bytes_mode') from the module-level 'params' namespace, which must
    therefore be set before they are called.

    Attributes:
      match_count -- running total of matches across all processed files
                     (one per reported group of lines in text mode, one
                     per non-ASCII byte in bytes mode)
    """

    # Number of characters (text mode) or bytes (bytes mode) requested per
    # read() call when a whole file has to be scanned.
    _CHUNK_SIZE = 65536

    def __init__(self):
        self.match_count = 0

    def contains_nonascii_chars(self, s):
        """Return True if the string 's' has a character with ord() > 127."""
        return any(ord(c) > 127 for c in s)

    def process_file(self, filepath, encoding, errors="replace"):
        """Print the lines of 'filepath' containing non-ASCII characters.

        The file is decoded with 'encoding', using the 'errors' policy.
        Every line containing a non-ASCII character is printed between two
        delimiter lines, surrounded by up to params.context lines of
        context. When params.files_with_matches is true, only the file
        path is printed, and only if the file has such a line.

        Return a tuple (processed, found_nonASCII). 'processed' is False
        only when 'errors' is "strict" and the file can't be decoded with
        'encoding'; nothing is printed in this case.
        """
        found_nonASCII = False

        if errors == "strict":
            # Dry run: make sure the whole file can be decoded before
            # printing anything about it. Reading by chunks avoids holding
            # the entire file in memory.
            try:
                with open(filepath, "r", encoding=encoding,
                          errors="strict") as f:
                    while f.read(self._CHUNK_SIZE):
                        pass
            except UnicodeDecodeError:
                return (False, found_nonASCII)

        # Sliding window over the lines read so far: at most params.context
        # previous lines followed by the current one.
        contextlines = collections.deque((), maxlen=params.context+1)
        line = None
        line_nb = 0

        start_delimiter = "<" * line_width
        end_delimiter = ">" * line_width

        def _readline(f):
            """Read the next line of 'f' into 'line'.

            Update 'line_nb' and 'contextlines' accordingly. Return the
            line without its trailing newline, or None at end of file.
            """
            nonlocal line, line_nb

            line = f.readline()
            if not line:
                return None

            if line.endswith('\n'):
                line = line[:-1]
            contextlines.append(line)
            line_nb += 1
            return line

        with open(filepath, "r", encoding=encoding, errors=errors) as f:
            while _readline(f) is not None:
                if not self.contains_nonascii_chars(line):
                    continue

                found_nonASCII = True # only for this file
                self.match_count += 1 # global (across all files)
                # Reports are separated by an empty line, which is thus
                # not wanted before the first match among all files.
                l = [] if self.match_count == 1 else [""]

                l.append("From file {!r} (line {}, decoded as {}):".format(
                        filepath, line_nb, encoding))
                l.append(start_delimiter)
                # Leading context followed by the matching line itself,
                # which is always the last element of the window.
                l.extend(contextlines)

                endcontext_needed = params.context
                while endcontext_needed > 0:
                    if _readline(f) is None:
                        break
                    l.append(line)

                    if self.contains_nonascii_chars(line):
                        # Another match inside the trailing context: it
                        # gets its own full trailing context.
                        endcontext_needed = params.context
                    else:
                        endcontext_needed -= 1

                l.append(end_delimiter)
                if not params.files_with_matches:
                    print(*l, sep='\n')

        if params.files_with_matches and found_nonASCII:
            print(filepath)

        return (True, found_nonASCII)

    def process_file_in_bytes_mode(self, filepath):
        """Print the offsets of all bytes > 127 found in 'filepath'.

        The file is not decoded. Offsets are 0-based and grouped by byte
        value. self.match_count is incremented once per such byte.
        """
        offsets_by_value = {}
        chunk_start = 0         # offset of the first byte of 'chunk'

        with open(filepath, "rb") as f:
            while True:
                chunk = f.read(self._CHUNK_SIZE)
                if not chunk:
                    break

                # Iterating over a bytes object yields integers
                for i, num in enumerate(chunk):
                    if num > 127:
                        self.match_count += 1
                        offsets_by_value.setdefault(num, []).append(
                            chunk_start + i)

                chunk_start += len(chunk)

        if offsets_by_value:
            print("In file {!r}:".format(filepath))

        for (val, offsets) in offsets_by_value.items():
            # Only values > 127 are recorded; among these, those starting
            # from 160 are printable in latin1 or latin9.
            if val >= 160:
                complement = ", latin1: {}, latin9: {}".format(
                  *[ bytes([val]).decode(enc) for enc in ("latin1", "latin9") ])
            else:
                complement = ""

            print(
              "Non-ASCII byte ({val:d}, {val:#x}, {val:#o}{cmpl}) "
              "at offsets:\n  {offs}".format(val=val, cmpl=complement,
                                           offs=' '.join(map(str, offsets))))
            if val in (164, 166, 168, 180, 184, 188, 189, 190):
                print(tw.fill(dedent("""\
                  WARNING: this code point is one of the 8 code points that
                  differ between ISO-8859-1 and ISO-8859-15.\n""")))

    def process_files(self, files, assume_encoding=None):
        """Process each element of 'files' according to 'params'.

        Paths that are not regular files are skipped with a warning on
        the standard error stream, and so are files raising
        PermissionError.

        In bytes mode (params.bytes_mode), files are not decoded at all.
        Otherwise, if 'assume_encoding' is not None, every file is decoded
        with this encoding and the "replace" error policy. If it is None,
        a few encodings are tried in turn with the "strict" policy; should
        they all fail, locale.getpreferredencoding() is used with the
        "replace" policy.
        """
        if assume_encoding is not None:
            try_encodings = ()
            fallback_encoding = assume_encoding
        else:
            try_encodings = ("utf-8", "iso8859_15", "cp1252")
            fallback_encoding = locale.getpreferredencoding()

        for f in files:
            if not os.path.isfile(f):
                print("{}: WARNING: not a regular file, skipping: {!r}".format(
                        progname, f), file=sys.stderr)
                continue

            try:
                if params.bytes_mode:
                    self.process_file_in_bytes_mode(f)
                else:
                    for encoding in try_encodings:
                        processed, _ = self.process_file(
                            f, encoding, errors="strict")
                        if processed:
                            break
                    else:
                        # No encoding to try, or none of them could
                        # decode the file.
                        self.process_file(f, fallback_encoding, errors="replace")
            except PermissionError as e:
                print("{}: WARNING: {}".format(progname, e), file=sys.stderr)


class ExitFromCommandLineOrConfigFileParsing(Exception):
    """Exception raised to exit from command line and config file parsing

    Attributes:
      message     -- text describing the reason for exiting, or None
      file        -- file object associated with the message
      exit_status -- integer exit status associated with the exception
    """
    ExceptionShortDescription = "Exiting from command-line parsing"

    def __init__(self, message=None, file=sys.stderr, exit_status=2):
        super().__init__(message)
        self.message = message
        self.file = file
        self.exit_status = exit_status

    def __str__(self):
        # __str__() must return a string, but 'message' may be None (it is
        # the default value): fall back to the generic description then.
        if self.message is None:
            return self.ExceptionShortDescription
        return self.message

    def __repr__(self):
        return """{classname}(
  message={msg!r},
  file={file!r},
  exit_status={status!r})""".format(classname=self.__class__.__name__,
                                    file=self.file,
                                    msg=self.message,
                                    status=self.exit_status)


class EncodingAct(argparse.Action):
    """argparse action storing an encoding name after validating it.

    The value is accepted if codecs.lookup() knows about it; otherwise,
    ExitFromCommandLineOrConfigFileParsing is raised with an explanatory
    message.
    """
    def __call__(self, parser, namespace, value, option_string=None):
        try:
            codecs.lookup(value)
        except LookupError:
            msg = dedent("""\
           '{encoding}' is not a recognized encoding; valid encodings are
           those recognized by the Python 'codecs' module (cf.
           <http://docs.python.org/3/library/codecs.html#standard-encodings>)"""
                         ).format(encoding=value)
            raise ExitFromCommandLineOrConfigFileParsing(msg)

        # Store under the destination argparse computed for this option
        # rather than under a hard-coded attribute name, so that the
        # action can be used for any option.
        setattr(namespace, self.dest, value)


def _non_negative_int(string):
    """Convert 'string' to an integer >= 0 (argparse 'type' callable).

    Raise argparse.ArgumentTypeError if 'string' doesn't represent such an
    integer.
    """
    try:
        value = int(string)
    except ValueError:
        # Same wording as the message argparse produces for type=int
        raise argparse.ArgumentTypeError(
            "invalid int value: {!r}".format(string))

    if value < 0:
        raise argparse.ArgumentTypeError(
            "must be a non-negative integer: {!r}".format(string))

    return value


def process_command_line():
    """Parse the command line and return the resulting namespace.

    The returned argparse.Namespace has the following attributes: 'file'
    (list of paths), 'context', 'files_with_matches', 'assume_encoding'
    and 'bytes_mode'.

    An invalid encoding given to -e/--assume-encoding causes
    ExitFromCommandLineOrConfigFileParsing to be raised by the option's
    action; other errors are handled by argparse itself.
    """
    parser = argparse.ArgumentParser(
        usage="""\
%(prog)s [OPTION ...] FILE ...
Detect non-ASCII characters in a set of files.""",
        description="""\
By default, the program tries to guess the encoding of each file in order to
display the lines containing non-ASCII characters. This auto-detection can be
turned off with option -e (--assume-encoding), in which case invalid input
according to the specified encoding is replaced (using the REPLACEMENT
CHARACTER in Unicode locales). This is particularly useful to see non-breakable
spaces by specifying the ASCII encoding to that option.""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # I want --help but not -h (it might be useful for something else)
        add_help=False)

    parser.add_argument('file', nargs='+',
                        help="""file to check""")
    # A negative number of context lines is meaningless and would later
    # make the creation of the context window fail, so reject it here.
    parser.add_argument('-C', '--context', type=_non_negative_int, default=2,
                        help="""\
      number of context lines (default: %(default)s)""")
    parser.add_argument('-l', '--files-with-matches', action='store_true',
                        help="""\
      suppress normal output; instead, print the name of each file that
      contains non-ASCII characters, one per line""")
    parser.add_argument('-e', '--assume-encoding', metavar="ENC", default=None,
                        action=EncodingAct, help="""\
      decode the files with ENC, using the errors="replace" policy (this does
      not modify the files; default: try several encodings before falling back
      to locale.getpreferredencoding())""")
    parser.add_argument('-b', '--bytes-mode', action='store_true',
                        help="""\
      don't try to decode any file; only print byte offsets of bytes > 127""")
    parser.add_argument('--help', action="help",
                        help="display this message and exit")
    # The version text is not wrapped when using
    # formatter_class=argparse.RawDescriptionHelpFormatter
    parser.add_argument('--version', action='version',
                        version="{name} {version}\n{blurb}".format(
            name=progname, version=progversion, blurb=version_blurb))

    return parser.parse_args()


def main():
    """Program entry point.

    Parse the command line into the module-level 'params', process the
    files given there and exit. The exit status is 0 if at least one
    match was found and 1 otherwise; when command line parsing is
    aborted, it is the status carried by the exception.
    """
    global params

    try:
        locale.setlocale(locale.LC_ALL, '')
    except locale.Error as e:
        # The locale requested by the environment is not usable on this
        # system: this is no reason to refuse to examine the files.
        print("{}: WARNING: unable to set the locale from the environment "
              "({})".format(progname, e), file=sys.stderr)

    try:
        params = process_command_line()
    except ExitFromCommandLineOrConfigFileParsing as e:
        if e.message is not None:
            print(tw.fill("{}: {}".format(progname, e.message)), file=e.file)
        sys.exit(e.exit_status)

    detector = NonASCIICharsDetector()
    detector.process_files(params.file, assume_encoding=params.assume_encoding)

    sys.exit(0 if detector.match_count else 1)

if __name__ == "__main__": main()