#! /usr/bin/env python

# flo-identify-files-with-nonascii-chars --- Identify files containing non-ASCII
#                                            characters
# Copyright (c) 2009 Florent Rougon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to the
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA  02110-1301 USA.

from __future__ import nested_scopes, division
import sys, os, stat, getopt, codecs

# Name under which the script was invoked; used in the usage text and in
# diagnostics.
progname = os.path.basename(sys.argv[0])

# Help text, printed by --help on the standard output and on the standard
# error after an incorrect invocation.
usage = """Usage: %s [OPTION...] FILE_OR_DIR...
Identify files containing non-ASCII characters.

DESCRIPTION

Every FILE_OR_DIR argument is examined in sequence:
  - if FILE_OR_DIR is a file, its name is printed if, and only if the
    file contains at least a byte greater than 127;
  - if FILE_OR_DIR is a directory, it is scanned in a recursive
    manner in order to print the name of every file under FILE_OR_DIR
    that contains at least a byte greater than 127.

Alternatively, if no argument is given, the FILE_OR_DIR elements to
scan are read from the standard input, separated by newlines by
default (this may be changed with option -0).

Symbolic links are not followed and special files not attempted to
read from; both are simply reported on the standard error.

OPTIONS

  -0                          use null characters as separators when
                              reading arguments from the standard input
  -f, --filter                apply a custom filter to the output (use
                              the Source, Luke)
  -v, --verbose               in addition to file names, print information
                              such as the position and value of the first byte
                              that is greater than 127
      --help                  display usage information and exit

EXIT STATUS

Similarly to GNU grep(1), in case of normal termination, the exit
status is 0 if at least one non-ASCII character was found (in any of
the input file(s)), 1 otherwise. In case of incorrect invocation
(for instance, if an invalid option was given), the exit status is 2.""" % \
(progname,)

# Run-time options.  main() fills in the keys "separator", "filter_output"
# and "verbose" before any file is scanned; handle_file() reads the last two.
params = {}
# Set to True by handle_file() as soon as a scanned file contains a byte
# greater than 127; main() derives the exit status from it.
found_non_ASCII_chars = False


def filter_func(path):
    """Custom output filter applied when -f/--filter is given.

    Return False when path has no extension or has the ".py" extension,
    and True for every other path.  handle_file() prints nothing for a
    path on which this function returns True.
    """
    extension = os.path.splitext(path)[1]
    return extension not in ("", ".py")


def _first_non_ascii_byte(stream, chunk_size=65536):
    """Return (position, value) of the first byte > 127 read from stream.

    stream is a file object opened in binary mode; position is counted
    from where reading starts.  Return None if the end of the stream is
    reached without meeting such a byte.
    """
    offset = 0
    while True:
        chunk = stream.read(chunk_size)
        if not chunk:
            return None
        try:
            # Decoding as ASCII fails exactly on the first byte > 127 and
            # the exception records its index within the chunk.
            chunk.decode("ascii")
        except UnicodeDecodeError:
            # sys.exc_info() is used instead of binding the exception in
            # the except clause, whose syntax depends on the Python version.
            index = sys.exc_info()[1].start
            return offset + index, ord(chunk[index:index + 1])
        offset += len(chunk)


def handle_file(fpath):
    """Scan the regular file fpath and report it if it is not pure ASCII.

    When the file contains a byte greater than 127, the global flag
    found_non_ASCII_chars is set and, unless the output filter is enabled
    and filter_func() returns True for fpath, the file name is printed on
    the standard output.  With params["verbose"], the position and the
    hexadecimal value of the first such byte are printed too.

    Errors raised while opening or reading the file are propagated to the
    caller; the file is closed in every case.
    """
    global found_non_ASCII_chars

    f = open(fpath, "rb")
    try:
        # Read in large chunks rather than one byte per read() call.
        hit = _first_non_ascii_byte(f)
    finally:
        f.close()

    if hit is None:
        return

    found_non_ASCII_chars = True
    if params["filter_output"] and filter_func(fpath):
        return

    if params["verbose"]:
        sys.stdout.write("%s: pos = %u, value = %02X\n" % ((fpath,) + hit))
    else:
        sys.stdout.write("%s\n" % (fpath,))


def os_walk_dumb_error_handling(exception):
    """Error callback for os.walk(): write the exception on stderr.

    The exception is only reported, followed by a newline; it is not
    re-raised.
    """
    message = str(exception)
    sys.stderr.write(message + "\n")


def handle_direntry(direntry):
    """Process one path according to its type, without following symlinks.

    - directory: walked recursively with os.walk(); every entry found is
      passed back to this function (walk errors are reported through
      os_walk_dumb_error_handling);
    - regular file: scanned with handle_file();
    - symbolic link: reported on stderr as "LINK -> TARGET";
    - anything else: reported on stderr as a special file.

    The type is determined with os.lstat(), so an OSError is raised if
    direntry cannot be stat'ed.
    """
    mode = os.lstat(direntry)[stat.ST_MODE]

    if stat.S_ISDIR(mode):
        for root, dirs, files in os.walk(direntry,
                                         onerror=os_walk_dumb_error_handling):
            # os.walk() lists symbolic links to directories in dirs but
            # does not descend into them; without this loop they would be
            # skipped silently instead of being reported like other links.
            for d in dirs:
                dpath = os.path.join(root, d)
                if os.path.islink(dpath):
                    handle_direntry(dpath)
            for f in files:
                # Not handle_file, because f could be a symbolic link and
                # would thus be followed by handle_file
                handle_direntry(os.path.join(root, f))
    elif stat.S_ISREG(mode):
        handle_file(direntry)
    elif stat.S_ISLNK(mode):
        target = os.readlink(direntry)
        sys.stderr.write("%s -> %s\n" % (direntry, target))
    else:
        sys.stderr.write("Special file: %s\n" % (direntry,))


def split_stream(input, separator):
    """Generate the items read from input, split on separator.

    input is read one character at a time until read(1) returns an empty
    value.  Each occurrence of separator ends the current item, which is
    yielded even when empty; characters left after the last separator are
    yielded as a final item only if there are any.
    """
    pending = []
    char = input.read(1)

    while char:
        if char != separator:
            pending.append(char)
        else:
            yield ''.join(pending)
            pending = []
        char = input.read(1)

    if pending:
        yield ''.join(pending)


def main():
    """Parse the command line, scan the requested paths and exit.

    The paths to scan are the positional arguments or, when there are
    none, the items read from the standard input and split on
    params["separator"].  This function does not return: the process exits
    with status 0 if a byte greater than 127 was found in any scanned
    file, 1 if none was found, and 2 after an incorrect invocation.
    """
    try:
        # Options processing
        opts, args = getopt.getopt(sys.argv[1:], "0fv",
                                   ["filter",
                                    "verbose",
                                    "help"])
    except getopt.GetoptError:
        # Say what was wrong before printing the usage text.
        # sys.exc_info() is used instead of binding the exception in the
        # except clause, whose syntax depends on the Python version.
        sys.stderr.write("%s: %s\n" % (progname, sys.exc_info()[1]))
        sys.stderr.write(usage + '\n')
        sys.exit(2)

    # Default values for options
    params["separator"] = '\n'
    params["filter_output"] = False
    params["verbose"] = False

    # Read command-line options
    for option, value in opts:
        if option == "--help":
            sys.stdout.write(usage + '\n')
            sys.exit(0)
        elif option in ("-0",):
            params["separator"] = '\x00'
        elif option in ("-f", "--filter"):
            params["filter_output"] = True
        elif option in ("-v", "--verbose"):
            params["verbose"] = True
        else:
            # Defensive: an option accepted by getopt but not handled above.
            sys.stderr.write(usage + '\n')
            sys.exit(2)

    if not args:
        # No path on the command line: take them from the standard input.
        for file_or_dir in split_stream(sys.stdin, params["separator"]):
            handle_direntry(file_or_dir)
    else:
        for file_or_dir in args:
            handle_direntry(file_or_dir)

    # Same convention as grep(1): 0 means "something was found".
    if found_non_ASCII_chars:
        exit_status = 0
    else:
        exit_status = 1

    sys.exit(exit_status)

# Run the scan only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__": main()
