#! /usr/bin/env python

# flo-recode-filecontents --- Recode file contents
# Copyright (c) 2009 Florent Rougon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to the
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA  02110-1301 USA.

import sys, os, stat, getopt, codecs

# Name under which the program was invoked; used in messages and in the
# backup suffix below.
progname = os.path.basename(sys.argv[0])
progversion = "0.1"
# Suffix appended to a file name to build the name of its temporary backup.
# Files whose name ends with this suffix are never recoded.
backup_suffix = ".%s-bak" % (progname,)

# Help text printed by --help and on command-line errors. The two '%s'
# placeholders are filled with the program name and the backup suffix.
usage = """Usage: %s [OPTION...] FILE_OR_DIR...
Recode the contents of one or more files.

Every FILE_OR_DIR argument is examined: if it is a file and its contents
is not already in the target encoding, it is recoded into that encoding;
if it is a directory and option -r was given, every file under that
directory is processed in this way. If no FILE_OR_DIR argument is
supplied, they are read from the standard input, separated by ASCII
linefeed (10) characters, or ASCII NUL (0) if option -0 is supplied.

The general goal is to recode the contents of files from ORIG-ENCODING
to TARGET-ENCODING, without messing up with those that are already
encoded in TARGET-ENCODING (as a consequence, if the script stops
because of some error, it can be run again without any bad outcome once
the error is fixed).

The algorithm is tailored to the conversion from ISO-8859-1 (or
ISO-8859-15) to UTF-8, which has the following remarkable properties:

  (1) Conversion from ORIG-ENCODING to TARGET-ENCODING is always
      possible (and lossless).

  (2) If a string consisting only of characters from ORIG-ENCODING is
      encoded to TARGET-ENCODING, the resulting byte string is a valid
      string for ORIG-ENCODING, although not the same as the original
      string (e.g., the UTF-8 encoding of a ISO-8859-1 string is itself
      a valid ISO-8859-1 string).

  (3) However, a string from ORIG-ENCODING that contains at least one
      non-ASCII character is not a valid byte string for TARGET-ENCODING
      (e.g., a ISO-8859-1 string that doesn't only contain ASCII
      characters is in general not a valid UTF-8 string).

  (4) There is nothing to do to convert an ASCII string to
      TARGET-ENCODING (e.g., ASCII strings are their own representation
      in UTF-8 encoding).

Thus, for any given string S that is considered for conversion:
  - if S contains only ASCII characters, it is left unchanged;
  - else, if S is a valid byte string for TARGET-ENCODING, it is
    considered to be already encoded in TARGET-ENCODING, and is left
    unchanged;
  - else, if S is a valid byte string for ORIG-ENCODING, it is considered
    to be encoded in ORIG-ENCODING, and is converted to TARGET-ENCODING
    based on this assumption;
  - else, the program considers the encoding of S as unknown, and leaves
    S unchanged.

This little algorithm is applied to every FILE_OR_DIR argument,
recursively if option -r was supplied. Symbolic links are not followed.
Before recoding a file, the program creates a backup with the same name
except for the added suffix '%s'.

The program prints a line for every file considered (except for its own
backup files), as long as it contains at least one non-ASCII character.
The first character of this line says what is being done with the file.
The second character gives information about its type. The rest of the
line gives information about the file (name or 'FROM -> TO' form for
symbolic links).

Meaning of the first characters of an output line
-------------------------------------------------

First character:

    R    Recoding the file contents
    L    Leaving unchanged, as it seems to be already in TARGET-ENCODING
 <space> Leaving unchanged for another reason (special file or backup
         file created by this program)
    U    Unknown encoding, leaving unchanged
    E    Error while creating the backup file

Second character:

    f    regular file
    s    symbolic link
    S    special file (device file, socket, named pipe...)
    B    backup file created by this program

The valid encodings for ORIG-ENCODING and TARGET-ENCODING are listed in
the documentation for the Python 'codecs' module in
<python documentation root>/html/lib/standard-encodings.html. They
include 'iso-8859-1', 'iso-8859-15' and 'utf-8'.

Options:

  -f, --from=ORIG-ENCODING    original encoding of the file contents
  -t, --to=TARGET-ENCODING    target encoding for the file contents
  -r, --recursive             recursively process directory arguments
  -n, --simulate              don't do any recoding, only print what
                              would otherwise be done
  -0                          use null characters as separators when
                              reading arguments from the standard input
      --help                  display usage information and exit""" % \
(progname, backup_suffix)


def guess_encoding(s, orig, target):
    """Guess how the byte string 's' is encoded.

    Decoding is attempted with ASCII first, then with the 'target'
    encoding, and finally with the 'orig' encoding; the first attempt that
    succeeds wins. The order matters: a string that is valid for both
    'target' and 'orig' is reported as being in 'target'.

    Arguments:
      s      -- byte string to examine
      orig   -- name of the encoding files are expected to be converted from
      target -- name of the encoding files are expected to be converted to

    Return a tuple (KIND, U) where KIND is one of the strings "ascii",
    "target" or "orig" and U is 's' decoded accordingly, or None if 's'
    cannot be decoded with any of the three encodings.
    """
    candidates = (("ascii", "ascii"),
                  ("target", target),
                  ("orig", orig))

    for kind, encoding in candidates:
        try:
            decoded = s.decode(encoding)
        except UnicodeDecodeError:
            # Not valid for this encoding; try the next candidate
            pass
        else:
            return (kind, decoded)

    return None


def _write_bytes(path, data):
    """Write the byte string 'data' to 'path', replacing any existing file."""
    outfile = open(path, "wb")
    try:
        outfile.write(data)
    finally:
        # Closing also flushes; done in 'finally' so that the descriptor is
        # released even when the write fails.
        outfile.close()


def handle_file(f, orig_encoding, target_encoding, simulate):
    """Recode the contents of the regular file 'f' if it needs to be.

    The contents are classified with guess_encoding() and one status line
    is printed, except for pure ASCII files and for the program's own
    backup files, which are skipped silently:

      Rf  the file is in 'orig_encoding' and is being recoded
      Lf  the file already seems to be in 'target_encoding'
      Uf  the encoding could not be determined
      Ef  the backup file could not be created; the file is left alone
       B  a backup file already exists; the file is left alone

    Arguments:
      f               -- path of the file to process
      orig_encoding   -- name of the encoding to convert from
      target_encoding -- name of the encoding to convert to
      simulate        -- if true, only print what would be done

    A backup named f + backup_suffix is written before 'f' is overwritten
    and removed once the new contents have been written. Errors raised
    while reading or rewriting 'f' itself are not caught here.
    """
    # Don't process our own backup files!
    if f.endswith(backup_suffix):
        return

    infile = open(f, "rb")
    try:
        contents = infile.read()
    finally:
        infile.close()

    guess = guess_encoding(contents, orig_encoding, target_encoding)

    if guess is None:
        # Unknown encoding. guess_encoding() returns None in that case, not
        # a tuple, so it must be tested before unpacking.
        print("Uf %s" % (f,))
        return

    guessed_encoding, unicode_string = guess

    if guessed_encoding == "ascii":
        # Nothing to do and nothing to report
        return

    if guessed_encoding == "target":
        # Leaving as is
        print("Lf %s" % (f,))
        return

    assert guessed_encoding == "orig"
    backup_file = "%s%s" % (f, backup_suffix)

    if os.path.exists(backup_file):
        # Never overwrite an existing backup file
        print(" B %s" % (backup_file,))
        return

    if not simulate:
        try:
            _write_bytes(backup_file, contents)
        except EnvironmentError:
            # Covers IOError and OSError only, so that KeyboardInterrupt and
            # SystemExit are not swallowed.
            print("Ef %s" % (f,))
            return

    # Recoding
    print("Rf %s" % (f,))
    newcontents = unicode_string.encode(target_encoding)

    if not simulate:
        _write_bytes(f, newcontents)
        # The backup is only removed after the new contents were written
        os.unlink(backup_file)


def os_walk_dumb_error_handling(exception):
    """Write the text of 'exception', followed by a newline, to stderr.

    Nothing is raised, so a caller using this as an error callback simply
    carries on after the message has been printed.
    """
    message = str(exception)
    sys.stderr.write(message + "\n")


def handle_direntry(direntry, orig_encoding, target_encoding, recursive,
                    simulate):
    """Process one path according to its file type.

    Arguments:
      direntry        -- path to examine (symbolic links are not followed)
      orig_encoding   -- name of the encoding to convert from
      target_encoding -- name of the encoding to convert to
      recursive       -- if true, descend into directories
      simulate        -- if true, only print what would be done

    Regular files are passed to handle_file(). A directory is walked when
    'recursive' is true and every non-directory entry found below it is
    processed by this function. Symbolic links are reported with an ' s'
    line and anything else, including a directory when 'recursive' is
    false, with an ' S' line.
    """
    mode = os.lstat(direntry)[stat.ST_MODE]

    if stat.S_ISDIR(mode) and recursive:
        for root, dirs, files in os.walk(direntry,
                                         onerror=os_walk_dumb_error_handling):
            for f in files:
                # Not handle_file, because f could be a symbolic link and
                # would be followed by handle_file
                handle_direntry(os.path.join(root, f),
                                orig_encoding, target_encoding, recursive,
                                simulate)
    elif stat.S_ISREG(mode):
        handle_file(direntry, orig_encoding, target_encoding, simulate)
    elif stat.S_ISLNK(mode):
        target = os.readlink(direntry)
        print(" s %s -> %s" % (direntry, target))
    else:
        print(" S %s" % (direntry,))


def split_stream(input, separator):
    """Generate the items read from 'input', split on 'separator'.

    'input' is read one character at a time with input.read(1), so items
    are produced as soon as their terminating separator has been read.
    Two consecutive separators produce an empty item. Whatever follows the
    last separator is produced as a final item, unless it is empty.
    """
    pending = []
    char = input.read(1)

    while char:
        if char != separator:
            pending.append(char)
        else:
            yield ''.join(pending)
            pending = []
        char = input.read(1)

    # Unterminated last item
    if pending:
        yield ''.join(pending)


def main():
    """Parse the command line and process every requested file or directory.

    Both --from and --to are mandatory and must name encodings known to
    Python. The paths to process are the non-option arguments or, when
    there are none, the items read from the standard input (separated by
    linefeeds, or by NUL characters with -0).

    The function always ends by calling sys.exit(): with status 0 on
    success or after --help, with an error message otherwise.
    """
    try:
        # Options processing
        opts, args = getopt.getopt(sys.argv[1:], "f:t:nr0",
                                   ["from=",
                                    "to=",
                                    "recursive",
                                    "simulate",
                                    "help"])
    except getopt.GetoptError:
        # Tell the user what was wrong before showing the usage text.
        # sys.exc_info() is used to get at the exception without depending
        # on a particular 'except' binding syntax.
        message = sys.exc_info()[1]
        sys.exit("%s: %s\n\n%s" % (progname, message, usage))

    # Default values for options
    orig_encoding = None
    target_encoding = None
    recursive = False
    separator = '\n'
    simulate = False

    # Read command-line options
    for option, value in opts:
        if option == "--help":
            print(usage)
            sys.exit(0)
        elif option in ("-0",):
            separator = '\x00'
        elif option in ("-f", "--from"):
            orig_encoding = value
        elif option in ("-t", "--to"):
            target_encoding = value
        elif option in ("-r", "--recursive"):
            recursive = True
        elif option in ("-n", "--simulate"):
            simulate = True
        else:
            # Option accepted by getopt but not handled above
            sys.exit(usage)

    if (orig_encoding is None) or (target_encoding is None):
        sys.exit("%s: both --from and --to must be provided" % progname)

    # Reject unknown encodings before touching any file
    for encoding in (orig_encoding, target_encoding):
        try:
            codecs.lookup(encoding)
        except LookupError:
            sys.exit("Encoding not recognized by Python: %s" % encoding)

    if args:
        stuff_to_inspect = args
    else:
        stuff_to_inspect = split_stream(sys.stdin, separator)

    for file_or_dir in stuff_to_inspect:
        handle_direntry(file_or_dir, orig_encoding, target_encoding,
                        recursive, simulate)

    sys.exit(0)

# Run the program only when this file is executed as a script
if __name__ == "__main__":
    main()
