#! /usr/bin/env python

# flo-recode-filenames --- Recode file names in a directory tree
# Copyright (c) 2009 Florent Rougon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to the
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA  02110-1301 USA.

import sys, os, stat, getopt, codecs

# Name under which the script was invoked (directory part stripped). It is
# interpolated into the usage text below and into error messages.
progname = os.path.basename(sys.argv[0])
# Complete help text of the program. It documents the recoding algorithm, the
# format of the report lines and the command-line options; main() prints it
# for --help and hands it to sys.exit() when the command line is invalid.
usage = """Usage: %s [OPTION...] DIR...
Recode the file names under one or more directories (recursive).

Descend under every DIR argument and recode the names of files and
directories found therein using the following algorithm.

The general goal is to recode files and directory names from
ORIG-ENCODING to TARGET-ENCODING, without messing up with those that are
already encoded in TARGET-ENCODING (as a consequence, if the script
stops because of some error, it can be run again without any bad outcome
once the error is fixed).

The algorithm is tailored to the conversion from ISO-8859-1 (or
ISO-8859-15) to UTF-8, which has the following remarkable properties:

  (1) Conversion from ORIG-ENCODING to TARGET-ENCODING is always
      possible (and lossless).

  (2) If a string consisting only of characters from ORIG-ENCODING is
      encoded to TARGET-ENCODING, the resulting byte string is a valid
      string for ORIG-ENCODING, although not the same as the original
      string (e.g., the UTF-8 encoding of a ISO-8859-1 string is itself
      a valid ISO-8859-1 string).

  (3) However, a string from ORIG-ENCODING that contains at least one
      non-ASCII character is not a valid byte string for TARGET-ENCODING
      (e.g., a ISO-8859-1 string that doesn't only contain ASCII
      characters is in general not a valid UTF-8 string).

  (4) There is nothing to do to convert an ASCII string to
      TARGET-ENCODING (e.g., ASCII strings are their own representation
      in UTF-8 encoding).

Thus, for any given string S that is considered for conversion (file or
directory name, name or target path of a symbolic link):
  - if S contains only ASCII characters, it is left unchanged;
  - else, if S is a valid byte string for TARGET-ENCODING, it is
    considered to be already encoded in TARGET-ENCODING, and is left
    unchanged;
  - else, if S is a valid byte string for ORIG-ENCODING, it is considered
    to be encoded in ORIG-ENCODING, and is converted to TARGET-ENCODING
    based on this assumption;
  - else, the program considers the encoding of S as unknown, and leaves
    S unchanged.

This little algorithm is applied to every file and directory name under
every directory specified with a DIR argument (recursively). Symbolic
links are not followed, but both the name of a symbolic link and its
target are considered for encoding conversion.

The program prints a line for every name N considered, as long as it
contains at least one non-ASCII character. The first character of this
line says what is being done with N (or, if --simulate was given, what
would be done without that option). The second character gives
information about the type of N (name of a regular file, directory,
symbolic link, etc.). Then follows a space and a string indicating the
conversion (path of the file or directory, or in the case of symlinks
whose target is being recoded, a string of the form 'FROM -> TO').

The last part of the line ('TO' string in the case where a symlink
target is being recoded, last path element otherwise) is supposed to be
read with a locale supporting ORIG-ENCODING. In this case, it can be
used to check that the encoding for N was correctly guessed. Indeed, the
string is converted to ORIG-ENCODING before being printed, under the
assumption that the guessed encoding was correct. Other parts of the
line (usually, names of parent directories), need not be printed with
correct encoding. This is because they were not considered for recoding
yet, therefore the program has no idea about their encoding.

Meaning of the first characters of an output line
-------------------------------------------------

First character:

  R  Recoding the file/directory/symlink name or symlink target
  L  Leaving unchanged, as it seems to be already in TARGET-ENCODING
  U  Unknown encoding, leaving unchanged

Second character:

  f  neither a directory, nor a symlink, nor a symlink target
     (presumably, a regular file)
  d  directory
  s  symbolic link whose name is being considered for recoding
  F  symbolic link to a file; the target of the symbolic link is being
     considered for recoding
  D  symbolic link to a directory; the target of the symbolic link is
     being considered for recoding

The valid encodings for ORIG-ENCODING and TARGET-ENCODING are listed in
the documentation for the Python 'codecs' module in
<python documentation root>/html/lib/standard-encodings.html. They
include 'iso-8859-1', 'iso-8859-15' and 'utf-8'.

Options:

  -f, --from=ORIG-ENCODING    original encoding of the file name
  -t, --to=TARGET-ENCODING    target encoding for the file name
  -n, --simulate              don't do any renaming, only print what
                              would otherwise be done
      --help                  display usage information and exit""" % progname


def guess_encoding(s, orig, target):
    """Guess how the byte string 's' is encoded.

    The candidates are tried in a fixed order and the first codec that
    decodes 's' without error wins. The return value is a label, not a
    codec name:

      "ascii"   's' decodes as ASCII (this includes the empty string)
      "target"  's' is a valid byte string for the 'target' codec
      "orig"    's' is a valid byte string for the 'orig' codec
      None      none of the above; the encoding of 's' is unknown

    'orig' and 'target' are codec names as accepted by the 'decode' method
    of byte strings. An unknown codec name is not handled here: the
    LookupError raised by 'decode' propagates to the caller.
    """
    # Order matters: the target encoding is tested before the original one,
    # so a name that has already been converted is reported as "target"
    # even when its bytes would also be valid in the original encoding.
    candidates = (("ascii", "ascii"),
                  ("target", target),
                  ("orig", orig))

    for label, codec in candidates:
        try:
            s.decode(codec)
        except UnicodeDecodeError:
            pass
        else:
            return label

    return None


def handle_regular_file_or_dir(entry_type, root, name,
                               orig_encoding, target_encoding, simulate):
    """Recode the name of one directory entry and report what is done.

    entry_type       -- "directory", "file" or "symlink"; it only selects
                        the second character of the report line
    root             -- path of the directory containing the entry
    name             -- name of the entry inside 'root' (byte string)
    orig_encoding    -- codec name of the encoding to convert from
    target_encoding  -- codec name of the encoding to convert to
    simulate         -- if true, report but do not rename anything

    Nothing is printed for a pure ASCII name. Otherwise one line is
    printed, starting with 'R' (renamed from 'orig_encoding' to
    'target_encoding'), 'L' (left alone, already valid in
    'target_encoding') or 'U' (left alone, unknown encoding).

    If the recoded name already exists in 'root', the entry is not renamed
    and a warning is written to the standard error stream instead of an
    'R' line. Errors raised by os.rename() are not caught here.
    """
    guessed_encoding = guess_encoding(name, orig_encoding, target_encoding)
    if guessed_encoding == "ascii":
        # Nothing to convert and nothing to report
        return

    full_path = os.path.join(root, name)
    indicator = {"directory": "d",
                 "file": "f",
                 "symlink": "s"}[entry_type]

    if guessed_encoding == "orig":
        # Renaming
        newname = name.decode(orig_encoding).encode(target_encoding)
        new_path = os.path.join(root, newname)
        if os.path.lexists(new_path):
            # On Unix, os.rename() silently replaces an existing
            # destination. When both spellings of the name are present,
            # renaming would therefore destroy one of the two entries.
            sys.stderr.write("** Not renaming %s: %s already exists **\n"
                             % (full_path, new_path))
            return
        print("R%s %s" % (indicator, full_path))
        if not simulate:
            os.rename(full_path, new_path)
    elif guessed_encoding == "target":
        # Leaving as is. The name is shown converted back to the original
        # encoding ("replace" substitutes characters that encoding lacks).
        uname = name.decode(target_encoding)
        name_in_orig_encoding = uname.encode(orig_encoding, "replace")
        print("L%s %s" % (indicator,
                          os.path.join(root, name_in_orig_encoding)))
    else:
        assert guessed_encoding is None
        # Unknown encoding for 'name'
        print("U%s %s" % (indicator, full_path))


def handle_symlink(entry_type, root, name,
                   orig_encoding, target_encoding, simulate):
    """Recode the target of a symbolic link, then the name of the link.

    entry_type       -- "directory" or "file"; it selects the second
                        character ('D' or 'F') of the report line printed
                        for the link target
    root             -- path of the directory containing the link
    name             -- name of the link inside 'root' (byte string)
    orig_encoding    -- codec name of the encoding to convert from
    target_encoding  -- codec name of the encoding to convert to
    simulate         -- if true, report but do not modify anything

    A target that needs recoding is changed by removing the link and
    creating it again with the recoded target. The link name itself is
    then processed by handle_regular_file_or_dir() with the "symlink"
    entry type.
    """
    indicators = {"directory": "D",
                  "file": "F"}
    link_path = os.path.join(root, name)
    old_target = os.readlink(link_path)
    verdict = guess_encoding(old_target, orig_encoding, target_encoding)

    # Step 1: the target of the link (a pure ASCII target needs no action
    # and produces no output)
    if verdict != "ascii":
        if verdict == "orig":
            # The link has to be recreated to point to the recoded target
            as_unicode = old_target.decode(orig_encoding)
            recoded_target = as_unicode.encode(target_encoding)
            print("R%s %s -> %s" % (indicators[entry_type], link_path,
                                    old_target))
            if not simulate:
                os.unlink(link_path)
                os.symlink(recoded_target, link_path)
        elif verdict == "target":
            # Already converted: only report, showing the target converted
            # back to the original encoding
            as_unicode = old_target.decode(target_encoding)
            shown_target = as_unicode.encode(orig_encoding, "replace")
            print("L%s %s -> %s" % (indicators[entry_type], link_path,
                                    shown_target))
        else:
            assert verdict is None
            # No idea how the target is encoded: report and leave it alone
            print("U%s %s -> %s" % (indicators[entry_type], link_path,
                                    old_target))

    # Step 2: the name of the link, treated like any other entry name
    handle_regular_file_or_dir("symlink", root, name,
                               orig_encoding, target_encoding, simulate)

def handle_direntry(entry_type, root, name, orig_encoding, target_encoding,
                    simulate):
    """Process one directory entry with the handler matching its kind.

    Symbolic links are passed to handle_symlink(); every other entry is
    passed to handle_regular_file_or_dir(). All arguments are forwarded
    unchanged to the selected handler.
    """
    entry_path = os.path.join(root, name)

    handler = handle_regular_file_or_dir
    if os.path.islink(entry_path):
        handler = handle_symlink

    handler(entry_type, root, name, orig_encoding, target_encoding,
            simulate)


def os_walk_dumb_error_handling(exception):
    """Write the text of 'exception' and a newline to the standard error.

    Used as the 'onerror' callback of os.walk(): the error is reported and
    nothing else is done with it.
    """
    message = str(exception)
    sys.stderr.write(message + "\n")


def recode_directory_tree(directory, orig_encoding, target_encoding,
                          simulate):
    """Recode the names of all entries found under 'directory'.

    directory        -- path of the tree to process
    orig_encoding    -- codec name of the encoding to convert from
    target_encoding  -- codec name of the encoding to convert to
    simulate         -- if true, report but do not modify anything

    If 'directory' is not a directory, a message naming it is written to
    the standard error stream and nothing else is done. The os.stat() call
    is not protected, so a path that cannot be examined raises OSError.
    Errors met by os.walk() while scanning are only reported on the
    standard error stream.
    """
    mode = os.stat(directory)[stat.ST_MODE]
    if not stat.S_ISDIR(mode):
        sys.stderr.write("** Skipping %s, which is not a directory **\n"
                         % directory)
        return

    # Bottom-up walk: the contents of a directory are visited before the
    # directory that lists it, so a directory is only renamed once
    # everything below it has been handled.
    for root, dirs, files in os.walk(directory, topdown=False,
                                     onerror=os_walk_dumb_error_handling):
        for d in dirs:
            handle_direntry("directory", root, d, orig_encoding,
                            target_encoding, simulate)

        for f in files:
            handle_direntry("file", root, f, orig_encoding,
                            target_encoding, simulate)


def main():
    """Parse the command line and recode every DIR argument.

    The function always leaves through sys.exit():
      - status 0 after --help (usage printed on standard output) or after
        all directories have been processed;
      - with an error message (the usage text or a specific message) when
        the command line is invalid, --from or --to is missing, or an
        encoding name is not known to the 'codecs' module.

    An OS error raised while processing is reported on the standard error
    stream and then re-raised.
    """
    try:
        # Options processing
        opts, args = getopt.getopt(sys.argv[1:], "f:t:n",
                                   ["from=",
                                    "to=",
                                    "simulate",
                                    "help"])
    except getopt.GetoptError:
        sys.exit(usage)

    # Default values for options
    orig_encoding = None
    target_encoding = None
    simulate = False

    # Read command-line options
    for option, value in opts:
        if option == "--help":
            print(usage)
            sys.exit(0)
        elif option in ("-f", "--from"):
            orig_encoding = value
        elif option in ("-t", "--to"):
            target_encoding = value
        elif option in ("-n", "--simulate"):
            simulate = True
        else:
            sys.exit(usage)

    # The DIR arguments are checked only after the options, so that --help
    # given alone is honoured instead of being treated as a usage error.
    if not args:
        sys.exit(usage)

    if (orig_encoding is None) or (target_encoding is None):
        sys.exit("%s: both --from and --to must be provided" % progname)

    # Reject unknown encodings before touching any file
    for encoding in (orig_encoding, target_encoding):
        try:
            codecs.lookup(encoding)
        except LookupError:
            sys.exit("Encoding not recognized by Python: %s" % encoding)

    try:
        for directory in args:
            recode_directory_tree(directory, orig_encoding, target_encoding,
                                  simulate)
    except os.error:
        # sys.exc_info() gives the exception being handled without relying
        # on a version-specific syntax for binding it to a name.
        sys.stderr.write("OS error: %s\n" % str(sys.exc_info()[1]))
        raise

    sys.exit(0)

if __name__ == "__main__": main()
