#! /usr/bin/env python

# get-rid-of-cp850 --- Help converting files containing chars encoded
#                      with the cp850 "code page"
# Copyright (c) 2009 Florent Rougon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to the
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA  02110-1301 USA.

import sys, os, locale, re, stat, getopt, codecs, pyaspell
import flo_small_funcs

progname = os.path.basename(sys.argv[0])
preferred_encoding = None

usage = """Usage: %s [OPTION...] FILE_OR_DIR...
Help with conversion from mixed cp850 + UTF-8 to something sane.

Options:

  -f, --from=ORIG-ENCODING    original encoding of the files
                              (default: cp850)
  -t, --to=TARGET-ENCODING    target encoding for the files
                              (default: iso-8859-15)
  -g, --global-summary        print, once all files have been processed,
                              a summary of every suspicious word found
  -r, --recursive             recursively process directory arguments
  -n, --simulate              don't do any renaming, only print what
                              would otherwise be done
  -0                          use null characters as separators when
                              reading arguments from the standard input
  -1                          count file offsets starting from 1
                              instead of 0
      --file-offsets=N        count file offsets starting from N
                              (default: 0)
      --help                  display usage information and exit""" % progname
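
# Typical invocations (illustration; when no argument is given, file
# names are read from the standard input):
#
#   get-rid-of-cp850 -r -g some/directory
#   find . -type f -print0 | get-rid-of-cp850 -0 -g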

# Global parameters
params = {}
first_file = True
# Maps each suspicious word to a list of {"file": ..., "offsets": [...]}
# entries, one entry per file containing the word
global_summary = {}
# Spell checker used to filter out correctly-spelled words (hardcoded
# French dictionary for now)
spell_checker = pyaspell.AspellLinux([("master", "fr_FR-80"),
                                      ("encoding", "utf-8")])

# Tuple (a, b) corresponds to "any char c such that a <= c < b". The range
# therefore contains b - a chars.
# chars_to_translate = [ (128, 160), (182, 185), (210, 213), 245, 248, 253 ]
chars_to_translate = [ (128, 256) ]
# chars_to_translate = [ (128, 160) ]
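
# With the default range above, every non-ASCII byte is considered
# suspicious, e.g. '\x82' (e with acute accent in cp850).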


def is_translatable_char(c):
    """Determine if character C is part of 'chars_to_translate'."""
    code_point = ord(c)

    for elem in chars_to_translate:
        if isinstance(elem, tuple):
            if elem[0] <= code_point < elem[1]:
                return True
        else:
            assert isinstance(elem, int), elem
            if code_point == elem:
                return True

    return False
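
# Doctest-style illustration, assuming the default
# chars_to_translate = [ (128, 256) ]:
#
#   >>> is_translatable_char('\x82')   # 0x82 == 130, inside [128, 256)
#   True
#   >>> is_translatable_char('e')      # 0x65 == 101, plain ASCII
#   False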


def lencode(s):
    """Encode S with the locale's preferred encoding if it is unicode."""
    if isinstance(s, unicode):
        return s.encode(preferred_encoding)
    else:
        assert isinstance(s, basestring), s
        return s

def lprint(s, LF=True):
    """Print S in the locale's preferred encoding, with an optional newline."""
    if LF:
        print lencode(s)
    else:
        print lencode(s),


# Code taken from tdbsf/core/tdbsf_api.py
def read_emacs_style_file_variables(filename, s):
    """Read an optional Emacs-style line declaring file variables.

    The line should have the syntax:

      -*- VAR1: VALUE1; VAR2: VALUE2; ... -*-

    (the last semicolon in the declaration is optional)

    For now, only the "coding" pseudo-variable is used here. The
    associated value should be a valid encoding name for Python (and also
    for Emacs if you use it to open the files). Other variables are
    ignored.

    Valid encoding names include "utf-8", "iso-8859-1" and "iso-8859-15".

    The declaration is expected on the first line of the file, though this
    is not enforced for now.

    S -- string to parse
    FILENAME -- file from which S was read (for error messages)

    """
    emacs_style_file_vars_cre = re.compile(
        r"(.* )?-\*- (?P<assignments>.*) -\*-")
    assignment_cre = re.compile(r"^ *(?P<name>[^ \t:]+): (?P<value>[^ \t]+) *$")

    res = {}

    # str.find() returns -1 if there is no newline (str.index() would
    # raise ValueError)
    first_newline = s.find('\n')
    if first_newline == -1:
        return res

    line = s[0:first_newline]
    mo = emacs_style_file_vars_cre.match(line)

    if mo is not None:
        assignments = mo.group("assignments").split(";")
        if not assignments[-1]:
            assignments = assignments[:-1]

        for a in assignments:
            assignment_mo = assignment_cre.match(a)
            if assignment_mo is None:
                raise ValueError(
                    "'%s': invalid assignment in Emacs-style file variables "
                    "line: '%s'" % (filename, unicode(a, errors='replace')))

            name = assignment_mo.group("name")
            if name == "coding":
                res[name] = assignment_mo.group("value")

    return res
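
# Doctest-style illustration of the above (hypothetical file name):
#
#   >>> read_emacs_style_file_variables(
#   ...     "foo.txt", "# -*- coding: utf-8 -*-\nsome text\n")
#   {'coding': 'utf-8'}
#   >>> read_emacs_style_file_variables("foo.txt", "no declaration\n")
#   {}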


def handle_file(fname, orig_encoding, target_encoding, simulate):
    global first_file, params, global_summary

    # Binary mode: we report byte offsets, so newline translation must not
    # alter them
    f = open(fname, "rb")

    if False:  # disabled for now
        first_line = f.readline()
        file_variables = read_emacs_style_file_variables(fname, first_line)
        # Don't process files with a proper encoding declaration
        if file_variables.has_key("coding"):
            return
        else:
            f.seek(0, 0)

    current_word = []
    current_word_has_translatable_chars = False
    # Offsets are counted from the beginning of the file, starting at
    # params["file offsets"] (cf. the -1 and --file-offsets options)
    curr_offset = params["file offsets"]
    # Positions of the translatable chars of the current word
    offsets = []
    # Maps each suspicious word found in this file to the offsets of its
    # translatable chars
    d = {}

    while True:
        c = f.read(1)
        # At EOF, c is '' and '' is "in" any string, so the
        # word-termination branch below also handles end of file
        if c in " \r\n\t\a\b\f\v()[]{}&\"'#`,?;.:/!%=":
            if current_word_has_translatable_chars:
                word = ''.join(current_word)

                # Only report words unknown to the spell checker
                if not spell_checker.check(word):
                    if d.has_key(word):
                        d[word].extend(offsets)
                    else:
                        d[word] = offsets

            if not c:
                break

            current_word = []
            current_word_has_translatable_chars = False
            offsets = []
        else:
            if is_translatable_char(c):
                current_word_has_translatable_chars = True
                offsets.append(curr_offset)
            #     uc = c.decode(orig_encoding)
            # else:
            #     uc = c.decode(target_encoding)

            current_word.append(c)

        curr_offset += 1

    f.close()

    for word, offsets in d.iteritems():
        this_file_entry = {"file": fname,
                           "offsets": offsets}
        if global_summary.has_key(word):
            global_summary[word].append(this_file_entry)
        else:
            global_summary[word] = [ this_file_entry ]

    # Don't print anything if no "interesting" word was found
    if d:
        if first_file:
            first_file = False
        else:
            # Newline to separate from the previous file info
            sys.stdout.write('\n')

        sys.stdout.write(flo_small_funcs.frame(fname) + '\n')

        sorted_words = d.keys()
        # sorted_words.sort(cmp=locale.strcoll)
        sorted_words.sort()

        for word in sorted_words:
            # The word may contain cp850 bytes that are not valid UTF-8;
            # don't crash on them
            lprint("%s: %s" % (word.decode("utf-8", "replace"),
                               ", ".join(map(str, d[word]))))


def os_walk_dumb_error_handling(exception):
    sys.stderr.write("%s\n" % str(exception))


def handle_direntry(direntry, orig_encoding, target_encoding, recursive,
                    simulate):
    mode = os.lstat(direntry)[stat.ST_MODE]

    if stat.S_ISDIR(mode) and recursive:
        for root, dirs, files in os.walk(direntry,
                                         onerror=os_walk_dumb_error_handling):
            for f in files:
                # Not handle_file, because f could be a symbolic link and
                # would be followed by handle_file
                handle_direntry(os.path.join(root, f),
                                orig_encoding, target_encoding, recursive,
                                simulate)
    elif stat.S_ISREG(mode):
        handle_file(direntry, orig_encoding, target_encoding, simulate)
    elif stat.S_ISLNK(mode):
        # Symbolic links are not followed, only reported
        target = os.readlink(direntry)
        print " s %s -> %s" % (direntry, target)
    else:
        # Neither a regular file nor a symlink (this includes directories
        # when --recursive was not given)
        print " S %s" % (direntry,)


def split_stream(inp, separator):
    """Generate the chunks of stream INP delimited by SEPARATOR (one char)."""
    l = []

    while True:
        c = inp.read(1)
        if not c:
            break
        elif c == separator:
            yield ''.join(l)
            l = []
        else:
            l.append(c)

    if len(l) > 0:
        yield ''.join(l)
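
# Doctest-style illustration: splitting a NUL-separated stream, as
# produced for instance by "find . -print0":
#
#   >>> import StringIO
#   >>> list(split_stream(StringIO.StringIO("a\x00b\x00"), "\x00"))
#   ['a', 'b']
#
# A final chunk lacking a trailing separator is yielded as well.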


def main():
    global preferred_encoding

    try:
        # Options processing
        opts, args = getopt.getopt(sys.argv[1:], "f:t:gnr01",
                                   ["from=",
                                    "to=",
                                    "global-summary",
                                    "recursive",
                                    "simulate",
                                    "file-offsets=",
                                    "help"])
    except getopt.GetoptError, message:
        sys.exit("%s: %s\n%s" % (progname, message, usage))

    locale.setlocale(locale.LC_ALL, '')

    preferred_encoding = locale.getpreferredencoding()

    # Default values for options
    orig_encoding = "cp850"
    target_encoding = "iso-8859-15"
    recursive = False
    separator = '\n'
    simulate = False
    params["file offsets"] = 0
    params["global summary"] = False

    # Read command-line options
    for option, value in opts:
        if option == "--help":
            print usage
            sys.exit(0)
        elif option in ("-0",):
            separator = '\x00'
        elif option in ("-1",):
            params["file offsets"] = 1
        elif option in ("-f", "--from"):
            orig_encoding = value
        elif option in ("-t", "--to"):
            target_encoding = value
        elif option in ("-r", "--recursive"):
            recursive = True
        elif option in ("-n", "--simulate"):
            simulate = True
        elif option in ("-g", "global-summary"):
            params["global summary"] = True
        elif option in ("--file-offsets",):
            params["file offsets"] = int(value)
        else:
            sys.exit(usage)

    if (orig_encoding is None) or (target_encoding is None):
        sys.exit("%s: both --from and --to must be provided" % progname)

    for encoding in (orig_encoding, target_encoding):
        try:
            codecs.lookup(encoding)
        except LookupError:
            sys.exit("Encoding not recognized by Python: %s" % encoding)

    if len(args) == 0:
        stuff_to_inspect = split_stream(sys.stdin, separator)
    else:
        stuff_to_inspect = args

    for file_or_dir in stuff_to_inspect:
        handle_direntry(file_or_dir, orig_encoding, target_encoding,
                        recursive, simulate)

    spell_checker.close()

    if params["global summary"]:
        lprint("\n\n%s" % flo_small_funcs.frame("Global summary"))

        sorted_words = global_summary.keys()
        sorted_words.sort(cmp=locale.strcoll)

        for word in sorted_words:
            # byte_seq = word.encode("cp850")

            # try:
            #     utf8_interpretation = byte_seq.decode("utf-8")
            # except UnicodeDecodeError, e:
            #     utf8_interpretation = "%s: %s" % (e.reason, repr(e.object))

            lprint("%s (%s)" % (word.decode("utf-8"), repr(word)))
            for entry in global_summary[word]:
                print "  %s: %s" % (entry["file"],
                                    ", ".join(map(str, entry["offsets"])))

    sys.exit(0)

if __name__ == "__main__": main()
