#! /usr/bin/env python

# get-rid-of-cp850 --- Help converting files containing chars encoded
#                      with the cp850 "code page"
# Copyright (c) 2009 Florent Rougon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; see the file COPYING. If not, write to the
# Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
# Boston, MA  02110-1301 USA.

import sys, os, locale, re, stat, getopt, codecs, pyaspell
import flo_small_funcs

progname = os.path.basename(sys.argv[0])
preferred_encoding = None

usage = """Usage: %s [OPTION...] FILE_OR_DIR...
Help with conversion from mixed cp850 + UTF-8 to something sane.

Options:

  -f, --from=ORIG-ENCODING    original encoding of the files
                              (default: cp850)
  -t, --to=TARGET-ENCODING    target encoding for the files
                              (default: iso-8859-15)
  -g, --global-summary        print, once all files have been processed,
                              a summary of every suspicious word found
  -r, --recursive             recursively process directory arguments
  -n, --simulate              don't do any renaming, only print what
                              would otherwise be done
  -0                          use null characters as separators when
                              reading arguments from the standard input
  -1                          count file offsets starting from 1
                              instead of 0
      --file-offsets=N        count file offsets starting from N
                              (default: 0)
      --help                  display usage information and exit""" % progname
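
# Typical invocations (illustration; when no argument is given, file
# names are read from the standard input):
#
#   get-rid-of-cp850 -r -g some/directory
#   find . -type f -print0 | get-rid-of-cp850 -0 -g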

# Global parameters
params = {}
first_file = True
# Maps each suspicious word to a list of {"file": ..., "offsets": [...]}
# entries, one entry per file containing the word
global_summary = {}
# Spell checker used to filter out correctly-spelled words (hardcoded
# French dictionary for now)
spell_checker = pyaspell.AspellLinux([("master", "fr_FR-80"),
                                      ("encoding", "utf-8")])

# Tuple (a, b) corresponds to "any char c such that a <= c < b". The range
# therefore contains b - a chars.
# chars_to_translate = [ (128, 160), (182, 185), (210, 213), 245, 248, 253 ]
chars_to_translate = [ (128, 256) ]
# chars_to_translate = [ (128, 160) ]
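
# With the default range above, every non-ASCII byte is considered
# suspicious, e.g. '\x82' (e with acute accent in cp850).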


def is_translatable_char(c):
    """Determine if character C is part of 'chars_to_translate'."""
    code_point = ord(c)

    for elem in chars_to_translate:
        if isinstance(elem, tuple):
            if elem[0] <= code_point < elem[1]:
                return True
        else:
            assert isinstance(elem, int), elem
            if code_point == elem:
                return True

    return False
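
# Doctest-style illustration, assuming the default
# chars_to_translate = [ (128, 256) ]:
#
#   >>> is_translatable_char('\x82')   # 0x82 == 130, inside [128, 256)
#   True
#   >>> is_translatable_char('e')      # 0x65 == 101, plain ASCII
#   False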


def lencode(s):
    """Encode S with the locale's preferred encoding if it is unicode."""
    if isinstance(s, unicode):
        return s.encode(preferred_encoding)
    else:
        assert isinstance(s, basestring), s
        return s

def lprint(s, LF=True):
    """Print S in the locale's preferred encoding, with an optional newline."""
    if LF:
        print lencode(s)
    else:
        print lencode(s),


# Code taken from tdbsf/core/tdbsf_api.py
def read_emacs_style_file_variables(filename, s):
    """Read an optional Emacs-style line declaring file variables.

    The line should have the syntax:

      -*- VAR1: VALUE1; VAR2: VALUE2; ... -*-

    (the last semicolon in the declaration is optional)

    For now, only the "coding" pseudo-variable is used here. The
    associated value should be a valid encoding name for Python (and also
    for Emacs if you use it to open the files). Other variables are
    ignored.

    Valid encoding names include "utf-8", "iso-8859-1" and "iso-8859-15".

    The declaration is expected on the first line of the file, though this
    is not enforced for now.

    S -- string to parse
    FILENAME -- file from which S was read (for error messages)

    """
    emacs_style_file_vars_cre = re.compile(
        r"(.* )?-\*- (?P<assignments>.*) -\*-")
    assignment_cre = re.compile(r"^ *(?P<name>[^ \t:]+): (?P<value>[^ \t]+) *$")

    res = {}

    # str.find() returns -1 if there is no newline (str.index() would
    # raise ValueError)
    first_newline = s.find('\n')
    if first_newline == -1:
        return res

    line = s[0:first_newline]
    mo = emacs_style_file_vars_cre.match(line)

    if mo is not None:
        assignments = mo.group("assignments").split(";")
        if not assignments[-1]:
            assignments = assignments[:-1]

        for a in assignments:
            assignment_mo = assignment_cre.match(a)
            if assignment_mo is None:
                raise ValueError(
                    "'%s': invalid assignment in Emacs-style file variables "
                    "line: '%s'" % (filename, unicode(a, errors='replace')))

            name = assignment_mo.group("name")
            if name == "coding":
                res[name] = assignment_mo.group("value")

    return res
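
# Doctest-style illustration of the above (hypothetical file name):
#
#   >>> read_emacs_style_file_variables(
#   ...     "foo.txt", "# -*- coding: utf-8 -*-\nsome text\n")
#   {'coding': 'utf-8'}
#   >>> read_emacs_style_file_variables("foo.txt", "no declaration\n")
#   {}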


def handle_file(fname, orig_encoding, target_encoding, simulate):
    global first_file, params, global_summary

    # Binary mode: we report byte offsets, so newline translation must not
    # alter them
    f = open(fname, "rb")

    if False:  # disabled for now
        first_line = f.readline()
        file_variables = read_emacs_style_file_variables(fname, first_line)
        # Don't process files with a proper encoding declaration
        if file_variables.has_key("coding"):
            return
        else:
            f.seek(0, 0)

    current_word = []
    current_word_has_translatable_chars = False
    # Offsets are counted from the beginning of the file, starting at
    # params["file offsets"] (cf. the -1 and --file-offsets options)
    curr_offset = params["file offsets"]
    # Positions of the translatable chars of the current word
    offsets = []
    # Maps each suspicious word found in this file to the offsets of its
    # translatable chars
    d = {}

    while True:
        c = f.read(1)
        # At EOF, c is '' and '' is "in" any string, so the
        # word-termination branch below also handles end of file
        if c in " \r\n\t\a\b\f\v()[]{}&\"'#`,?;.:/!%=":
            if current_word_has_translatable_chars:
                word = ''.join(current_word)

                # Only report words unknown to the spell checker
                if not spell_checker.check(word):
                    if d.has_key(word):
                        d[word].extend(offsets)
                    else:
                        d[word] = offsets

            if not c:
                break

            current_word = []
            current_word_has_translatable_chars = False
            offsets = []
        else:
            if is_translatable_char(c):
                current_word_has_translatable_chars = True
                offsets.append(curr_offset)
            #     uc = c.decode(orig_encoding)
            # else:
            #     uc = c.decode(target_encoding)

            current_word.append(c)

        curr_offset += 1

    f.close()

    for word, offsets in d.iteritems():
        this_file_entry = {"file": fname,
                           "offsets": offsets}
        if global_summary.has_key(word):
            global_summary[word].append(this_file_entry)
        else:
            global_summary[word] = [ this_file_entry ]

    # Don't print anything if no "interesting" word was found
    if d:
        if first_file:
            first_file = False
        else:
            # Newline to separate from the previous file info
            sys.stdout.write('\n')

        sys.stdout.write(flo_small_funcs.frame(fname) + '\n')

        sorted_words = d.keys()
        # sorted_words.sort(cmp=locale.strcoll)
        sorted_words.sort()

        for word in sorted_words:
            # The word may contain cp850 bytes that are not valid UTF-8;
            # don't crash on them
            lprint("%s: %s" % (word.decode("utf-8", "replace"),
                               ", ".join(map(str, d[word]))))


def os_walk_dumb_error_handling(exception):
    sys.stderr.write("%s\n" % str(exception))


def handle_direntry(direntry, orig_encoding, target_encoding, recursive,
                    simulate):
    mode = os.lstat(direntry)[stat.ST_MODE]

    if stat.S_ISDIR(mode) and recursive:
        for root, dirs, files in os.walk(direntry,
                                         onerror=os_walk_dumb_error_handling):
            for f in files:
                # Not handle_file, because f could be a symbolic link and
                # would be followed by handle_file
                handle_direntry(os.path.join(root, f),
                                orig_encoding, target_encoding, recursive,
                                simulate)
    elif stat.S_ISREG(mode):
        handle_file(direntry, orig_encoding, target_encoding, simulate)
    elif stat.S_ISLNK(mode):
        # Symbolic links are not followed, only reported
        target = os.readlink(direntry)
        print " s %s -> %s" % (direntry, target)
    else:
        # Neither a regular file nor a symlink (this includes directories
        # when --recursive was not given)
        print " S %s" % (direntry,)


def split_stream(inp, separator):
    """Generate the chunks of stream INP delimited by SEPARATOR (one char)."""
    l = []

    while True:
        c = inp.read(1)
        if not c:
            break
        elif c == separator:
            yield ''.join(l)
            l = []
        else:
            l.append(c)

    if len(l) > 0:
        yield ''.join(l)
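
# Doctest-style illustration: splitting a NUL-separated stream, as
# produced for instance by "find . -print0":
#
#   >>> import StringIO
#   >>> list(split_stream(StringIO.StringIO("a\x00b\x00"), "\x00"))
#   ['a', 'b']
#
# A final chunk lacking a trailing separator is yielded as well.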


def main():
    global preferred_encoding

    try:
        # Options processing
        opts, args = getopt.getopt(sys.argv[1:], "f:t:gnr01",
                                   ["from=",
                                    "to=",
                                    "global-summary",
                                    "recursive",
                                    "simulate",
                                    "file-offsets=",
                                    "help"])
    except getopt.GetoptError, message:
        sys.exit("%s: %s\n%s" % (progname, message, usage))

    locale.setlocale(locale.LC_ALL, '')

    preferred_encoding = locale.getpreferredencoding()

    # Default values for options
    orig_encoding = "cp850"
    target_encoding = "iso-8859-15"
    recursive = False
    separator = '\n'
    simulate = False
    params["file offsets"] = 0
    params["global summary"] = False

    # Read command-line options
    for option, value in opts:
        if option == "--help":
            print usage
            sys.exit(0)
        elif option in ("-0",):
            separator = '\x00'
        elif option in ("-1",):
            params["file offsets"] = 1
        elif option in ("-f", "--from"):
            orig_encoding = value
        elif option in ("-t", "--to"):
            target_encoding = value
        elif option in ("-r", "--recursive"):
            recursive = True
        elif option in ("-n", "--simulate"):
            simulate = True
        elif option in ("-g", "global-summary"):
            params["global summary"] = True
        elif option in ("--file-offsets",):
            params["file offsets"] = int(value)
        else:
            sys.exit(usage)

    if (orig_encoding is None) or (target_encoding is None):
        sys.exit("%s: both --from and --to must be provided" % progname)

    for encoding in (orig_encoding, target_encoding):
        try:
            codecs.lookup(encoding)
        except LookupError:
            sys.exit("Encoding not recognized by Python: %s" % encoding)

    if len(args) == 0:
        stuff_to_inspect = split_stream(sys.stdin, separator)
    else:
        stuff_to_inspect = args

    for file_or_dir in stuff_to_inspect:
        handle_direntry(file_or_dir, orig_encoding, target_encoding,
                        recursive, simulate)

    spell_checker.close()

    if params["global summary"]:
        lprint("\n\n%s" % flo_small_funcs.frame("Global summary"))

        sorted_words = global_summary.keys()
        sorted_words.sort(cmp=locale.strcoll)

        for word in sorted_words:
            # byte_seq = word.encode("cp850")

            # try:
            #     utf8_interpretation = byte_seq.decode("utf-8")
            # except UnicodeDecodeError, e:
            #     utf8_interpretation = "%s: %s" % (e.reason, repr(e.object))

            lprint("%s (%s)" % (word.decode("utf-8"), repr(word)))
            for entry in global_summary[word]:
                print "  %s: %s" % (entry["file"],
                                    ", ".join(map(str, entry["offsets"])))

    sys.exit(0)

if __name__ == "__main__": main()
