#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# flo-expand-xml-entities --- Quick & dirty expansion of entities in an XML file
# Copyright (C) 2013 Florent Rougon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; version 2 dated June, 1991.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import sys, locale, os, argparse, re, textwrap, codecs, io
from textwrap import dedent
# Shared text wrapper used for the diagnostics printed by this program:
# 80 columns, never splitting a long word nor breaking at a hyphen.
tw = textwrap.TextWrapper(
    width=80,
    break_long_words=False,
    break_on_hyphens=False,
)

# Name under which the program was invoked; used as a prefix in messages.
progname = os.path.basename(sys.argv[0])
progversion = "0.2"

# Text printed after the program name and version by the --version option.
version_blurb = "\n".join((
    "Written by Florent Rougon.",
    "Copyright (c) 2013 Florent Rougon",
    "This is free software; see the source for copying conditions. There is NO",
    "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.",
))
# The spec was not checked, the official syntax may be different
entity_name_re = r"[a-zA-Z0-9_]+"

# The following seems easier than what we have in the actual regexp below, but
# it does not allow to capture the entity value *without* the quoting chars in
# a single regexp group:
#
#   (?P<value>"[^"]*"|'[^']*')
#
# Matches one entity declaration such as <!ENTITY name "value"> and exposes
# two groups: 'name' (the entity name) and 'value' (the replacement text,
# without the surrounding quotes).
entity_def_cre = re.compile(r"""
<!ENTITY
[ \t\n]+
(?P<name>{})
[ \t\n]+
(?P<quote_char>
  (?P<double_quote>")|')    # Remember what kind of quote we found.
(?P<value>
  # The allowed contents for the value depends on the kind of quote used
  # (I suppose that putting one asterisk at the end of each alternative inside
  # the group gives better performance than putting only one after the group).
  (?(double_quote)[^"]*|[^']*)
)
(?P=quote_char)
[ \t\n]*
>""".format(entity_name_re), re.VERBOSE | re.MULTILINE)

# Matches one entity reference such as &name; and exposes the group 'name'.
entity_ref_cre = re.compile(r"&(?P<name>{});".format(entity_name_re))
class EntityRepos:
    """Repository of entity definitions with methods to expand references.

    The definitions are kept in 'self.entities', a dictionary mapping each
    entity name to its replacement text. Most methods accept an optional
    dictionary 'd' to work on instead of 'self.entities'.
    """

    def __init__(self, entities=None):
        """Initialize the repository with a copy of 'entities', if any."""
        self.entities = dict(entities or {})

    def expandEntityDefs(self, d=None):
        """Fully expand, in place, the value of every entity defined in 'd'.

        After this call, the values in 'd' do not contain references to
        entities defined in 'd' anymore. A cycle among the definitions
        terminates the program (see expandEntities()).
        """
        if d is None:
            d = self.entities

        # Only the values of existing keys are replaced, so the set of keys
        # is not modified while the dictionary is being iterated over.
        for entity in d:
            d[entity] = self.expandEntities(d[entity], d, expanding={entity})

    def expandEntities(self, s, d=None, expanding=None):
        """Recursively expand all entities in string 's'.

        If 'expanding' is not None, it must be a set of entity names. It is
        used to detect cycles in the set of all entity definitions: if an
        entity whose name is in 'expanding' is encountered while reading and
        expanding 's', the function signals an error and aborts (by calling
        sys.exit()).

        References to entities that are not defined in 'd' are left as is.
        Return the expanded string.
        """
        if d is None:
            d = self.entities
        if expanding is None:
            expanding = frozenset()

        def expand_ref(mo):
            """Return the full expansion of the reference matched by 'mo'."""
            name = mo.group("name")

            if name in expanding:
                sys.exit("Found a cycle while expanding entity '{}'. Aborting."
                         .format(name))

            if name not in d:
                # Externally defined entity such as &lt; → leave it as is
                return mo.group(0)

            return self.expandEntities(d[name], d,
                                       expanding=expanding | {name})

        # The references of 's' are handled iteratively by sub(); recursion
        # only happens for nested definitions, so its depth does not grow
        # with the number of references found in 's'.
        return entity_ref_cre.sub(expand_ref, s)

    def readDtd(self, s, d=None):
        """Store in 'd' every entity definition found in string 's'.

        A later definition of a given name overrides an earlier one. Return
        the index in 's' of the end of the last definition found, or 0 if
        there is none.
        """
        if d is None:
            d = self.entities

        match_end = 0
        for mo in entity_def_cre.finditer(s):
            match_end = mo.end()
            d[mo.group("name")] = mo.group("value")

        return match_end

    def expandEntitiesWithoutRecursion(self, s, d=None):
        """Replace entity references in 's' with their values from 'd'.

        The inserted values are not scanned for further references, which is
        why the definitions should have been processed with
        expandEntityDefs() beforehand.

        Return a tuple (l, unknown_entities) where 'l' is a list of strings
        whose concatenation is the expanded text and 'unknown_entities' is
        the set of names referenced in 's' but not defined in 'd' (such
        references are copied unchanged).
        """
        if d is None:
            d = self.entities

        l = []
        unknown_entities = set()
        match_end = 0

        for mo in entity_ref_cre.finditer(s):
            # What precedes the match
            l.append(s[match_end:mo.start()])
            match_end = mo.end()
            name = mo.group("name")

            if name in d:
                l.append(d[name])
            else:
                unknown_entities.add(name)
                l.append(mo.group(0))

        # What follows the last match (the whole string if there was none)
        l.append(s[match_end:])

        return (l, unknown_entities)

    def processHybridString(self, s):
        """Expand entities after an internal DTD subset.

        Entity definitions are first read from 's' itself; the text up to the
        end of the last definition is copied unchanged and only what follows
        is subject to entity expansion. The return value has the same form
        as that of expandEntitiesWithoutRecursion().
        """
        # We must not pollute self.entities with the entities defined in the
        # internal DTD subset of 's', since self.entities is used to expand
        # entities in all files/strings.
        dict_ = self.entities.copy()
        end_of_dtd = self.readDtd(s, dict_)
        l = [s[0:end_of_dtd]]

        self.expandEntityDefs(dict_)
        l2, unknown_entities = \
            self.expandEntitiesWithoutRecursion(s[end_of_dtd:], dict_)
        l.extend(l2)

        return (l, unknown_entities)
class ExitFromCommandLineOrConfigFileParsing(Exception):
    """Exception raised to exit from command line and config file parsing.

    Attributes:
      message      text describing the problem, or None if there is nothing
                   to print
      file         stream the message is meant to be written to
      exit_status  exit status the program should terminate with
    """

    def __init__(self, message=None, file=sys.stderr, exit_status=2):
        # Initialize the base class too, so that 'args' is set even when
        # 'message' is passed as a keyword argument.
        super().__init__(message)
        self.message = message
        self.file = file
        self.exit_status = exit_status

    def __str__(self):
        # __str__() must return a string: returning a None message as is
        # would make str(exception) raise TypeError.
        return "" if self.message is None else str(self.message)

    def __repr__(self):
        template = ("{classname}(\n"
                    "message={msg!r},\n"
                    "file={file!r},\n"
                    "exit_status={status!r})")
        return template.format(classname=self.__class__.__name__,
                               file=self.file,
                               msg=self.message,
                               status=self.exit_status)
class DtdAct(argparse.Action):
    """argparse action recording the DTD files given on the command line.

    Each file name is appended to 'namespace.dtds' as a dictionary holding
    the name and the encoding found in 'namespace.currentEncoding' at that
    time.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        # parser.add_argument(..., nargs='*', ...) makes 'values' a list
        namespace.dtds.extend(
            {"name": fname, "encoding": namespace.currentEncoding}
            for fname in values)
class EncodingAct(argparse.Action):
    """argparse action validating and recording the current encoding.

    The value is checked with codecs.lookup(); if it is recognized, it is
    stored in 'namespace.currentEncoding'. Otherwise,
    ExitFromCommandLineOrConfigFileParsing is raised with an explanatory
    message and the namespace is left untouched.
    """

    def __call__(self, parser, namespace, value, option_string=None):
        try:
            codecs.lookup(value)
        except LookupError:
            # The message is formatted before being dedented; the common
            # indentation below is removed by dedent().
            msg = dedent("""\
                '{encoding}' is not a recognized encoding; valid encodings are
                those recognized by the Python 'codecs' module (cf.
                <https://docs.python.org/3/library/codecs.html#standard-encodings>)"""
                .format(encoding=value))
            raise ExitFromCommandLineOrConfigFileParsing(msg)

        namespace.currentEncoding = value
class InputFileAct(argparse.Action):
    """argparse action recording the input files given on the command line.

    Each file name is appended to 'namespace.ifiles' as a dictionary holding
    the name and the encoding found in 'namespace.currentEncoding' at that
    time.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        # parser.add_argument(..., nargs='*', ...) makes 'values' a list
        namespace.ifiles.extend(
            {"name": fname, "encoding": namespace.currentEncoding}
            for fname in values)
class OutputFileAct(argparse.Action):
    """argparse action recording the output files given on the command line.

    Each file name is appended to 'namespace.ofiles' as a dictionary holding
    the name and the encoding found in 'namespace.currentEncoding' at that
    time.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        # parser.add_argument(..., nargs='*', ...) makes 'values' a list
        namespace.ofiles.extend(
            {"name": fname, "encoding": namespace.currentEncoding}
            for fname in values)
def process_command_line():
    """Parse the command line and return the resulting argparse namespace.

    Besides the attributes set by argparse itself, the namespace has:
      dtds, ifiles, ofiles  lists of {"name": ..., "encoding": ...}
                            dictionaries filled by the custom actions, in
                            command line order
      currentEncoding       encoding set by the last --encoding option, or
                            None

    Raise ExitFromCommandLineOrConfigFileParsing if no input file was given,
    if the numbers of input and output files differ, or if stdin (resp.
    stdout) is requested for more than one stream.
    """
    usage_text = """\
%(prog)s [OPTION ...]
Expand XML-style entities in a set of files."""

    description_text = """\
Each input file IFILE is read in turn and written to the corresponding output
file OFILE after entity expansion. Entity definitions are read from the files
specified by the --dtd option(s).
In "hybrid" mode, each IFILE may also have an internal DTD subset that is used
when expanding its own contents (after the internal DTD subset), in addition to
the entity definitions gathered from the files specified by --dtd options.
Since every IFILE's expansion is written to the corresponding OFILE, there
must be as many IFILE arguments as there are OFILEs, and their relative order
is important.
The parser is based on regular expressions and knows almost nothing about
XML syntax. It will happily expand entities inside XML comments, for instance.
The advantage is that it can therefore be used to expand entities in files
with a looser syntax, such as HTML.
As --dtd, --input-file and --output-file, the --encoding (-e) option may be
given several times; every time it is encountered, it defines the encoding
used for reading and writing the files (streams) specified by the following
command line arguments, until the next --encoding option.
Wherever a file name is expected, '-' can be used to indicate the standard
input or standard output stream (depending on whether the option specifies an
input or output stream).
In general, there are two ways of specifying file names that look like
options:
- for options such as --dtd, use '--dtd=-e' instead of '--dtd -e' (example
for a file named '-e'); as many such options as necessary can be used on
the same command line;
- for positional arguments, use a single '--' argument (without any quotes)
to mark the end of options and the start of positional arguments (since
this program does not accept any positional argument, this second method
is not applicable here)."""

    mode_help = """\
allowed values: %(choices)s. In hybrid mode, entities in an input file
are only expanded after gathering entity definitions from an internal
DTD subset (i.e., at the beginning of the same input file) [default:
%(default)s]"""

    encoding_help = """\
encoding to use for reading and writing files specified by later
command-line arguments; also applies to stdin and stdout when they are
used as input or output "files" (when '-' is specified as a file name)
[default: use the Python defaults from the 'io' module, which on most
platforms are derived from the current locale]"""

    # The custom actions append to these lists and read/write
    # 'currentEncoding', so they must exist before parsing starts.
    namespace = argparse.Namespace()
    namespace.currentEncoding = None
    namespace.dtds = []
    namespace.ifiles = []
    namespace.ofiles = []

    parser = argparse.ArgumentParser(
        usage=usage_text,
        description=description_text,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # I want --help but not -h (it might be useful for something else)
        add_help=False)

    parser.add_argument('-d', '--dtd', action=DtdAct, metavar="DTD",
                        nargs='*',
                        help="read entity definitions from file %(metavar)s")
    parser.add_argument('--mode', choices=("simple", "hybrid"),
                        default="simple", metavar="MODE", help=mode_help)
    parser.add_argument('-e', '--encoding', action=EncodingAct,
                        help=encoding_help)
    parser.add_argument('-i', '--input-file', metavar="IFILE", nargs='*',
                        action=InputFileAct, help="input file")
    parser.add_argument('-o', '--output-file', metavar="OFILE", nargs='*',
                        action=OutputFileAct, help="output file")
    parser.add_argument('--help', action="help",
                        help="display this message and exit")
    # The version text is not wrapped when using
    # formatter_class=argparse.RawDescriptionHelpFormatter
    parser.add_argument('--version', action='version',
                        version="{name} {version}\n{blurb}".format(
                            name=progname, version=progversion,
                            blurb=version_blurb))

    params = parser.parse_args(namespace=namespace)

    if not params.ifiles:
        raise ExitFromCommandLineOrConfigFileParsing(
            "at least one input file must be specified")

    if len(params.ifiles) != len(params.ofiles):
        raise ExitFromCommandLineOrConfigFileParsing(
            "you must specify as many input files as output files")

    # Each standard stream can only be consumed/produced once.
    stdin_bindings = sum(1 for f in params.dtds + params.ifiles
                         if f["name"] == "-")
    stdout_bindings = sum(1 for f in params.ofiles if f["name"] == "-")

    if stdin_bindings > 1:
        msg = dedent("""\
            stdin can be used for at most 1 input stream (DTD or IFILE),
            but the command line implies {} references to stdin""".format(
                stdin_bindings))
        raise ExitFromCommandLineOrConfigFileParsing(message=msg)

    if stdout_bindings > 1:
        msg = dedent("""\
            stdout can be used for at most 1 output stream (OFILE),
            but the command line implies {} references to stdout""".format(
                stdout_bindings))
        raise ExitFromCommandLineOrConfigFileParsing(message=msg)

    return params
def openStream(name, direction, encoding=None, errors=None, newline=None):
    """Return a text stream for 'name', opened according to 'direction'.

    'direction' must be "input" or "output". If 'name' is "-", the standard
    input (resp. output) stream is rewrapped with the given 'encoding',
    'errors' and 'newline' settings, installed as sys.stdin (resp.
    sys.stdout) and returned. Otherwise, the file called 'name' is opened in
    text mode for reading (resp. writing) with the same settings.
    """
    assert direction in ("input", "output"), direction
    reading = (direction == "input")

    if name != "-":
        # Keyword arguments useful to keep the default value or the
        # 'buffering' argument that comes after 'mode'
        return open(name, "rt" if reading else "wt", encoding=encoding,
                    errors=errors, newline=newline)

    # Quoting the documentation of the 'io' module:
    #   Argument names are not part of the specification, and only the
    #   arguments of open() are intended to be used as keyword arguments.
    #
    # There is also codecs.getreader/writer(), but io.TextIOWrapper seems
    # more modern and identical to what the built-in function open()
    # creates.
    if reading:
        stream = io.TextIOWrapper(sys.stdin.detach(), encoding, errors,
                                  newline)
        sys.stdin = stream
    else:
        stream = io.TextIOWrapper(sys.stdout.detach(), encoding, errors,
                                  newline)
        sys.stdout = stream

    return stream
def main():
    """Run the program: read the DTDs, then expand every input file.

    Entity definitions are gathered from all files given with --dtd, then
    each input file is expanded (according to the selected mode) and written
    to the corresponding output file. Progress messages and the names of
    unknown entities are printed on stderr. The function always terminates
    the program with sys.exit().
    """
    locale.setlocale(locale.LC_ALL, '')

    try:
        params = process_command_line()
    except ExitFromCommandLineOrConfigFileParsing as e:
        if e.message is not None:
            print(tw.fill("{}: {}".format(progname, e)), file=e.file)
        sys.exit(e.exit_status)

    entityRepos = EntityRepos()
    for dtd in params.dtds:
        with openStream(dtd["name"], "input", dtd["encoding"]) as f:
            entityRepos.readDtd(f.read())

    # This is necessary to be able to expand entities with a non-recursive
    # method (which is the only reasonable alternative without any particular
    # knowledge of the text to expand).
    entityRepos.expandEntityDefs()

    expansionFunc = {"simple": entityRepos.expandEntitiesWithoutRecursion,
                     "hybrid": entityRepos.processHybridString}[params.mode]

    for i, (ifile, ofile) in enumerate(zip(params.ifiles, params.ofiles)):
        if i > 0:
            # Blank line between the reports of two consecutive files
            print('', file=sys.stderr)

        iname = "<stdin>" if ifile["name"] == "-" else ifile["name"]
        print("Expanding entities in '{}'...".format(iname), end=' ',
              file=sys.stderr)

        with openStream(ifile["name"], "input", ifile["encoding"]) as f:
            expansion, unknown_entities = expansionFunc(f.read())

        print("done.", file=sys.stderr)

        if unknown_entities:
            # Sorted so that the report does not depend on set ordering
            print(tw.fill("Unknown entities found during this expansion: {}."
                          .format(', '.join(sorted(unknown_entities)))),
                  file=sys.stderr)

        with openStream(ofile["name"], "output", ofile["encoding"]) as f:
            f.writelines(expansion)

    sys.exit(0)


if __name__ == "__main__":
    main()