Source code for condconfigparser.lexer

# -*- coding: utf-8 -*-

# lexer.py --- Lexer module of CondConfigParser
#
# Copyright (c) 2014, Florent Rougon
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of the CondConfigParser Project.

"""Lexer module of CondConfigParser.

This module defines a :class:`Token` class, one subclass for every token
type, a :class:`TokenType` :class:`enum.Enum` and a :class:`Lexer`
class.

"""
import re
import collections
import enum

from .exceptions import ParseError


# Taken from the enum documentation
class AutoNumber(enum.Enum):
    def __new__(cls):
        value = len(cls.__members__) + 1
        obj = object.__new__(cls)
        obj._value_ = value
        return obj

[docs]@enum.unique # enum.IntEnum could be more appropriate if one wanted to use actual parse # tables typical of LL(k) grammars. class TokenType(AutoNumber): """Identifier objects for token types.""" newline = () varAssignmentsStart = () varAssignmentsEnd = () predicateStart = () predicateEnd = () listStart = () listEnd = () openParen = () closeParen = () orOp = () andOp = () notOp = () equalsOp = () notEqualsOp = () inOp = () assignOp = () comma = () true = () false = () stringLiteral = () variable = () rawConfigLine = ()
[docs]class Token: """Class representing a token instance (lexeme). :attr:`Token.startline` and :attr:`Token.startcol` are 1-based line and column numbers for the first character of the lexeme (i.e., where the token starts in the parsed stream). Assuming it fits on one line of the input stream (which is not necessarily the case for string literal tokens), the column number of its last character is given by: :attr:`Token.startcol` + len(:attr:`Token.string`) - 1. Instances of this class must be considered immutable. They can be used as dictionary keys and set elements, among others. Since these operations assume that the hash value of the object never changes, it is an error to modify such an object after an operation that relies on its hash value (cf. :meth:`object.__hash__`). """ #: Token type (:class:`TokenType` instance) type = None #: The raw lexeme, *i.e.*, the exact string read from the parsed file string = None #: String used for :meth:`~object.__str__`, in particular for the #: :class:`NewlineToken` stringRepr = None def __init__(self, startline, startcol): for n in ("startline", "startcol"): setattr(self, n, locals()[n]) # Goes together with __hash__() def __eq__(self, other): return self.type == other.type and \ self.string == other.string and \ self.stringRepr == other.stringRepr and \ self.startline == other.startline and \ self.startcol == other.startcol def __ne__(self, other): return not self.__eq__(other) # Goes together with __eq__() def __hash__(self): return (self.type.value ^ hash(self.string) ^ hash(self.stringRepr) ^ self.startline ^ self.startcol) def __str__(self): return '<{} "{}" {}:{}>'.format(self.type.name, self.stringRepr, self.startline, self.startcol) def __repr__(self): return "{}.{}({!r}, {!r})".format( __name__, type(self).__name__, self.startline, self.startcol) # Useful when creating an exception related to a token
[docs] def pos(self): return (self.startline, self.startcol)
[docs] def formatStartPos(self): return "{}:{}".format(self.startline, self.startcol)
[docs]class NewlineToken(Token): type = TokenType.newline string = "\n" stringRepr = r"\n"
[docs]class VarAssignmentsStartToken(Token): type = TokenType.varAssignmentsStart string = stringRepr = "{"
[docs]class VarAssignmentsEndToken(Token): type = TokenType.varAssignmentsEnd string = stringRepr = "}"
[docs]class PredicateStartToken(Token): type = TokenType.predicateStart string = stringRepr = "["
[docs]class PredicateEndToken(Token): type = TokenType.predicateEnd string = stringRepr = "]"
[docs]class ListStartToken(Token): type = TokenType.listStart string = stringRepr = "["
[docs]class ListEndToken(Token): type = TokenType.listEnd string = stringRepr = "]"
[docs]class OpenParenToken(Token): type = TokenType.openParen string = stringRepr = "("
[docs]class CloseParenToken(Token): type = TokenType.closeParen string = stringRepr = ")"
[docs]class OrToken(Token): type = TokenType.orOp string = stringRepr = "or"
[docs]class AndToken(Token): type = TokenType.andOp string = stringRepr = "and"
[docs]class NotToken(Token): type = TokenType.notOp string = stringRepr = "not"
[docs]class EqualsToken(Token): type = TokenType.equalsOp string = stringRepr = "=="
[docs]class NotEqualsToken(Token): type = TokenType.notEqualsOp string = stringRepr = "!="
[docs]class InToken(Token): type = TokenType.inOp string = stringRepr = "in"
[docs]class AssignToken(Token): type = TokenType.assignOp string = stringRepr = "="
[docs]class CommaToken(Token): type = TokenType.comma string = stringRepr = ","
[docs]class TrueToken(Token): type = TokenType.true string = stringRepr = "True"
[docs]class FalseToken(Token): type = TokenType.false string = stringRepr = "False"
[docs]class StringLiteralToken(Token): type = TokenType.stringLiteral def __init__(self, startline, startcol, unprocessedString, value): Token.__init__(self, startline, startcol) self.string = self.stringRepr = unprocessedString #: String obtained after escape sequences expansion in the #: string literal self.value = value # No need to define __eq__(): if two instances have the same # 'unprocessedString' (stored in self.string), then they necessarily have # the same value in the 'value' attribute. def __repr__(self): return "{}.{}({!r}, {!r}, {!r}, {!r})".format( __name__, type(self).__name__, self.startline, self.startcol, self.string, self.value)
[docs]class VariableToken(Token): type = TokenType.variable def __init__(self, startline, startcol, name): Token.__init__(self, startline, startcol) self.string = self.stringRepr = name def __repr__(self): return "{}.{}({!r}, {!r}, {!r})".format( __name__, type(self).__name__, self.startline, self.startcol, self.string)
[docs]class RawConfigLineToken(Token): type = TokenType.rawConfigLine def __init__(self, startline, startcol, line): Token.__init__(self, startline, startcol) self.string = self.stringRepr = line def __repr__(self): return "{}.{}({!r}, {!r}, {!r})".format( __name__, type(self).__name__, self.startline, self.startcol, self.string)
#: Which token(s) may be closed by a given closing delimiter mayClose = {")": (OpenParenToken,), "}": (VarAssignmentsStartToken,), "]": (ListStartToken, PredicateStartToken)}
[docs]class Lexer: WSandComments_cre = re.compile(r" *(#.*)?") keywordOrVariable_cre = re.compile(r"\b ([a-zA-Z0-9_]+) \b", re.VERBOSE) equalsOp_cre = re.compile(r"==[^=!]") notEqualsOp_cre = re.compile(r"!=[^=!]") assign_cre = re.compile(r"=[^=!]") backslashNewline_cre = re.compile(r"\\\n") def __init__(self, stream): self.stream = stream self.line = 1 # Typical line numbering scheme self.col = 1 # No universal consensus here... self.curline = stream.readline()
[docs] def readline(self): self.curline = self.stream.readline() if self.curline: self.line += 1 self.col = 1 return self.curline
[docs] def peek(self): try: c = self.curline[self.col - 1] except IndexError: return "" # EOF else: return c # Return \n at the end of a line
[docs] def skipWSandComments(self): """Skip spaces and comments (all on a single line).""" mo = self.WSandComments_cre.match(self.curline, self.col - 1) if mo: self.col += mo.end() - mo.start() # Quicker than len(mo.group())? return mo
[docs] def skipWSNLandComments(self, delimStack=None): """Skip a possibly-multiline mix of spaces and comments. By default, :class:`NewlineToken` instances are collected as encountered and returned in the form of a list. However, if *delimStack* is a non-empty delimiter stack, some of the newline tokens are selectively omitted from the returned list. """ ignoreNewlineTokens = False # When delimStack is passed and non-empty, newline tokens are ignored # if and only if we are: # # (a) inside a <predicate>; # # (b) or inside a <varAssignments> and there is at least a # <listStart> "[" or an <openParen> "(" that has not been closed. if delimStack: if delimStack[0].type == TokenType.predicateStart: ignoreNewlineTokens = True else: assert delimStack[0].type == TokenType.varAssignmentsStart, \ delimStack[0] for t in delimStack: if t.type in (TokenType.listStart, TokenType.openParen): ignoreNewlineTokens = True break newlineTokens = [] while True: self.skipWSandComments() # stops before an eventual \n char c = self.peek() if not c: break # EOF elif c == "\n": if not ignoreNewlineTokens: newlineTokens.append(NewlineToken(self.line, self.col)) self.readline() else: break return newlineTokens
[docs] def scanStringLiteralToken(self): # Remember where the string literal starts startline, startcol = self.line, self.col i = self.col assert i > 0 and self.curline[i-1] == '"', (i, self.curline) # Will be set to True when we encounter an unescaped \ character processingEscape = False unprocessed = [] # Before escape sequences processing chars = [] # After escape sequences processing while True: try: c = self.curline[i] except IndexError: if self.curline[self.col-1:].endswith("\n"): text = self.curline[self.col-1:-1] else: text = self.curline[self.col-1:] raise ParseError( (self.line, i+1), "EOF reached while reading a " "string literal: {}".format(text)) if processingEscape: if c == "\\": chars.append("\\") elif c == "n": chars.append("\n") elif c == "t": chars.append("\t") elif c == '"': chars.append('"') elif c == "\n": # backslash-newline escape sequence self.readline() i = -1 else: raise ParseError( (self.line, i), "invalid escape sequence in string " "literal: {}".format(self.curline[i-1:i+1])) processingEscape = False elif c == "\\": processingEscape = True elif c == '"': break # end of the string literal else: chars.append(c) unprocessed.append(c) i += 1 # self.line is automatically updated whenever self.readline() is # called. self.col = i + 2 return (startline, startcol, ''.join(unprocessed), ''.join(chars))
[docs] def checkMatchingDelimiters(self, delimStack, closing, closingName, opening, openingName): try: t = delimStack[-1] # top of the stack except IndexError: raise ParseError( (self.line, self.col), "{} '{}' without any matching {} '{}'" .format(closingName, closing, openingName, opening)) if not isinstance(t, mayClose[closing]): raise ParseError( (self.line, self.col), "{} '{}' can't close '{}' at {}".format( closingName, closing, t.string, t.formatStartPos()))
[docs] def scanBalancedTokens(self, delimStack): """Scan a balanced sequence of tokens. Normally, the token sequence should start right after a :class:`VarAssignmentsStartToken` (``{``) or :class:`ListStartToken` (``[``), which should be found at the top of *delimStack*. All subsequent tokens will be scanned and yielded until a ``}`` or ``]`` matching the top of *delimStack* is found. The method does not consume that closing delimiter, for symmetry with the handling of the opening delimiter. """ while True: yield from self.skipWSNLandComments(delimStack=delimStack) c = self.peek() if not c: break # EOF if c == "{": # It is probably an error to have an opening bracket # here, however the parser is in a better position to # decide on that matter. token = VarAssignmentsStartToken(self.line, self.col) delimStack.append(token) elif c == "[": token = ListStartToken(self.line, self.col) delimStack.append(token) elif c == "(": token = OpenParenToken(self.line, self.col) delimStack.append(token) elif c == "}": self.checkMatchingDelimiters( delimStack, "}", "closing brace", "{", "opening brace") if len(delimStack) == 1: break else: st = delimStack.pop() # start token assert st.type is TokenType.varAssignmentsStart, st.type token = VarAssignmentsEndToken(self.line, self.col) elif c == "]": self.checkMatchingDelimiters( delimStack, "]", "closing bracket", "[", "opening bracket") if len(delimStack) == 1: break else: st = delimStack.pop() # matching opening token assert st.type is TokenType.listStart, st.type token = ListEndToken(self.line, self.col) elif c == ")": self.checkMatchingDelimiters(delimStack, ")", "closing parenthesis", "(", "opening parenthesis") st = delimStack.pop() # matching opening token assert st.type is TokenType.openParen, st.type token = CloseParenToken(self.line, self.col) elif c == ",": token = CommaToken(self.line, self.col) elif c == '"': token = StringLiteralToken(*self.scanStringLiteralToken()) else: mo = self.keywordOrVariable_cre.match(self.curline, self.col - 1) if mo: word = mo.group(1) for kw, t in (("or", OrToken), ("and", AndToken), ("not", NotToken), ("in", InToken), ("True", TrueToken), ("False", FalseToken)): if word == kw: token = t(self.line, self.col) break else: token = VariableToken(self.line, self.col, word) else: # Whether to "continue" to the start of the outer loop once # the following (inner) loop is over. contAfterLoop = False for regexp, t in ((self.equalsOp_cre, EqualsToken), (self.notEqualsOp_cre, NotEqualsToken), (self.assign_cre, AssignToken), (self.backslashNewline_cre, "bsNl")): mo = regexp.match(self.curline, self.col - 1) if mo and t == "bsNl": # Backslash followed by a newline → ignore self.readline() # No token, no automatic self.col advance contAfterLoop = True break elif mo: token = t(self.line, self.col) break else: text = self.curline[self.col-1:] if text.endswith("\n"): text = text[:-1] assert text, text raise ParseError((self.line, self.col), "does not start with a valid " "token: {}".format(text)) if contAfterLoop: continue # String literals tokens are handled separately because they may # span multiple lines. if not isinstance(token, StringLiteralToken): self.col += len(token.string) # advance in the input stream yield token
[docs] def scanEnclosedTokenGroup(self): """Scan a :token:`varAssignments` or :token:`predicate`.""" # Stack where opening delimiters will be stored in order to # check that they are properly matched by the corresponding # closing delimiters. # # Keeping track of the set of currently opened brackets and # braces in the lexer is necessary because it must behave # differently depending on whether it is inside a # <varAssignments> or <predicate>, or outside both of these # (newlines are treated differently, for one; as a little bonus, # this allows the lexer to generate either a <listEnd> or a # <predicateEnd> for an encountered ']', depending on the # nesting level). delimStack = collections.deque() c = self.peek() if c == "{": t = VarAssignmentsStartToken(self.line, self.col) elif c == "[": t = PredicateStartToken(self.line, self.col) else: assert False, "expected '{{' or '[' instead of '{}'".format(c) delimStack.append(t) # this is a "push" self.col += 1 # advance in the input stream yield t # Scan all tokens inside the group, making sure that all opening # and closing delimiters come in matching pairs. yield from self.scanBalancedTokens(delimStack) ot = delimStack.pop() # the opening { or [ cc = self.peek() if not cc: raise ParseError((self.line, self.col), "EOF reached while reading a <predicate>") elif cc == "}": assert c == "{", "expected '{{' instead of '{}'".format(c) assert ot.type is TokenType.varAssignmentsStart, ot.type yield VarAssignmentsEndToken(self.line, self.col) else: assert cc == "]", "expected ']' instead of '{}'".format(cc) assert c == "[", "expected '[' instead of '{}'".format(c) assert ot.type is TokenType.predicateStart, ot.type yield PredicateEndToken(self.line, self.col) self.col += 1 # advance in the input stream yield from self.skipWSNLandComments()
[docs] def scanRawConfig(self): """Scan a :token:`rawConfigLine`.""" while True: c = self.peek() # A '[' at the beginning of a line (possibly following # whitespace) marks the beginning of the next <predicate>. if not c or c == "[": break text = self.curline[self.col-1:] if text.endswith("\n"): text = text[:-1] yield RawConfigLineToken(self.line, self.col, text) self.readline() self.skipWSNLandComments()
[docs] def tokenGenerator(self): """Generate all tokens from the input stream.""" # Skip initial whitespace and comments self.skipWSNLandComments() c = self.peek() if not c: return # EOF elif c == "{": # The optional <varAssignments> section is present, read it. yield from self.scanEnclosedTokenGroup() while True: # Scan zero or more <rawConfigLine> tokens. yield from self.scanRawConfig() if not self.peek(): break # Scan a <predicate>. yield from self.scanEnclosedTokenGroup() if not self.peek(): break