"""A convenience which constructs expression trees from an easy-to-read syntax

Use this unless you have a compelling reason not to; it performs some
optimizations that would be tedious to do when constructing an expression tree
by hand.

"""
from collections import OrderedDict
from inspect import isfunction, ismethod

from six import (text_type, itervalues, iteritems, python_2_unicode_compatible, PY2)

from parsimonious.exceptions import BadGrammar, UndefinedLabel
from parsimonious.expressions import (Literal, Regex, Sequence, OneOf,
    Lookahead, Optional, ZeroOrMore, OneOrMore, Not, TokenMatcher,
    expression)
from parsimonious.nodes import NodeVisitor
from parsimonious.utils import evaluate_string

@python_2_unicode_compatible
class Grammar(OrderedDict):
    """A collection of rules that describe a language

    You can start parsing from the default rule by calling ``parse()``
    directly on the ``Grammar`` object::

        g = Grammar('''
                    polite_greeting = greeting ", my good " title
                    greeting        = "Hi" / "Hello"
                    title           = "madam" / "sir"
                    ''')
        g.parse('Hello, my good sir')

    Or start parsing from any of the other rules; you can pull them out of the
    grammar as if it were a dictionary::

        g['title'].parse('sir')

    You could also just construct a bunch of ``Expression`` objects yourself
    and stitch them together into a language, but using a ``Grammar`` has some
    important advantages:

    * Languages are much easier to define in the nice syntax it provides.
    * Circular references aren't a pain.
    * It does all kinds of whizzy space- and time-saving optimizations, like
      factoring up repeated subexpressions into a single object, which should
      increase cache hit ratio. [Is this implemented yet?]

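    As an example of the circular-reference point, a self-referential rule
    works without ceremony::

        Grammar('''
                parens = "(" parens? ")"
                ''').parse('(())')
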
    """
    def __init__(self, rules='', **more_rules):
        """Construct a grammar.

        :arg rules: A string of production rules, one per line.
        :arg more_rules: Additional kwargs whose names are rule names and
            values are Expressions or custom-coded callables which accomplish
            things the built-in rule syntax cannot. These take precedence over
            ``rules`` in case of naming conflicts.

        The :term:`default rule`, the one :meth:`parse()` and :meth:`match()`
        invoke, is the first rule in ``rules``, or None if there are no
        string-based rules; use :meth:`default()` to pick a different one.

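        A sketch of passing a custom rule as a kwarg (the rule name and
        expression here are illustrative)::

            g = Grammar('greeting = hi ", world"', hi=Literal('Hello'))
            g.parse('Hello, world')
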
        """

        decorated_custom_rules = {
            k: (expression(v, k, self) if isfunction(v) or ismethod(v) else v)
            for k, v in iteritems(more_rules)}

        exprs, first = self._expressions_from_rules(rules, decorated_custom_rules)
        super(Grammar, self).__init__(exprs.items())
        self.default_rule = first  # may be None

    def default(self, rule_name):
        """Return a new Grammar whose :term:`default rule` is ``rule_name``."""
        new = self._copy()
        new.default_rule = new[rule_name]
        return new

    def _copy(self):
        """Return a shallow copy of myself.

        A deep copy is unnecessary, since Expression trees are treated as
        immutable once built.

        """
        new = Grammar.__new__(Grammar)
        super(Grammar, new).__init__(iteritems(self))
        new.default_rule = self.default_rule
        return new

    def _expressions_from_rules(self, rules, custom_rules):
        """Return a 2-tuple: a dict of rule names pointing to their
        expressions, and then the first rule.

        It's a web of expressions, all referencing each other. Typically,
        there's a single root to the web of references, and that root is the
        starting symbol for parsing, but there's nothing saying you can't have
        multiple roots.

        :arg custom_rules: A map of rule names to custom-coded rules:
            Expressions

        """
        tree = rule_grammar.parse(rules)
        return RuleVisitor(custom_rules).visit(tree)

    def parse(self, text, pos=0):
        """Parse some text with the :term:`default rule`.

        :arg pos: The index at which to start parsing

        """
        self._check_default_rule()
        return self.default_rule.parse(text, pos=pos)

    def match(self, text, pos=0):
        """Parse some text with the :term:`default rule` but not necessarily
        all the way to the end.

        :arg pos: The index at which to start parsing

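        For example, with the grammar from the class docstring, matching stops
        wherever the default rule does, so trailing text is fine::

            g.match('Hello, my good sir, at your service')
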
        """
        self._check_default_rule()
        return self.default_rule.match(text, pos=pos)

    def _check_default_rule(self):
        """Raise RuntimeError if there is no default rule defined."""
        if not self.default_rule:
            raise RuntimeError("Can't call parse() on a Grammar that has no "
                               "default rule. Choose a specific rule instead, "
                               "like some_grammar['some_rule'].parse(...).")

    def __str__(self):
        """Return a rule string that, when passed to the constructor, would
        reconstitute the grammar."""
        exprs = [self.default_rule] if self.default_rule else []
        exprs.extend(expr for expr in itervalues(self) if
                     expr is not self.default_rule)
        return '\n'.join(expr.as_rule() for expr in exprs)

    def __repr__(self):
        """Return an expression that will reconstitute the grammar."""
        codec = 'string_escape' if PY2 else 'unicode_escape'
        escaped = str(self).encode(codec)  # bytes on Python 3
        # The escaped form is pure ASCII, so it decodes cleanly back to str:
        return "Grammar('%s')" % (escaped if PY2 else escaped.decode('ascii'))


class TokenGrammar(Grammar):
    """A Grammar which takes a list of pre-lexed tokens instead of text

    This is useful if you want to do the lexing yourself, as a separate pass:
    for example, to implement indentation-based languages.

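    A sketch, assuming tokens whose ``type`` attributes match the grammar's
    literals (:class:`parsimonious.utils.Token` is such a token)::

        from parsimonious.utils import Token
        g = TokenGrammar('statement = "if" "condition"')
        g.parse([Token('if'), Token('condition')])
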
    """
    def _expressions_from_rules(self, rules, custom_rules):
        tree = rule_grammar.parse(rules)
        return TokenRuleVisitor(custom_rules).visit(tree)


class BootstrappingGrammar(Grammar):
    """The grammar used to recognize the textual rules that describe other
    grammars

    This grammar gets its start from some hard-coded Expressions and claws its
    way from there to an expression tree that describes how to parse the
    grammar description syntax.

    """
    def _expressions_from_rules(self, rule_syntax, custom_rules):
        """Return the rules for parsing the grammar definition syntax.

        Return a 2-tuple: a dict of rule names pointing to their expressions,
        and then the top-level expression for the first rule.

        """
        # Hard-code enough of the rules to parse the grammar that describes the
        # grammar description language, to bootstrap:
        comment = Regex(r'#[^\r\n]*', name='comment')
        meaninglessness = OneOf(Regex(r'\s+'), comment, name='meaninglessness')
        _ = ZeroOrMore(meaninglessness, name='_')
        equals = Sequence(Literal('='), _, name='equals')
        label = Sequence(Regex(r'[a-zA-Z_][a-zA-Z_0-9]*'), _, name='label')
        reference = Sequence(label, Not(equals), name='reference')
        quantifier = Sequence(Regex(r'[*+?]'), _, name='quantifier')
        # This pattern supports empty literals. TODO: A problem?
        spaceless_literal = Regex(r'u?r?"[^"\\]*(?:\\.[^"\\]*)*"',
                                  ignore_case=True,
                                  dot_all=True,
                                  name='spaceless_literal')
        literal = Sequence(spaceless_literal, _, name='literal')
        regex = Sequence(Literal('~'),
                         literal,
                         Regex('[ilmsux]*', ignore_case=True),
                         _,
                         name='regex')
        atom = OneOf(reference, literal, regex, name='atom')
        quantified = Sequence(atom, quantifier, name='quantified')

        term = OneOf(quantified, atom, name='term')
        not_term = Sequence(Literal('!'), term, _, name='not_term')
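        # not_term contains a reference to term, and term needs not_term as
        # one of its alternatives: a circular reference. Grammar resolves
        # these automatically for users; while bootstrapping, we splice the
        # members together by hand.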
        term.members = (not_term,) + term.members

        sequence = Sequence(term, OneOrMore(term), name='sequence')
        or_term = Sequence(Literal('/'), _, term, name='or_term')
        ored = Sequence(term, OneOrMore(or_term), name='ored')
        expression = OneOf(ored, sequence, term, name='expression')
        rule = Sequence(label, equals, expression, name='rule')
        rules = Sequence(_, OneOrMore(rule), name='rules')

        # Use those hard-coded rules to parse the (more extensive) rule syntax.
        # (For example, unless I start using parentheses in the rule language
        # definition itself, I should never have to hard-code expressions for
        # those above.)

        rule_tree = rules.parse(rule_syntax)

        # Turn the parse tree into a map of expressions:
        return RuleVisitor().visit(rule_tree)


# The grammar for parsing PEG grammar definitions:
# This is a nice, simple grammar. We may someday add to it, but it's a safe bet
# that the future will always be a superset of this.
rule_syntax = (r'''
    # Ignored things (represented by _) are typically hung off the end of the
    # leafmost kinds of nodes. Literals like "/" count as leaves.

    rules = _ rule*
    rule = label equals expression
    equals = "=" _
    literal = spaceless_literal _

    # So you can't spell a regex like `~"..." ilm`:
    spaceless_literal = ~"u?r?\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is /
                        ~"u?r?'[^'\\\\]*(?:\\\\.[^'\\\\]*)*'"is

    expression = ored / sequence / term
    or_term = "/" _ term
    ored = term or_term+
    sequence = term term+
    not_term = "!" term _
    lookahead_term = "&" term _
    term = not_term / lookahead_term / quantified / atom
    quantified = atom quantifier
    atom = reference / literal / regex / parenthesized
    regex = "~" spaceless_literal ~"[ilmsux]*"i _
    parenthesized = "(" _ expression ")" _
    quantifier = ~"[*+?]" _
    reference = label !equals

    # A subsequent equal sign is the only thing that distinguishes a label
    # (which begins a new rule) from a reference (which is just a pointer to a
    # rule defined somewhere else):
    label = ~"[a-zA-Z_][a-zA-Z_0-9]*" _

    # _ = ~r"\s*(?:#[^\r\n]*)?\s*"
    _ = meaninglessness*
    meaninglessness = ~r"\s+" / comment
    comment = ~r"#[^\r\n]*"
    ''')


class LazyReference(text_type):
    """A lazy reference to a rule, which we resolve after grokking all the
    rules"""

    name = u''

    # Just for debugging:
    def _as_rhs(self):
        return u'<LazyReference to %s>' % self


class RuleVisitor(NodeVisitor):
    """Turns a parse tree of a grammar definition into a map of ``Expression``
    objects

    This is the magic piece that breathes life into a parsed bunch of parse
    rules, allowing them to go forth and parse other things.

    """
    quantifier_classes = {'?': Optional, '*': ZeroOrMore, '+': OneOrMore}

    visit_expression = visit_term = visit_atom = NodeVisitor.lift_child

    def __init__(self, custom_rules=None):
        """Construct.

        :arg custom_rules: A dict of {rule name: expression} holding custom
            rules which will take precedence over the others

        """
        self.custom_rules = custom_rules or {}

    def visit_parenthesized(self, node, parenthesized):
        """Treat a parenthesized subexpression as just its contents.

        Its position in the tree suffices to maintain its grouping semantics.

        """
        left_paren, _, expression, right_paren, _ = parenthesized
        return expression

    def visit_quantifier(self, node, quantifier):
        """Turn a quantifier into just its symbol-matching node."""
        symbol, _ = quantifier
        return symbol

    def visit_quantified(self, node, quantified):
        atom, quantifier = quantified
        return self.quantifier_classes[quantifier.text](atom)

    def visit_lookahead_term(self, node, lookahead_term):
        ampersand, term, _ = lookahead_term
        return Lookahead(term)

    def visit_not_term(self, node, not_term):
        exclamation, term, _ = not_term
        return Not(term)

    def visit_rule(self, node, rule):
        """Assign a name to the Expression and return it."""
        label, equals, expression = rule
        expression.name = label  # Assign a name to the expr.
        return expression

    def visit_sequence(self, node, sequence):
        """A parsed Sequence looks like [term node, OneOrMore node of
        ``another_term``s]. Flatten it out."""
        term, other_terms = sequence
        return Sequence(term, *other_terms)

    def visit_ored(self, node, ored):
        first_term, other_terms = ored
        return OneOf(first_term, *other_terms)

    def visit_or_term(self, node, or_term):
        """Return just the term from an ``or_term``.

        We already know it's going to be ored, from the containing ``ored``.

        """
        slash, _, term = or_term
        return term

    def visit_label(self, node, label):
        """Turn a label into a unicode string."""
        name, _ = label
        return name.text

    def visit_reference(self, node, reference):
        """Stick a :class:`LazyReference` in the tree as a placeholder.

        We resolve them all later.

        """
        label, not_equals = reference
        return LazyReference(label)

    def visit_regex(self, node, regex):
        """Return a ``Regex`` expression."""
        tilde, literal, flags, _ = regex
        flags = flags.text.upper()
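        # e.g. ``~"[a-z]+"is`` arrives with flags 'IS', turning on
        # ignore_case and dot_all below.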
        pattern = literal.literal  # Pull the string back out of the Literal
                                   # object.
        return Regex(pattern, ignore_case='I' in flags,
                              locale='L' in flags,
                              multiline='M' in flags,
                              dot_all='S' in flags,
                              unicode='U' in flags,
                              verbose='X' in flags)

    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``Literal`` that recognizes it."""
        return Literal(evaluate_string(spaceless_literal.text))

    def visit_literal(self, node, literal):
        """Pick just the literal out of a literal-and-junk combo."""
        spaceless_literal, _ = literal
        return spaceless_literal

    def generic_visit(self, node, visited_children):
        """Replace childbearing nodes with a list of their children; keep
        others untouched.

        For our case, if a node has children, only the children are important.
        Otherwise, keep the node around for (for example) the flags of the
        regex rule. Most of these kept-around nodes are subsequently thrown
        away by the other visitor methods.

        We can't simply hang the visited children off the original node; that
        would be disastrous if the node occurred in more than one place in the
        tree.

        """
        return visited_children or node  # should semantically be a tuple

    def _resolve_refs(self, rule_map, expr, done):
        """Return an expression with all its lazy references recursively
        resolved.

        Resolve any lazy references in the expression ``expr``, recursing into
        all subexpressions.

        :arg done: The set of Expressions that have already been or are
            currently being resolved, to ward off redundant work and prevent
            infinite recursion for circular refs

        """
        if isinstance(expr, LazyReference):
            label = text_type(expr)
            try:
                reffed_expr = rule_map[label]
            except KeyError:
                raise UndefinedLabel(expr)
            return self._resolve_refs(rule_map, reffed_expr, done)
        else:
            if getattr(expr, 'members', ()) and expr not in done:
                # Prevents infinite recursion for circular refs: `expr` goes
                # into `done` before we recurse, so any member that refers
                # back to it, directly or deeper down, is returned as-is
                # rather than re-resolved.
                done.add(expr)
                expr.members = tuple(self._resolve_refs(rule_map, member, done)
                                     for member in expr.members)
            return expr

    def visit_rules(self, node, rules_list):
        """Collate all the rules into a map. Return (map, default rule).

        The default rule is the first one. Or, if you have more than one rule
        of that name, it's the last-occurring rule of that name. (This lets you
        override the default rule when you extend a grammar.) If there are no
        string-based rules, the default rule is None, because the custom rules,
        due to being kwarg-based, are unordered.

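        For example, a later definition of a rule overrides an earlier one,
        so you can extend a grammar by concatenating more rules onto its
        source (a sketch)::

            g = Grammar('''
                        greeting = "Hi"
                        greeting = "Hello"
                        ''')
            g.parse('Hello')  # the later definition wins
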
        """
        _, rules = rules_list

        # Map each rule's name to its Expression. Later rules of the same name
        # override earlier ones. This lets us define rules multiple times and
        # have the last declaration win, so you can extend grammars by
        # concatenation.
        rule_map = OrderedDict((expr.name, expr) for expr in rules)

        # And custom rules override string-based rules. This is the least
        # surprising choice, by analogy with the dict constructor:
        # dict({'x': 5}, x=6) gives {'x': 6}.
        rule_map.update(self.custom_rules)

        # Resolve references. This tolerates forward references.
        done = set()
        rule_map = OrderedDict((expr.name, self._resolve_refs(rule_map, expr, done))
                               for expr in itervalues(rule_map))

        # isinstance() is a temporary hack around the fact that * rules don't
        # always get transformed into lists by NodeVisitor. We should fix that;
        # it's surprising and requires writing lame branches like this.
        return rule_map, (rule_map[rules[0].name]
                          if isinstance(rules, list) and rules else None)


class TokenRuleVisitor(RuleVisitor):
    """A visitor which builds expression trees meant to work on sequences of
    pre-lexed tokens rather than strings"""

    def visit_spaceless_literal(self, spaceless_literal, visited_children):
        """Turn a string literal into a ``TokenMatcher`` that matches
        ``Token`` objects by their ``type`` attributes."""
        return TokenMatcher(evaluate_string(spaceless_literal.text))

    def visit_regex(self, node, regex):
        tilde, literal, flags, _ = regex
        raise BadGrammar('Regexes do not make sense in TokenGrammars, since '
                         'TokenGrammars operate on pre-lexed tokens rather '
                         'than characters.')


# Bootstrap to level 1...
rule_grammar = BootstrappingGrammar(rule_syntax)
# ...and then to level 2. This establishes that the node tree of our rule
# syntax is built by the same machinery that will build trees of our users'
# grammars. And the correctness of that tree is tested, indirectly, in
# test_grammar.
rule_grammar = Grammar(rule_syntax)


# TODO: Teach Expression trees how to spit out Python representations of
# themselves. Then we can just paste that in above, and we won't have to
# bootstrap on import. Though it'll be a little less DRY. [Ah, but this is not
# so clean, because it would have to output multiple statements to get multiple
# refs to a single expression hooked up.]