| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473 |
- """Subexpressions that make up a parsed grammar
- These do the parsing.
- """
- # TODO: Make sure all symbol refs are local--not class lookups or
- # anything--for speed. And kill all the dots.
- from collections import defaultdict
- from inspect import getfullargspec, isfunction, ismethod, ismethoddescriptor
- import regex as re
- from parsimonious.exceptions import ParseError, IncompleteParseError, LeftRecursionError
- from parsimonious.nodes import Node, RegexNode
- from parsimonious.utils import StrAndRepr
def is_callable(value):
    """Return whether ``value`` is the kind of callable a custom rule may be
    written as: a plain function, a bound method, or a method descriptor
    (e.g. a ``staticmethod`` object's underlying function).

    Uses a generator with ``any()`` so we short-circuit on the first hit
    instead of materializing a throwaway list.
    """
    criteria = [isfunction, ismethod, ismethoddescriptor]
    return any(criterion(value) for criterion in criteria)
def expression(callable, rule_name, grammar):
    """Turn a plain callable into an Expression.

    The callable can be of this simple form::

        def foo(text, pos):
            '''If this custom expression matches starting at text[pos], return
            the index where it stops matching. Otherwise, return None.'''
            if the expression matched:
                return end_pos

    If there are child nodes to return, return a tuple::

        return end_pos, children

    If the expression doesn't match at the given ``pos`` at all... ::

        return None

    If your callable needs to make sub-calls to other rules in the grammar or
    do error reporting, it can take this form, gaining additional arguments::

        def foo(text, pos, cache, error, grammar):
            # Call out to other rules:
            node = grammar['another_rule'].match_core(text, pos, cache, error)
            ...
            # Return values as above.

    The return value of the callable, if an int or a tuple, will be
    automatically transmuted into a :class:`~parsimonious.Node`. If it returns
    a Node-like class directly, it will be passed through unchanged.

    :arg rule_name: The rule name to attach to the resulting
        :class:`~parsimonious.Expression`
    :arg grammar: The :class:`~parsimonious.Grammar` this expression will be a
        part of, to make delegating to other rules possible

    """
    # Resolve unbound methods; allows grammars to use @staticmethod custom rules
    # https://stackoverflow.com/questions/41921255/staticmethod-object-is-not-callable
    if ismethoddescriptor(callable) and hasattr(callable, '__func__'):
        callable = callable.__func__

    num_args = len(getfullargspec(callable).args)
    if ismethod(callable):
        # do not count the first argument (typically 'self') for methods
        num_args -= 1
    if num_args == 2:
        is_simple = True
    elif num_args == 5:
        is_simple = False
    else:
        raise RuntimeError("Custom rule functions must take either 2 or 5 "
                           "arguments, not %s." % num_args)

    class AdHocExpression(Expression):
        def _uncached_match(self, text, pos, cache, error):
            result = (callable(text, pos) if is_simple else
                      callable(text, pos, cache, error, grammar))

            if isinstance(result, int):
                end, children = result, None
            elif isinstance(result, tuple):
                end, children = result
            else:
                # Node or None
                return result

            return Node(self, text, pos, end, children=children)

        def _as_rhs(self):
            return '{custom function "%s"}' % callable.__name__

    return AdHocExpression(name=rule_name)
- IN_PROGRESS = object()
class Expression(StrAndRepr):
    """A thing that can be matched against a piece of text"""

    # Slots are about twice as fast as __dict__-based attributes:
    # http://stackoverflow.com/questions/1336791/dictionary-vs-object-which-is-more-efficient-and-why

    # Top-level expressions--rules--have names. Subexpressions are named ''.
    __slots__ = ['name', 'identity_tuple']

    def __init__(self, name=''):
        self.name = name
        # identity_tuple is what __eq__/__hash__ compare; subclasses extend it
        # with their distinguishing state (e.g. the literal or the pattern).
        self.identity_tuple = (self.name, )

    def __hash__(self):
        return hash(self.identity_tuple)

    def __eq__(self, other):
        return self._eq_check_cycles(other, set())

    def __ne__(self, other):
        return not (self == other)

    def _eq_check_cycles(self, other, checked):
        # keep a set of all pairs that are already checked, so we won't fall into infinite recursions.
        checked.add((id(self), id(other)))
        return other.__class__ is self.__class__ and self.identity_tuple == other.identity_tuple

    def resolve_refs(self, rule_map):
        # Nothing to do on the base expression.
        return self

    def parse(self, text, pos=0):
        """Return a parse tree of ``text``.

        Raise ``ParseError`` if the expression wasn't satisfied. Raise
        ``IncompleteParseError`` if the expression was satisfied but didn't
        consume the full string.

        """
        node = self.match(text, pos=pos)
        if node.end < len(text):
            raise IncompleteParseError(text, node.end, self)
        return node

    def match(self, text, pos=0):
        """Return the parse tree matching this expression at the given
        position, not necessarily extending all the way to the end of ``text``.

        Raise ``ParseError`` if there is no match there.

        :arg pos: The index at which to start matching

        """
        error = ParseError(text)
        # A fresh packrat cache ({id(expr): {pos: node}}) per top-level match:
        node = self.match_core(text, pos, defaultdict(dict), error)
        if node is None:
            raise error
        return node

    def match_core(self, text, pos, cache, error):
        """Internal guts of ``match()``

        This is appropriate to call only from custom rules or Expression
        subclasses.

        :arg cache: The packrat cache::

            {(oid, pos): Node tree matched by object `oid` at index `pos` ...}

        :arg error: A ParseError instance with ``text`` already filled in but
            otherwise blank. We update the error reporting info on this object
            as we go. (Sticking references on an existing instance is faster
            than allocating a new one for each expression that fails.) We
            return None rather than raising and catching ParseErrors because
            catching is slow.

        """
        # TODO: Optimize. Probably a hot spot.
        #
        # Is there a faster way of looking up cached stuff?
        #
        # If this is slow, think about the array module. It might (or might
        # not!) use more RAM, but it'll likely be faster than hashing things
        # all the time. Also, can we move all the allocs up front?
        #
        # To save space, we have lots of choices: (0) Quit caching whole Node
        # objects. Cache just what you need to reconstitute them. (1) Cache
        # only the results of entire rules, not subexpressions (probably a
        # horrible idea for rules that need to backtrack internally a lot). (2)
        # Age stuff out of the cache somehow. LRU? (3) Cuts.
        expr_cache = cache[id(self)]
        if pos in expr_cache:
            node = expr_cache[pos]
        else:
            # TODO: Set default value to prevent infinite recursion in left-recursive rules.
            expr_cache[pos] = IN_PROGRESS  # Mark as in progress
            node = expr_cache[pos] = self._uncached_match(text, pos, cache, error)
        if node is IN_PROGRESS:
            # We re-entered ourselves at the same position before the first
            # attempt finished: the grammar is left-recursive here.
            raise LeftRecursionError(text, pos=-1, expr=self)

        # Record progress for error reporting:
        if node is None and pos >= error.pos and (
                self.name or getattr(error.expr, 'name', None) is None):
            # Don't bother reporting on unnamed expressions (unless that's all
            # we've seen so far), as they're hard to track down for a human.
            # Perhaps we could include the unnamed subexpressions later as
            # auxiliary info.
            error.expr = self
            error.pos = pos

        return node

    def __str__(self):
        return '<%s %s>' % (
            self.__class__.__name__,
            self.as_rule())

    def as_rule(self):
        """Return the left- and right-hand sides of a rule that represents me.

        Return unicode. If I have no ``name``, omit the left-hand side.

        """
        rhs = self._as_rhs().strip()
        # Strip the outermost parens; they are redundant at rule level.
        if rhs.startswith('(') and rhs.endswith(')'):
            rhs = rhs[1:-1]

        return ('%s = %s' % (self.name, rhs)) if self.name else rhs

    def _unicode_members(self):
        """Return an iterable of my unicode-represented children, stopping
        descent when we hit a named node so the returned value resembles the
        input rule."""
        return [(m.name or m._as_rhs()) for m in self.members]

    def _as_rhs(self):
        """Return the right-hand side of a rule that represents me.

        Implemented by subclasses.

        """
        raise NotImplementedError
class Literal(Expression):
    """An expression matching one exact string.

    Prefer these where possible; they are the cheapest expressions to match.
    """
    __slots__ = ['literal']

    def __init__(self, literal, name=''):
        super().__init__(name)
        self.literal = literal
        self.identity_tuple = (name, literal)

    def _uncached_match(self, text, pos, cache, error):
        wanted = self.literal
        if not text.startswith(wanted, pos):
            return None
        return Node(self, text, pos, pos + len(wanted))

    def _as_rhs(self):
        return repr(self.literal)
class TokenMatcher(Literal):
    """An expression matching a single token of a given type

    Meaningful only inside a ``TokenGrammar``, where the "text" is a list of
    tokens rather than a string.
    """
    def _uncached_match(self, token_list, pos, cache, error):
        token = token_list[pos]
        if token.type == self.literal:
            return Node(self, token_list, pos, pos + 1)
        return None
class Regex(Expression):
    """An expression that delegates matching to a compiled regex.

    Use these as much as you can and pack as much into each one as you can;
    they're fast.
    """
    __slots__ = ['re']

    def __init__(self, pattern, name='', ignore_case=False, locale=False,
                 multiline=False, dot_all=False, unicode=False, verbose=False, ascii=False):
        super().__init__(name)
        # Each boolean keyword contributes its flag bit when truthy.
        flags = ((ignore_case and re.I) |
                 (locale and re.L) |
                 (multiline and re.M) |
                 (dot_all and re.S) |
                 (unicode and re.U) |
                 (verbose and re.X) |
                 (ascii and re.A))
        self.re = re.compile(pattern, flags)
        self.identity_tuple = (self.name, self.re)

    def _uncached_match(self, text, pos, cache, error):
        """Return a RegexNode spanning the match, ``None`` if no match."""
        m = self.re.match(text, pos)
        if m is None:
            return None
        start, end = m.span()
        node = RegexNode(self, text, pos, pos + (end - start))
        node.match = m  # TODO: A terrible idea for cache size?
        return node

    def _regex_flags_from_bits(self, bits):
        """Return the textual equivalent of numerically encoded regex flags."""
        letters = 'ilmsuxa'
        return ''.join(letters[i - 1] if bits & (1 << i) else ''
                       for i in range(1, len(letters) + 1))

    def _as_rhs(self):
        return '~{!r}{}'.format(self.re.pattern,
                                self._regex_flags_from_bits(self.re.flags))
class Compound(Expression):
    """An abstract expression which contains other expressions"""

    __slots__ = ['members']

    def __init__(self, *members, **kwargs):
        """``members`` is a sequence of expressions."""
        super().__init__(kwargs.get('name', ''))
        self.members = members

    def resolve_refs(self, rule_map):
        self.members = tuple(m.resolve_refs(rule_map) for m in self.members)
        return self

    def _eq_check_cycles(self, other, checked):
        if not super()._eq_check_cycles(other, checked):
            return False
        if len(self.members) != len(other.members):
            return False
        return all(
            mine._eq_check_cycles(theirs, checked)
            for mine, theirs in zip(self.members, other.members)
            if (id(mine), id(theirs)) not in checked)

    def __hash__(self):
        # Note we leave members out of the hash computation, since compounds can get added to
        # sets, then have their members mutated. See RuleVisitor._resolve_refs.
        # Equality should still work, but we want the rules to go into the correct hash bucket.
        return hash((self.__class__, self.name))
class Sequence(Compound):
    """A series of expressions that must match contiguous, ordered pieces of
    the text

    In other words, it's a concatenation operator: each piece has to match, one
    after another.
    """
    def _uncached_match(self, text, pos, cache, error):
        cursor = pos
        children = []
        for member in self.members:
            child = member.match_core(text, cursor, cache, error)
            if child is None:
                # One miss fails the whole sequence.
                return None
            children.append(child)
            cursor += child.end - child.start
        # Every member matched; wrap them in one node.
        return Node(self, text, pos, cursor, children)

    def _as_rhs(self):
        return '({0})'.format(' '.join(self._unicode_members()))
class OneOf(Compound):
    """A series of expressions, one of which must match

    Expressions are tested in order from first to last. The first to succeed
    wins.
    """
    def _uncached_match(self, text, pos, cache, error):
        for alternative in self.members:
            child = alternative.match_core(text, pos, cache, error)
            if child is None:
                continue
            # Wrap the succeeding child in a node representing the OneOf:
            return Node(self, text, pos, child.end, children=[child])
        return None

    def _as_rhs(self):
        return '({0})'.format(' / '.join(self._unicode_members()))
class Lookahead(Compound):
    """An expression which consumes nothing, even if its contained expression
    succeeds"""

    __slots__ = ['negativity']

    def __init__(self, member, *, negative=False, **kwargs):
        super().__init__(member, **kwargs)
        self.negativity = bool(negative)

    def _uncached_match(self, text, pos, cache, error):
        matched = self.members[0].match_core(text, pos, cache, error) is not None
        # Positive lookahead succeeds on a match; negative on a miss. Either
        # way the resulting node is zero-width.
        if matched != self.negativity:
            return Node(self, text, pos, pos)
        return None

    def _as_rhs(self):
        prefix = '!' if self.negativity else '&'
        return '%s%s' % (prefix, self._unicode_members()[0])

    def _eq_check_cycles(self, other, checked):
        return (super()._eq_check_cycles(other, checked)
                and self.negativity == other.negativity)
def Not(term):
    """Shorthand for a negative lookahead around ``term``."""
    return Lookahead(term, negative=True)
- # Quantifiers. None of these is strictly necessary, but they're darn handy.
class Quantifier(Compound):
    """An expression wrapper like the */+/?/{n,m} quantifier in regexes."""

    __slots__ = ['min', 'max']

    def __init__(self, member, *, min=0, max=float('inf'), name='', **kwargs):
        super().__init__(member, name=name, **kwargs)
        self.min = min
        self.max = max

    def _uncached_match(self, text, pos, cache, error):
        new_pos = pos
        children = []
        size = len(text)
        # Greedily repeat the single member until it misses, the text is
        # exhausted, or the upper bound is reached.
        while new_pos < size and len(children) < self.max:
            node = self.members[0].match_core(text, new_pos, cache, error)
            if node is None:
                break  # no more matches
            children.append(node)
            length = node.end - node.start
            if len(children) >= self.min and length == 0:  # Don't loop infinitely
                break
            new_pos += length
        if len(children) >= self.min:
            return Node(self, text, pos, new_pos, children)
        # Implicit None: too few repetitions to satisfy ``min``.

    def _as_rhs(self):
        # Prefer the idiomatic shorthand (?, *, +) before falling back to
        # explicit {m,n} bounds.
        if self.min == 0 and self.max == 1:
            qualifier = '?'
        elif self.min == 0 and self.max == float('inf'):
            qualifier = '*'
        elif self.min == 1 and self.max == float('inf'):
            qualifier = '+'
        elif self.max == float('inf'):
            qualifier = '{%d,}' % self.min
        elif self.min == 0:
            qualifier = '{,%d}' % self.max
        else:
            qualifier = '{%d,%d}' % (self.min, self.max)
        return '%s%s' % (self._unicode_members()[0], qualifier)

    def _eq_check_cycles(self, other, checked):
        return (
            super()._eq_check_cycles(other, checked) and
            self.min == other.min and
            self.max == other.max
        )
def ZeroOrMore(member, name=''):
    """Shorthand for the ``*`` quantifier: match ``member`` any number of times."""
    unbounded = float('inf')
    return Quantifier(member, min=0, max=unbounded, name=name)
def OneOrMore(member, name='', min=1):
    """Shorthand for the ``+`` quantifier: match ``member`` at least ``min`` times."""
    unbounded = float('inf')
    return Quantifier(member, min=min, max=unbounded, name=name)
def Optional(member, name=''):
    """Shorthand for the ``?`` quantifier: match ``member`` zero or one time."""
    return Quantifier(member, min=0, max=1, name=name)
|