# Source code for spack.tokenize

# Copyright Spack Project Developers. See COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
"""This module provides building blocks for tokenizing strings. Users can define tokens by
inheriting from TokenBase and defining tokens as ordered enum members. The Tokenizer class can then
be used to iterate over tokens in a string."""

import enum
import re
from typing import Generator, Match, Optional, Type


class TokenBase(enum.Enum):
    """Enum base class whose members are declared with a regular expression.

    Each member is written as ``NAME = <regex>``. The regex is stored on the member as
    ``.regex``, while the enum value is an integer assigned in definition order (1, 2, ...).
    Because the value does not come from the regex, two members declared with the same regex
    remain distinct members rather than aliases.
    """

    def __new__(cls, *args, **kwargs):
        # Number members by declaration order: one more than the members registered so far.
        member = object.__new__(cls)
        member._value_ = 1 + len(cls.__members__)
        return member

    def __init__(self, regex):
        # The value written in the class body is the token's regular expression.
        self.regex = regex

    def __str__(self):
        # A token prints as its bare member name (no class prefix).
        return self._name_
class Token:
    """Represents tokens; generated from input by lexer and fed to parse().

    Attributes are stored in ``__slots__``:

    * ``kind``: the token type.
    * ``value``: the text of the token.
    * ``start`` / ``end``: positions of the token in the input (both default to 0).
    * ``subvalues``: dictionary built from any extra keyword arguments, or ``None`` when no
      keyword arguments were given.

    Equality compares ``kind``, ``value`` and ``subvalues`` only; ``start`` and ``end`` are
    ignored. Since ``__eq__`` is defined without ``__hash__``, instances are not hashable.
    """

    __slots__ = "kind", "value", "start", "end", "subvalues"

    def __init__(self, kind: TokenBase, value: str, start: int = 0, end: int = 0, **kwargs):
        self.kind = kind
        self.value = value
        self.start = start
        self.end = end
        # Normalize "no subvalues" to None rather than an empty dict
        self.subvalues = kwargs if kwargs else None

    def __repr__(self):
        return str(self)

    def __str__(self):
        parts = [self.kind, self.value]
        if self.subvalues:
            parts += [self.subvalues]
        return f"({', '.join(f'`{p}`' for p in parts)})"

    def __eq__(self, other):
        # Defer to Python's default handling for foreign types, so that e.g. ``token == None``
        # evaluates to False instead of raising AttributeError on ``other.kind``.
        if not isinstance(other, Token):
            return NotImplemented
        return (
            self.kind == other.kind
            and self.value == other.value
            and self.subvalues == other.subvalues
        )
def token_match_regex(token: TokenBase):
    """Build the regex fragment for one token, namespacing its named capture groups.

    Every named group ``(?P<name>`` found in ``token.regex`` is renamed to
    ``(?P<TOKEN_name>``, where ``TOKEN`` is ``token.name``, so that the fragments of several
    tokens can be joined into one larger regular expression without their group names
    clashing. The rewritten regex is then wrapped in an outer named group for the token itself.

    Returns:
        A ``(token_regex, pairs)`` tuple. ``token_regex`` is the wrapped, rewritten regex and
        ``pairs`` is a list of ``(original_group_name, prefixed_group_name)`` tuples, in the
        order the groups appear in ``token.regex`` (empty if it has no named groups).
    """
    # opening of a named capture group; group 1 is the subvalue name
    named_group = re.compile(r"\(\?P<([^>]+)>")

    # remember how each subvalue name maps to its token-prefixed replacement
    pairs = [(name, f"{token.name}_{name}") for name in named_group.findall(token.regex)]

    # rewrite all subvalue capture groups so they're prefixed with the token name
    rewritten_token_regex = named_group.sub(
        lambda m: f"(?P<{token.name}_{m.group(1)}>", token.regex
    )

    # the outer group matches the token as a whole, the inner ones capture its subvalues
    return f"(?P<{token}>{rewritten_token_regex})", pairs
class Tokenizer:
    """Splits a string into ``Token`` objects according to a ``TokenBase`` enum.

    The regexes of all members of the enum are combined, in iteration order, into a single
    alternation that is compiled once in the constructor.
    """

    def __init__(self, tokens: Type[TokenBase]):
        self.tokens = tokens

        # one (token name, regex fragment, subvalue name pairs) entry per token
        fragments = [(token.name, *token_match_regex(token)) for token in tokens]

        # tokens can have named subexpressions, if their regexes define named capture groups.
        # record this so we can associate them with the token
        self.token_subvalues = {name: pairs for name, _, pairs in fragments if pairs}
        self.regex = re.compile("|".join(fragment for _, fragment, _ in fragments))

    def tokenize(self, text: str) -> Generator[Token, None, None]:
        """Yield the tokens found in ``text``, matching repeatedly from its beginning.

        Nothing is yielded for empty input. Iteration ends at the first position where the
        combined regex no longer matches.
        """
        if not text:
            return

        msg = (
            "unexpected value encountered during parsing. Please submit a bug report "
            "at https://github.com/spack/spack/issues/new/choose"
        )
        scanner = self.regex.scanner(text)  # type: ignore[attr-defined]
        for match in iter(scanner.match, None):
            # The following two assertions are to help mypy
            assert match is not None, msg
            kind_name = match.lastgroup
            assert kind_name is not None, msg

            token = Token(
                self.tokens.__members__[kind_name], match.group(), match.start(), match.end()
            )

            # add any subvalues to the token, but only if at least one of them captured text
            pairs = self.token_subvalues.get(kind_name)
            if pairs:
                captured = {subval: match.group(rewritten) for subval, rewritten in pairs}
                if any(captured.values()):
                    token.subvalues = captured

            yield token