Module scrolls.ast.tokenizer
The tokenizer implementation.
See ast: Tokenizing.
"""
The tokenizer implementation.
See [ast: Tokenizing](../ast/index.html#tokenizing).
"""
import dataclasses
import logging
import types
import typing as t
from scrolls import errors as base_errors
from . import ast_errors, streams
from .ast_constants import (BLOCK_CLOSE, BLOCK_OPEN, CLOSE_ARGS, COMMAND_SEP,
COMMENT_SIGIL, CONTROL_SIGIL, EOF, ESCAPE_SIGIL,
EXPANSION_SIGIL, OPEN_ARGS, QUOTE, SPREAD_SIGIL,
TokenizeConsumeRestState, TokenType)
__all__ = (
"Token",
"Tokenizer"
)
logger = logging.getLogger(__name__)
def _str_ensure(s: str, ensure: str) -> str:
if ensure not in s:
return s + ensure
else:
return s
def _str_remove(s: str, remove: str) -> str:
return s.replace(remove, "")
def _str_switch(s: str, switch: str, en: bool) -> str:
"""
Utility function for enabling/disabling detection of certain characters.
"""
if en:
return _str_ensure(s, switch)
else:
return _str_remove(s, switch)
@dataclasses.dataclass
class Token:
"""A token."""
type: TokenType
"""The type of this token."""
value: str
"""The value of this token."""
line: int
"""The line this token *started* generating on. Some tokens may span multiple lines."""
position: int
"""The column along the line that this token *started* generating on. """
tokenizer: "Tokenizer"
"""The tokenizer that generated this token."""
consume_rest: bool = False
"""Sets whether this token was generated by CONSUME_REST."""
def __str__(self) -> str:
return f"{self.type.name}:{repr(self.value)}"
class Tokenizer:
"""
The tokenizer. This class is responsible for identifying meaningful pieces of scripts
(such as string literals, block open and close, etc.), and tagging them.
.. WARNING::
If the tokenizer is supplied with a string, then this `Tokenizer` is **single use**.
If you wish to stream input, implement a `scrolls.ast.streams.CharStream`. See
`scrolls.ast.streams.StringStream.feed` and see if that works for you.
See `scrolls.ast.streams.REPLStream` for an example of streaming input
from a user.
Args:
stream: The script to tokenize. This may be a string or a `scrolls.ast.streams.CharStream` instance.
consume_rest_triggers: Triggers for CONSUME_REST.
"""
def __init__(
self,
stream: t.Union[str, streams.CharStream],
consume_rest_triggers: t.Mapping[str, int] = types.MappingProxyType({})
):
if isinstance(stream, str):
self.stream: streams.CharStream = streams.StringStream(stream.strip())
else:
self.stream = stream
self.consume_rest_triggers = consume_rest_triggers
self.consume_rest_state = TokenizeConsumeRestState.OFF
self.consume_rest_count = 0
self.previous_token_was_sep = True
self.whitespace = "\t "
# Map of single characters to token types
self.charmap = {
"\n": TokenType.COMMAND_SEP,
";": TokenType.COMMAND_SEP,
OPEN_ARGS: TokenType.OPEN_ARGS,
CLOSE_ARGS: TokenType.CLOSE_ARGS,
BLOCK_OPEN: TokenType.OPEN_BLOCK,
BLOCK_CLOSE: TokenType.CLOSE_BLOCK,
EXPANSION_SIGIL: TokenType.EXPANSION_SIGIL,
CONTROL_SIGIL: TokenType.CONTROL_SIGIL,
SPREAD_SIGIL: TokenType.SPREAD_SIGIL
}
self.escape_sequences: t.MutableMapping[
str,
t.Union[str, t.Callable[[Tokenizer], str]]
] = {
"n": "\n",
"t": "\t",
"r": "\r",
ESCAPE_SIGIL: ESCAPE_SIGIL,
QUOTE: QUOTE,
"u": Tokenizer._unicode_escape
}
# Set up stop chars for unquoted string literals.
self._string_literal_always_stop = self.whitespace
self._string_literal_stop_quoted = QUOTE
self._string_literal_stop_comment = COMMENT_SIGIL
# Note: Add an exception for newlines. Even when we don't consider newlines to be command separators,
# we would normally want newlines to separate string literals. So remove \n from this switch.
self._string_literal_stop_single_char = _str_remove(
"".join(self.charmap.keys()),
"\n"
)
# Override flag for behavior when single_char_token_enable is False.
self.newlines_separate_strings = True
self.string_literal_stop: str = self._string_literal_always_stop
self.single_char_token_enable = True
self.set_single_char_token_enable(True)
# Set up stop chars for CONSUME_REST.
self._consume_rest_stop_switch: str = "".join([*COMMAND_SEP, BLOCK_CLOSE, BLOCK_OPEN])
self.consume_rest_stop: str = ""
self.set_consume_rest_all(False)
# Set up stop chars for quoted literals.
self.quoted_literal_stop: str = QUOTE # For now, quoted literals ONLY stop on another quote.
self.quoted_literal_enable = True
self.set_quoted_literals_enable(True)
# Set up stop chars for comments. (Note: No need for specific comment stop char here, it's hardcoded to
# be \n at the moment.)
self.comments_enable = True
self.set_comments_enable(True)
def _unicode_escape(self) -> str:
code_point = "" # Initialization not needed, just satisfies some linters.
try:
code_point = self.next_n_chars(4)
except ast_errors.TokenizeEofError:
self.error(
ast_errors.TokenizeEofError,
"Ran off end of script trying to parse unicode escape."
)
if QUOTE in code_point:
self.error(
ast_errors.TokenizeError,
f"Encountered {QUOTE} while consuming unicode escape.",
pos=self.stream.current_pos() - 4
)
char = ""
try:
char = chr(int(code_point, 16))
except ValueError:
self.error(
ast_errors.TokenizeError,
f"Bad hex number {code_point}.",
pos=self.stream.current_pos() - 4
)
return char
def set_consume_rest_all(self, consume_all: bool) -> None:
"""
Set whether CONSUME_REST consumes until EOF. Defaults to `False`.
If `False`, CONSUME_REST will stop on block open/close, and command separators.
If `True`, CONSUME_REST will not stop until EOF.
"""
self.consume_rest_stop = _str_switch(
self.consume_rest_stop,
self._consume_rest_stop_switch,
not consume_all
)
def set_single_char_token_enable(self, en: bool) -> None:
"""
Set whether single character tokens should be parsed. This includes ALL token types except for
`scrolls.ast.ast_constants.TokenType.STRING_LITERAL` and `scrolls.ast.ast_constants.TokenType.COMMENT`.
Defaults to `True`.
If `False`, then all special characters that would otherwise be their own token will be rolled
into string literals.
"""
if self.newlines_separate_strings and not self.single_char_token_enable and en:
# If we're re-enabling single char tokens and the newline separator behavior is still on,
# we need to undo that first.
self.set_newlines_separate_strings(False)
self.single_char_token_enable = en
self.string_literal_stop = _str_switch(
self.string_literal_stop,
self._string_literal_stop_single_char,
en
)
if not en:
self.set_newlines_separate_strings(True)
else:
# If single char tokens are enabled, newlines must stop string literals for this to work properly.
self.string_literal_stop = _str_ensure(self.string_literal_stop, "\n")
def set_quoted_literals_enable(self, en: bool) -> None:
"""
Set whether quoted string literals are enabled. If disabled, quotes will be rolled into normal string token
parsing.
For instance, if quoted literals are disabled, `"Hello World"` would be interpreted as `"Hello`, `World"`.
"""
self.quoted_literal_enable = en
self.string_literal_stop = _str_switch(
self.string_literal_stop,
self._string_literal_stop_quoted,
en
)
def set_comments_enable(self, en: bool) -> None:
"""
Set whether comments are enabled. If disabled, the comment character will be ignored, and anything that
would be a comment will be treated as ordinary code.
"""
self.comments_enable = en
self.string_literal_stop = _str_switch(
self.string_literal_stop,
self._string_literal_stop_comment,
en
)
def set_newlines_separate_strings(self, en: bool) -> None:
"""
Set whether newlines separate string literals. This can only be modified if
`Tokenizer.set_single_char_token_enable` has been set to `False`, and will raise a
`scrolls.errors.ScrollError` otherwise.
By default, when `Tokenizer.set_single_char_token_enable` is set to `False`, newlines will instead be
considered whitespace, and will separate strings without producing
`scrolls.ast.ast_constants.TokenType.COMMAND_SEP` tokens.
To override this behavior, this function may be set to `False`. In this case, newlines will be rolled into
string literals, and ONLY spaces and tabs will separate string literals.
"""
if self.single_char_token_enable:
raise base_errors.ScrollError("Cannot use set_newlines_separate_strings when single char tokens are enabled.")
self.newlines_separate_strings = en
self.string_literal_stop = _str_switch(
self.string_literal_stop,
"\n",
en
)
self.whitespace = _str_switch(
self.whitespace,
"\n",
en
)
def error(
self,
err_type: t.Type[base_errors.PositionalError],
message: str,
line: t.Optional[int] = None,
pos: t.Optional[int] = None
) -> t.NoReturn:
if line is not None:
_line = line
else:
_line = self.stream.current_line()
if pos is not None:
_pos = pos
else:
_pos = self.stream.current_pos()
raise err_type(
_line,
_pos,
self.stream.history(),
message
)
def forbid_eof(self, msg: str = "", *args: t.Any, **kwargs: t.Any) -> None:
if not msg:
msg = "Unexpected EOF while parsing script."
if self.stream.at_eof() or self.stream.after_eof():
self.error(ast_errors.TokenizeEofError, msg.format(*args, **kwargs))
def next_n_chars(self, n: int) -> str:
"""
Unconditionally consume N characters and return them.
"""
chars: t.MutableSequence[str] = []
for _ in range(n):
self.forbid_eof(
"Ran into EOF while consuming characters. Got {}, wanted {}.",
len(chars), n
)
chars.append(self.stream.get_char())
self.stream.next_char()
return "".join(chars)
# Get a single char token.
def accept_single_char(self) -> t.Optional[Token]:
if not self.single_char_token_enable:
return None
char = self.stream.get_char()
if char in self.charmap:
tok = Token(
self.charmap[char],
char,
self.stream.current_line(),
self.stream.current_pos(),
self
)
self.stream.next_char()
return tok
return None
def accept_eof(self) -> t.Optional[Token]:
if self.stream.at_eof():
# Once an EOF is generated, there are no more tokens.
# Any attempts after this to generate a token will
# result in an exception.
self.stream.next_char() # Put stream into after eof state
return Token(
TokenType.EOF,
EOF,
self.stream.current_line(),
self.stream.current_pos(),
self
)
else:
return None
def accept_whitespace(self) -> t.Optional[Token]:
char = self.stream.get_char()
if char in self.whitespace:
self.stream.next_char()
return Token(
TokenType.WHITESPACE,
char,
self.stream.current_line(),
self.stream.current_pos(),
self
)
return None
def try_consume_escape(self) -> t.Optional[str]:
if self.stream.get_char() != ESCAPE_SIGIL:
return None
self.stream.next_char()
self.forbid_eof()
escape_char = self.stream.get_char()
if escape_char not in self.escape_sequences:
self.error(ast_errors.TokenizeError, f"Invalid escape '{escape_char}'")
self.stream.next_char()
self.forbid_eof()
replacement = self.escape_sequences[escape_char]
if isinstance(replacement, str):
return replacement
elif callable(replacement):
return replacement(self)
else:
raise TypeError(f"Bad type for escape sequence {escape_char}, "
"must be 'str' or '(Tokenizer) -> str'")
def accept_string_literal(
self,
stop_chars: t.Sequence[str] = (),
error_on_eof: bool = False,
allow_escapes: bool = False
) -> t.Optional[Token]:
self.forbid_eof("String literal should not start on EOF")
char = self.stream.get_char()
pos = self.stream.current_pos()
line = self.stream.current_line()
chars = []
while char not in stop_chars:
if allow_escapes:
escape = self.try_consume_escape()
if escape is not None:
chars.append(escape)
char = self.stream.get_char()
continue
chars.append(char)
self.stream.next_char()
if self.stream.at_eof():
if error_on_eof:
self.error(
ast_errors.TokenizeEofError,
"Unexpected EOF while parsing string literal."
)
else:
break
char = self.stream.get_char()
return Token(
TokenType.STRING_LITERAL,
"".join(chars),
line,
pos,
self
)
def accept_comment(self) -> t.Optional[Token]:
if not self.comments_enable:
return None
char = self.stream.get_char()
pos = self.stream.current_pos()
line = self.stream.current_line()
chars = []
if char != COMMENT_SIGIL:
return None
self.stream.next_char()
while char != "\n":
chars.append(char)
self.stream.next_char()
if self.stream.at_eof():
break
char = self.stream.get_char()
return Token(
TokenType.COMMENT,
"".join(chars),
line,
pos,
self
)
# Accepts a normal string literal. No CONSUME_REST, not quoted.
def accept_string_literal_normal(self) -> t.Optional[Token]:
return self.accept_string_literal(
stop_chars=self.string_literal_stop,
error_on_eof=False # Just stop on EOF, no errors.
)
# Accept a CONSUME_REST literal.
def accept_string_literal_consume_rest(self) -> t.Optional[Token]:
return self.accept_string_literal(
stop_chars=self.consume_rest_stop,
error_on_eof=False # Stop on EOF. No errors.
)
# Accept a quoted string literal.
def accept_string_literal_quoted(self) -> t.Optional[Token]:
if not self.quoted_literal_enable:
return None
if self.stream.get_char() != QUOTE:
return None
else:
self.stream.next_char()
literal = self.accept_string_literal(
stop_chars=self.quoted_literal_stop,
error_on_eof=True, # Quoted literals must be closed.
allow_escapes=True # Escapes only allowed in quoted literals.
)
if literal is None:
self.error(
ast_errors.TokenizeError,
"internal: Got None from accept_string_literal, shouldn't have."
)
if self.stream.get_char() != QUOTE:
self.error(
ast_errors.TokenizeError,
"internal: Missing end quote, should have resulted in EOF error."
)
else:
self.stream.next_char()
return literal
@staticmethod
def accept_any_of(*f: t.Callable[[], t.Optional[Token]]) -> t.Optional[Token]:
for fun in f:
tok = fun()
if tok is not None:
return tok
return None
def handle_consume_rest_off(self, tok: Token) -> None:
if tok.type in (TokenType.COMMAND_SEP, TokenType.CLOSE_BLOCK, TokenType.CLOSE_ARGS):
self.previous_token_was_sep = True
return
# Test to see if we should enter CONSUME_REST state.
# Only trigger CONSUME_REST if the previous token was a command separator.
should_enter_consume_rest = (
self.previous_token_was_sep and
tok.type == TokenType.STRING_LITERAL and
tok.value in self.consume_rest_triggers
)
self.previous_token_was_sep = False
if should_enter_consume_rest:
count = self.consume_rest_triggers[tok.value]
if count == 0:
self.consume_rest_state = TokenizeConsumeRestState.CONSUME
else:
self.consume_rest_state = TokenizeConsumeRestState.COUNTING
self.consume_rest_count = count
def handle_consume_rest_counting(self, tok: Token) -> None:
self.previous_token_was_sep = False
# Only count down on string literals.
if tok.type == TokenType.STRING_LITERAL:
self.consume_rest_count -= 1
# Once countdown is over, CONSUME_REST on next token.
if self.consume_rest_count == 0:
self.consume_rest_state = TokenizeConsumeRestState.CONSUME
# If we get any other token type, then cancel CONSUME_REST
else:
self.consume_rest_state = TokenizeConsumeRestState.OFF
self.consume_rest_count = 0
def handle_consume_rest_consume(self, tok: Token) -> None:
# This function runs AFTER a CONSUME_REST consumption. So, just set consume_rest back to OFF.
self.consume_rest_state = TokenizeConsumeRestState.OFF
self.consume_rest_count = 0
# TODO
# Consume rest state handler. All this code is pretty ugly, and does not account
# for more advanced usage.
def handle_consume_rest(self, tok: Token) -> None:
f_map: t.Mapping[TokenizeConsumeRestState, t.Callable[[Token], None]] = {
TokenizeConsumeRestState.OFF: self.handle_consume_rest_off,
TokenizeConsumeRestState.COUNTING: self.handle_consume_rest_counting,
TokenizeConsumeRestState.CONSUME: self.handle_consume_rest_consume
}
f_map[self.consume_rest_state](tok)
def next_token(self) -> Token:
"""
Extract the next token. If the tokenizing is finished, this will return a `Token` of type
`scrolls.ast.ast_constants.TokenType.EOF`
Raises:
`scrolls.ast.ast_errors.TokenizeEofError`: If EOF was hit unexpectedly.
`scrolls.ast.ast_errors.TokenizeError`: If a generic issue happened while tokenizing.
"""
if self.consume_rest_state == TokenizeConsumeRestState.CONSUME:
while True:
tok = self.accept_any_of(
self.accept_whitespace
)
if tok is None:
break
if tok.type == TokenType.WHITESPACE:
continue
tok = self.accept_string_literal_consume_rest()
if tok is None:
self.error(
ast_errors.TokenizeError,
"Got bad string literal during consume_rest"
)
logger.debug(f"tokenize: Got token {tok.type.name}:{repr(tok.value)}")
tok.consume_rest = True # Signal we got this token using CONSUME_REST
self.handle_consume_rest(tok)
return tok
else:
while True:
if self.stream.after_eof():
self.error(
ast_errors.TokenizeEofError,
"No more tokens."
)
tok = None
try:
tok = self.accept_any_of(
self.accept_whitespace,
self.accept_comment,
self.accept_single_char,
self.accept_string_literal_quoted,
self.accept_string_literal_normal
)
except ast_errors.StreamEofError:
# !!! HACK
# I really, really need to rethink how EOF is handled
# throughout this entire module. It's broken.
pass
if tok is None:
# If tok is None, then all tokenizing functions got
# rejected. So, try to accept and return EOF.
eof_tok = self.accept_eof()
if eof_tok is None:
self.error(
ast_errors.TokenizeError,
"Unexpectedly rejected all tokenizing functions."
)
else:
return eof_tok
# Loop until we get a non-whitespace, non-comment token.
if tok.type not in [TokenType.WHITESPACE, TokenType.COMMENT]:
logger.debug(f"tokenize: Got token {tok.type.name}:{repr(tok.value)}")
self.handle_consume_rest(tok)
return tok
def get_all_tokens(self) -> t.Sequence[Token]:
"""
Extracts all tokens at once, until the end of the script. A sequence of tokens obtained this way
will always end with a token of type `scrolls.ast.ast_constants.TokenType.EOF`.
Raises:
`scrolls.ast.ast_errors.TokenizeEofError`: If EOF was hit unexpectedly.
`scrolls.ast.ast_errors.TokenizeError`: If a generic issue happened while tokenizing.
"""
tokens: t.MutableSequence[Token] = []
while True:
tok = self.next_token()
tokens.append(tok)
if tok.type == TokenType.EOF:
return tokens
Classes
class Token (type: TokenType, value: str, line: int, position: int, tokenizer: Tokenizer, consume_rest: bool = False)
A token.
Class variables
var consume_rest : bool
Sets whether this token was generated by CONSUME_REST.
var line : int
The line this token started generating on. Some tokens may span multiple lines.
var position : int
The column along the line that this token started generating on.
var tokenizer : Tokenizer
The tokenizer that generated this token.
var type : TokenType
The type of this token.
var value : str
The value of this token.
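An illustrative sketch (not part of the generated documentation) of reading these fields off a token produced by the Tokenizer documented below:

from scrolls.ast.tokenizer import Tokenizer

tok = Tokenizer("print hello").next_token()
print(tok.type.name, repr(tok.value), tok.line, tok.position)
print(tok)  # __str__ renders as "<type name>:<repr of value>"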
class Tokenizer (stream: Union[str, CharStream], consume_rest_triggers: Mapping[str, int] = mappingproxy({}))
The tokenizer. This class is responsible for identifying meaningful pieces of scripts (such as string literals, block open and close, etc.), and tagging them.
Warning
If the tokenizer is supplied with a string, then this Tokenizer is single use. If you wish to stream input, implement a CharStream. See StringStream.feed() and see if that works for you. See REPLStream for an example of streaming input from a user.
Args
stream: The script to tokenize. This may be a string or a CharStream instance.
consume_rest_triggers: Triggers for CONSUME_REST.
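An illustrative usage sketch (not from the library's documentation). Based on the state handlers in the source above, consume_rest_triggers appears to map a command name to the number of argument literals to leave untouched before the remainder of the command is emitted as a single literal; the script and the "set" trigger below are hypothetical:

from scrolls.ast.tokenizer import Tokenizer

script = "set greeting hello there\nprint greeting"
tokenizer = Tokenizer(script, consume_rest_triggers={"set": 1})
for token in tokenizer.get_all_tokens():
    print(token)
# "set" and "greeting" come through as ordinary string literals, the trigger
# then causes "hello there" to be consumed as one literal, and the sequence
# ends with an EOF token.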
Static methods
def accept_any_of(*f: Callable[[], Optional[Token]]) -> Optional[Token]
Methods
def accept_comment(self) -> Optional[Token]
def accept_eof(self) -> Optional[Token]
def accept_single_char(self) -> Optional[Token]
def accept_string_literal(self, stop_chars: Sequence[str] = (), error_on_eof: bool = False, allow_escapes: bool = False) -> Optional[Token]
def accept_string_literal_consume_rest(self) -> Optional[Token]
def accept_string_literal_normal(self) -> Optional[Token]
def accept_string_literal_quoted(self) -> Optional[Token]
def accept_whitespace(self) -> Optional[Token]
def error(self, err_type: Type[PositionalError], message: str, line: Optional[int] = None, pos: Optional[int] = None) -> NoReturn
def forbid_eof(self, msg: str = '', *args: Any, **kwargs: Any) -> None
def get_all_tokens(self) -> Sequence[Token]
Extracts all tokens at once, until the end of the script. A sequence of tokens obtained this way will always end with a token of type TokenType.EOF.
Raises
TokenizeEofError: If EOF was hit unexpectedly.
TokenizeError: If a generic issue happened while tokenizing.
def handle_consume_rest(self, tok: Token) -> None
def handle_consume_rest_consume(self, tok: Token) -> None
def handle_consume_rest_counting(self, tok: Token) -> None
def handle_consume_rest_off(self, tok: Token) -> None
def next_n_chars(self, n: int) -> str
Unconditionally consume N characters and return them.
def next_token(self) -> Token
Extract the next token. If the tokenizing is finished, this will return a Token of type TokenType.EOF.
Raises
TokenizeEofError: If EOF was hit unexpectedly.
TokenizeError: If a generic issue happened while tokenizing.
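A minimal sketch of pulling tokens one at a time instead of using get_all_tokens():

from scrolls.ast.ast_constants import TokenType
from scrolls.ast.tokenizer import Tokenizer

tokenizer = Tokenizer("print hello; print world")
while True:
    tok = tokenizer.next_token()
    print(tok)
    if tok.type == TokenType.EOF:
        break  # calling next_token() again after EOF raises TokenizeEofError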
def set_comments_enable(self, en: bool) -> None
Set whether comments are enabled. If disabled, the comment character will be ignored, and anything that would be a comment will be treated as ordinary code.
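An illustrative sketch of the effect; the comment sigil is imported rather than assumed to be a particular character:

from scrolls.ast.ast_constants import COMMENT_SIGIL
from scrolls.ast.tokenizer import Tokenizer

script = f"print hello {COMMENT_SIGIL} trailing note"
with_comments = Tokenizer(script)       # the comment is skipped by next_token()
no_comments = Tokenizer(script)
no_comments.set_comments_enable(False)  # sigil and note become string literals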
def set_consume_rest_all(self, consume_all: bool) -> None
Set whether CONSUME_REST consumes until EOF. Defaults to False.
If False, CONSUME_REST will stop on block open/close, and command separators.
If True, CONSUME_REST will not stop until EOF.
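An illustrative sketch of the difference, using a hypothetical "say" trigger that consumes the rest of a command immediately:

from scrolls.ast.tokenizer import Tokenizer

script = "say first line\nsay second line"

default = Tokenizer(script, consume_rest_triggers={"say": 0})
# CONSUME_REST stops at the newline: "first line" is one literal, and the
# second command is tokenized normally afterwards.

greedy = Tokenizer(script, consume_rest_triggers={"say": 0})
greedy.set_consume_rest_all(True)
# CONSUME_REST runs to EOF: "first line\nsay second line" becomes one literal.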
def set_newlines_separate_strings(self, en: bool) -> None
Set whether newlines separate string literals. This can only be modified if Tokenizer.set_single_char_token_enable() has been set to False, and will raise a ScrollError otherwise.
By default, when Tokenizer.set_single_char_token_enable() is set to False, newlines will instead be considered whitespace, and will separate strings without producing TokenType.COMMAND_SEP tokens.
To override this behavior, call this function with False. In this case, newlines will be rolled into string literals, and ONLY spaces and tabs will separate string literals.
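An illustrative sketch of the override, assuming single-character tokens have already been disabled:

from scrolls.ast.tokenizer import Tokenizer

tokenizer = Tokenizer("alpha beta\ngamma")
tokenizer.set_single_char_token_enable(False)   # newlines now act as whitespace
tokenizer.set_newlines_separate_strings(False)  # ...unless overridden like this
# With the override, "beta\ngamma" is read as one string literal. Calling
# set_newlines_separate_strings while single character tokens are still
# enabled raises scrolls.errors.ScrollError.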
def set_quoted_literals_enable(self, en: bool) -> None
Set whether quoted string literals are enabled. If disabled, quotes will be rolled into normal string token parsing.
For instance, if quoted literals are disabled, `"Hello World"` would be interpreted as `"Hello`, `World"`.
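An illustrative sketch of the two behaviors side by side:

from scrolls.ast.tokenizer import Tokenizer

quoted = Tokenizer('say "Hello World"')
# Default: the quoted phrase is one STRING_LITERAL with the value 'Hello World'.

unquoted = Tokenizer('say "Hello World"')
unquoted.set_quoted_literals_enable(False)
# Quotes are no longer special: the literals come through as '"Hello' and 'World"'.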
def set_single_char_token_enable(self, en: bool) -> None
Set whether single character tokens should be parsed. This includes ALL token types except for TokenType.STRING_LITERAL and TokenType.COMMENT. Defaults to True.
If False, then all special characters that would otherwise be their own token will be rolled into string literals.
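An illustrative sketch of the effect on a command separator:

from scrolls.ast.tokenizer import Tokenizer

tokenizer = Tokenizer("print hello; print world")
tokenizer.set_single_char_token_enable(False)
# ";" no longer produces a COMMAND_SEP token; "hello;" is read as one string
# literal. With the default setting, ";" is emitted as its own COMMAND_SEP token.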
def try_consume_escape(self) -> Optional[str]