Module scrolls.ast.tokenizer
The tokenizer implementation.
See ast: Tokenizing.
"""
The tokenizer implementation.
See [ast: Tokenizing](../ast/index.html#tokenizing).
"""
import dataclasses
import logging
import types
import typing as t
from scrolls import errors as base_errors
from . import ast_errors, streams
from .ast_constants import (BLOCK_CLOSE, BLOCK_OPEN, CLOSE_ARGS, COMMAND_SEP,
COMMENT_SIGIL, CONTROL_SIGIL, EOF, ESCAPE_SIGIL,
EXPANSION_SIGIL, OPEN_ARGS, QUOTE, SPREAD_SIGIL,
TokenizeConsumeRestState, TokenType)
__all__ = (
"Token",
"Tokenizer"
)
logger = logging.getLogger(__name__)
def _str_ensure(s: str, ensure: str) -> str:
if ensure not in s:
return s + ensure
else:
return s
def _str_remove(s: str, remove: str) -> str:
return s.replace(remove, "")
def _str_switch(s: str, switch: str, en: bool) -> str:
"""
Utility function for enabling/disabling detection of certain characters.
"""
if en:
return _str_ensure(s, switch)
else:
return _str_remove(s, switch)
@dataclasses.dataclass
class Token:
"""A token."""
type: TokenType
"""The type of this token."""
value: str
"""The value of this token."""
line: int
"""The line this token *started* generating on. Some tokens may span multiple lines."""
position: int
"""The column along the line that this token *started* generating on. """
tokenizer: "Tokenizer"
"""The tokenizer that generated this token."""
consume_rest: bool = False
"""Sets whether this token was generated by CONSUME_REST."""
def __str__(self) -> str:
return f"{self.type.name}:{repr(self.value)}"
class Tokenizer:
"""
The tokenizer. This class is responsible for identifying meaningful pieces of scripts
(such as string literals, block open and close, etc.), and tagging them.
.. WARNING::
If the tokenizer is supplied with a string, then this `Tokenizer` is **single use**.
If you wish to stream input, implement a `scrolls.ast.streams.CharStream`. See
`scrolls.ast.streams.StringStream.feed` and see if that works for you.
See `scrolls.ast.streams.REPLStream` for an example of streaming input
from a user.
Args:
stream: The script to tokenize. This may be a string or a `scrolls.ast.streams.CharStream` instance.
consume_rest_triggers: Triggers for CONSUME_REST.
"""
def __init__(
self,
stream: t.Union[str, streams.CharStream],
consume_rest_triggers: t.Mapping[str, int] = types.MappingProxyType({})
):
if isinstance(stream, str):
self.stream: streams.CharStream = streams.StringStream(stream.strip())
else:
self.stream = stream
self.consume_rest_triggers = consume_rest_triggers
self.consume_rest_state = TokenizeConsumeRestState.OFF
self.consume_rest_count = 0
self.previous_token_was_sep = True
self.whitespace = "\t "
# Map of single characters to token types
self.charmap = {
"\n": TokenType.COMMAND_SEP,
";": TokenType.COMMAND_SEP,
OPEN_ARGS: TokenType.OPEN_ARGS,
CLOSE_ARGS: TokenType.CLOSE_ARGS,
BLOCK_OPEN: TokenType.OPEN_BLOCK,
BLOCK_CLOSE: TokenType.CLOSE_BLOCK,
EXPANSION_SIGIL: TokenType.EXPANSION_SIGIL,
CONTROL_SIGIL: TokenType.CONTROL_SIGIL,
SPREAD_SIGIL: TokenType.SPREAD_SIGIL
}
self.escape_sequences: t.MutableMapping[
str,
t.Union[str, t.Callable[[Tokenizer], str]]
] = {
"n": "\n",
"t": "\t",
"r": "\r",
ESCAPE_SIGIL: ESCAPE_SIGIL,
QUOTE: QUOTE,
"u": Tokenizer._unicode_escape
}
# Set up stop chars for unquoted string literals.
self._string_literal_always_stop = self.whitespace
self._string_literal_stop_quoted = QUOTE
self._string_literal_stop_comment = COMMENT_SIGIL
# Note: Add an exception for newlines. Even when we don't consider newlines to be command separators,
# we would normally want newlines to separate string literals. So remove \n from this switch.
self._string_literal_stop_single_char = _str_remove(
"".join(self.charmap.keys()),
"\n"
)
# Override flag for behavior when single_char_token_enable is False.
self.newlines_separate_strings = True
self.string_literal_stop: str = self._string_literal_always_stop
self.single_char_token_enable = True
self.set_single_char_token_enable(True)
# Set up stop chars for CONSUME_REST.
self._consume_rest_stop_switch: str = "".join([*COMMAND_SEP, BLOCK_CLOSE, BLOCK_OPEN])
self.consume_rest_stop: str = ""
self.set_consume_rest_all(False)
# Set up stop chars for quoted literals.
self.quoted_literal_stop: str = QUOTE # For now, quoted literals ONLY stop on another quote.
self.quoted_literal_enable = True
self.set_quoted_literals_enable(True)
# Set up stop chars for comments. (Note: No need for specific comment stop char here, it's hardcoded to
# be \n at the moment.)
self.comments_enable = True
self.set_comments_enable(True)
def _unicode_escape(self) -> str:
code_point = "" # Initialization not needed, just satisfies some linters.
try:
code_point = self.next_n_chars(4)
except ast_errors.TokenizeEofError:
self.error(
ast_errors.TokenizeEofError,
"Ran off end of script trying to parse unicode escape."
)
if QUOTE in code_point:
self.error(
ast_errors.TokenizeError,
f"Encountered {QUOTE} while consuming unicode escape.",
pos=self.stream.current_pos() - 4
)
char = ""
try:
char = chr(int(code_point, 16))
except ValueError:
self.error(
ast_errors.TokenizeError,
f"Bad hex number {code_point}.",
pos=self.stream.current_pos() - 4
)
return char
def set_consume_rest_all(self, consume_all: bool) -> None:
"""
Set whether CONSUME_REST consumes until EOF. Defaults to `False`.
If `False`, CONSUME_REST will stop on block open/close, and command separators.
If `True`, CONSUME_REST will not stop until EOF.
"""
self.consume_rest_stop = _str_switch(
self.consume_rest_stop,
self._consume_rest_stop_switch,
not consume_all
)
def set_single_char_token_enable(self, en: bool) -> None:
"""
Set whether single character tokens should be parsed. This includes ALL token types except for
`scrolls.ast.ast_constants.TokenType.STRING_LITERAL` and `scrolls.ast.ast_constants.TokenType.COMMENT`.
Defaults to `True`.
If `False`, then all special characters that would otherwise be their own token will be rolled
into string literals.
"""
if self.newlines_separate_strings and not self.single_char_token_enable and en:
# If we're re-enabling single char tokens and the newline separator behavior is still on,
# we need to undo that first.
self.set_newlines_separate_strings(False)
self.single_char_token_enable = en
self.string_literal_stop = _str_switch(
self.string_literal_stop,
self._string_literal_stop_single_char,
en
)
if not en:
self.set_newlines_separate_strings(True)
else:
# If single char tokens are enabled, newlines must stop string literals for this to work properly.
self.string_literal_stop = _str_ensure(self.string_literal_stop, "\n")
def set_quoted_literals_enable(self, en: bool) -> None:
"""
Set whether quoted string literals are enabled. If disabled, quotes will be rolled into normal string token
parsing.
For instance, if quoted literals are disabled, `"Hello World"` would be interpreted as `"Hello`, `World"`.
"""
self.quoted_literal_enable = en
self.string_literal_stop = _str_switch(
self.string_literal_stop,
self._string_literal_stop_quoted,
en
)
def set_comments_enable(self, en: bool) -> None:
"""
Set whether comments are enabled. If disabled, the comment character will be ignored, and anything that
would be a comment will be treated as ordinary code.
"""
self.comments_enable = en
self.string_literal_stop = _str_switch(
self.string_literal_stop,
self._string_literal_stop_comment,
en
)
def set_newlines_separate_strings(self, en: bool) -> None:
"""
Set whether newlines separate string literals. This can only be modified if
`Tokenizer.set_single_char_token_enable` has been set to `False`, and will raise a
`scrolls.errors.ScrollError` otherwise.
By default, when `Tokenizer.set_single_char_token_enable` is set to `False`, newlines will instead be
considered whitespace, and will separate strings without producing
`scrolls.ast.ast_constants.TokenType.COMMAND_SEP` tokens.
To override this behavior, this function may be set to `False`. In this case, newlines will be rolled into
string literals, and ONLY spaces and tabs will separate string literals.
"""
if self.single_char_token_enable:
raise base_errors.ScrollError("Cannot use set_newlines_separate_strings when single char tokens are enabled.")
self.newlines_separate_strings = en
self.string_literal_stop = _str_switch(
self.string_literal_stop,
"\n",
en
)
self.whitespace = _str_switch(
self.whitespace,
"\n",
en
)
def error(
self,
err_type: t.Type[base_errors.PositionalError],
message: str,
line: t.Optional[int] = None,
pos: t.Optional[int] = None
) -> t.NoReturn:
if line is not None:
_line = line
else:
_line = self.stream.current_line()
if pos is not None:
_pos = pos
else:
_pos = self.stream.current_pos()
raise err_type(
_line,
_pos,
self.stream.history(),
message
)
def forbid_eof(self, msg: str = "", *args: t.Any, **kwargs: t.Any) -> None:
if not msg:
msg = "Unexpected EOF while parsing script."
if self.stream.at_eof() or self.stream.after_eof():
self.error(ast_errors.TokenizeEofError, msg.format(*args, **kwargs))
def next_n_chars(self, n: int) -> str:
"""
Unconditionally consume N characters and return them.
"""
chars: t.MutableSequence[str] = []
for _ in range(n):
self.forbid_eof(
"Ran into EOF while consuming characters. Got {}, wanted {}.",
len(chars), n
)
chars.append(self.stream.get_char())
self.stream.next_char()
return "".join(chars)
# Get a single char token.
def accept_single_char(self) -> t.Optional[Token]:
if not self.single_char_token_enable:
return None
char = self.stream.get_char()
if char in self.charmap:
tok = Token(
self.charmap[char],
char,
self.stream.current_line(),
self.stream.current_pos(),
self
)
self.stream.next_char()
return tok
return None
def accept_eof(self) -> t.Optional[Token]:
if self.stream.at_eof():
# Once an EOF is generated, there are no more tokens.
# Any attempts after this to generate a token will
# result in an exception.
self.stream.next_char() # Put stream into after eof state
return Token(
TokenType.EOF,
EOF,
self.stream.current_line(),
self.stream.current_pos(),
self
)
else:
return None
def accept_whitespace(self) -> t.Optional[Token]:
char = self.stream.get_char()
if char in self.whitespace:
self.stream.next_char()
return Token(
TokenType.WHITESPACE,
char,
self.stream.current_line(),
self.stream.current_pos(),
self
)
return None
def try_consume_escape(self) -> t.Optional[str]:
if self.stream.get_char() != ESCAPE_SIGIL:
return None
self.stream.next_char()
self.forbid_eof()
escape_char = self.stream.get_char()
if escape_char not in self.escape_sequences:
self.error(ast_errors.TokenizeError, f"Invalid escape '{escape_char}'")
self.stream.next_char()
self.forbid_eof()
replacement = self.escape_sequences[escape_char]
if isinstance(replacement, str):
return replacement
elif callable(replacement):
return replacement(self)
else:
raise TypeError(f"Bad type for escape sequence {escape_char}, "
"must be 'str' or '(Tokenizer) -> str'")
def accept_string_literal(
self,
stop_chars: t.Sequence[str] = (),
error_on_eof: bool = False,
allow_escapes: bool = False
) -> t.Optional[Token]:
self.forbid_eof("String literal should not start on EOF")
char = self.stream.get_char()
pos = self.stream.current_pos()
line = self.stream.current_line()
chars = []
while char not in stop_chars:
if allow_escapes:
escape = self.try_consume_escape()
if escape is not None:
chars.append(escape)
char = self.stream.get_char()
continue
chars.append(char)
self.stream.next_char()
if self.stream.at_eof():
if error_on_eof:
self.error(
ast_errors.TokenizeEofError,
"Unexpected EOF while parsing string literal."
)
else:
break
char = self.stream.get_char()
return Token(
TokenType.STRING_LITERAL,
"".join(chars),
line,
pos,
self
)
def accept_comment(self) -> t.Optional[Token]:
if not self.comments_enable:
return None
char = self.stream.get_char()
pos = self.stream.current_pos()
line = self.stream.current_line()
chars = []
if char != COMMENT_SIGIL:
return None
self.stream.next_char()
while char != "\n":
chars.append(char)
self.stream.next_char()
if self.stream.at_eof():
break
char = self.stream.get_char()
return Token(
TokenType.COMMENT,
"".join(chars),
line,
pos,
self
)
# Accepts a normal string literal. No CONSUME_REST, not quoted.
def accept_string_literal_normal(self) -> t.Optional[Token]:
return self.accept_string_literal(
stop_chars=self.string_literal_stop,
error_on_eof=False # Just stop on EOF, no errors.
)
# Accept a CONSUME_REST literal.
def accept_string_literal_consume_rest(self) -> t.Optional[Token]:
return self.accept_string_literal(
stop_chars=self.consume_rest_stop,
error_on_eof=False # Stop on EOF. No errors.
)
# Accept a quoted string literal.
def accept_string_literal_quoted(self) -> t.Optional[Token]:
if not self.quoted_literal_enable:
return None
if self.stream.get_char() != QUOTE:
return None
else:
self.stream.next_char()
literal = self.accept_string_literal(
stop_chars=self.quoted_literal_stop,
error_on_eof=True, # Quoted literals must be closed.
allow_escapes=True # Escapes only allowed in quoted literals.
)
if literal is None:
self.error(
ast_errors.TokenizeError,
"internal: Got None from accept_string_literal, shouldn't have."
)
if self.stream.get_char() != QUOTE:
self.error(
ast_errors.TokenizeError,
"internal: Missing end quote, should have resulted in EOF error."
)
else:
self.stream.next_char()
return literal
@staticmethod
def accept_any_of(*f: t.Callable[[], t.Optional[Token]]) -> t.Optional[Token]:
for fun in f:
tok = fun()
if tok is not None:
return tok
return None
def handle_consume_rest_off(self, tok: Token) -> None:
if tok.type in (TokenType.COMMAND_SEP, TokenType.CLOSE_BLOCK, TokenType.CLOSE_ARGS):
self.previous_token_was_sep = True
return
# Test to see if we should enter CONSUME_REST state.
# Only trigger CONSUME_REST if the previous token was a command separator.
should_enter_consume_rest = (
self.previous_token_was_sep and
tok.type == TokenType.STRING_LITERAL and
tok.value in self.consume_rest_triggers
)
self.previous_token_was_sep = False
if should_enter_consume_rest:
count = self.consume_rest_triggers[tok.value]
if count == 0:
self.consume_rest_state = TokenizeConsumeRestState.CONSUME
else:
self.consume_rest_state = TokenizeConsumeRestState.COUNTING
self.consume_rest_count = count
def handle_consume_rest_counting(self, tok: Token) -> None:
self.previous_token_was_sep = False
# Only count down on string literals.
if tok.type == TokenType.STRING_LITERAL:
self.consume_rest_count -= 1
# Once countdown is over, CONSUME_REST on next token.
if self.consume_rest_count == 0:
self.consume_rest_state = TokenizeConsumeRestState.CONSUME
# If we get any other token type, then cancel CONSUME_REST
else:
self.consume_rest_state = TokenizeConsumeRestState.OFF
self.consume_rest_count = 0
def handle_consume_rest_consume(self, tok: Token) -> None:
# This function runs AFTER a CONSUME_REST consumption. So, just set consume_rest back to OFF.
self.consume_rest_state = TokenizeConsumeRestState.OFF
self.consume_rest_count = 0
# TODO
# Consume rest state handler. All this code is pretty ugly, and does not account
# for more advanced usage.
def handle_consume_rest(self, tok: Token) -> None:
f_map: t.Mapping[TokenizeConsumeRestState, t.Callable[[Token], None]] = {
TokenizeConsumeRestState.OFF: self.handle_consume_rest_off,
TokenizeConsumeRestState.COUNTING: self.handle_consume_rest_counting,
TokenizeConsumeRestState.CONSUME: self.handle_consume_rest_consume
}
f_map[self.consume_rest_state](tok)
def next_token(self) -> Token:
"""
Extract the next token. If the tokenizing is finished, this will return a `Token` of type
`scrolls.ast.ast_constants.TokenType.EOF`
Raises:
`scrolls.ast.ast_errors.TokenizeEofError`: If EOF was hit unexpectedly.
`scrolls.ast.ast_errors.TokenizeError`: If a generic issue happened while tokenizing.
"""
if self.consume_rest_state == TokenizeConsumeRestState.CONSUME:
while True:
tok = self.accept_any_of(
self.accept_whitespace
)
if tok is None:
break
if tok.type == TokenType.WHITESPACE:
continue
tok = self.accept_string_literal_consume_rest()
if tok is None:
self.error(
ast_errors.TokenizeError,
"Got bad string literal during consume_rest"
)
logger.debug(f"tokenize: Got token {tok.type.name}:{repr(tok.value)}")
tok.consume_rest = True # Signal we got this token using CONSUME_REST
self.handle_consume_rest(tok)
return tok
else:
while True:
if self.stream.after_eof():
self.error(
ast_errors.TokenizeEofError,
"No more tokens."
)
tok = None
try:
tok = self.accept_any_of(
self.accept_whitespace,
self.accept_comment,
self.accept_single_char,
self.accept_string_literal_quoted,
self.accept_string_literal_normal
)
except ast_errors.StreamEofError:
# !!! HACK
# I really, really need to rethink how EOF is handled
# throughout this entire module. It's broken.
pass
if tok is None:
# If tok is None, then all tokenizing functions got
# rejected. So, try to accept and return EOF.
eof_tok = self.accept_eof()
if eof_tok is None:
self.error(
ast_errors.TokenizeError,
"Unexpectedly rejected all tokenizing functions."
)
else:
return eof_tok
# Loop until we get a non-whitespace, non-comment token.
if tok.type not in [TokenType.WHITESPACE, TokenType.COMMENT]:
logger.debug(f"tokenize: Got token {tok.type.name}:{repr(tok.value)}")
self.handle_consume_rest(tok)
return tok
def get_all_tokens(self) -> t.Sequence[Token]:
"""
Extracts all tokens at once, until the end of the script. A sequence of tokens obtained this way
will always end with a token of type `scrolls.ast.ast_constants.TokenType.EOF`.
Raises:
`scrolls.ast.ast_errors.TokenizeEofError`: If EOF was hit unexpectedly.
`scrolls.ast.ast_errors.TokenizeError`: If a generic issue happened while tokenizing.
"""
tokens: t.MutableSequence[Token] = []
while True:
tok = self.next_token()
tokens.append(tok)
if tok.type == TokenType.EOF:
return tokens
Classes
class Token (type: TokenType, value: str, line: int, position: int, tokenizer: Tokenizer, consume_rest: bool = False)
A token.
Class variables
var consume_rest : bool
Sets whether this token was generated by CONSUME_REST.
var line : int
The line this token started generating on. Some tokens may span multiple lines.
var position : int
The column along the line that this token started generating on.
var tokenizer : Tokenizer
The tokenizer that generated this token.
var type : TokenType
The type of this token.
var value : str
The value of this token.
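An illustrative sketch (not part of the generated documentation) of reading these fields off a token produced by the Tokenizer documented below:

from scrolls.ast.tokenizer import Tokenizer

tok = Tokenizer("print hello").next_token()
print(tok.type.name, repr(tok.value), tok.line, tok.position)
print(tok)  # __str__ renders as "<type name>:<repr of value>"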
class Tokenizer (stream: Union[str, CharStream], consume_rest_triggers: Mapping[str, int] = mappingproxy({}))
The tokenizer. This class is responsible for identifying meaningful pieces of scripts (such as string literals, block open and close, etc.), and tagging them.
Warning
If the tokenizer is supplied with a string, then this Tokenizer is single use. If you wish to stream input, implement a CharStream. See StringStream.feed() and see if that works for you. See REPLStream for an example of streaming input from a user.
Args
stream: The script to tokenize. This may be a string or a CharStream instance.
consume_rest_triggers: Triggers for CONSUME_REST.
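An illustrative usage sketch (not from the library's documentation). Based on the state handlers in the source above, consume_rest_triggers appears to map a command name to the number of argument literals to leave untouched before the remainder of the command is emitted as a single literal; the script and the "set" trigger below are hypothetical:

from scrolls.ast.tokenizer import Tokenizer

script = "set greeting hello there\nprint greeting"
tokenizer = Tokenizer(script, consume_rest_triggers={"set": 1})
for token in tokenizer.get_all_tokens():
    print(token)
# "set" and "greeting" come through as ordinary string literals, the trigger
# then causes "hello there" to be consumed as one literal, and the sequence
# ends with an EOF token.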
Static methods
def accept_any_of(*f: Callable[[], Optional[Token]]) -> Optional[Token]
Methods
def accept_comment(self) -> Optional[Token]
def accept_eof(self) -> Optional[Token]
def accept_single_char(self) -> Optional[Token]
def accept_string_literal(self, stop_chars: Sequence[str] = (), error_on_eof: bool = False, allow_escapes: bool = False) -> Optional[Token]
def accept_string_literal_consume_rest(self) -> Optional[Token]
def accept_string_literal_normal(self) -> Optional[Token]
def accept_string_literal_quoted(self) -> Optional[Token]
def accept_whitespace(self) -> Optional[Token]
def error(self, err_type: Type[PositionalError], message: str, line: Optional[int] = None, pos: Optional[int] = None) -> NoReturn
def forbid_eof(self, msg: str = '', *args: Any, **kwargs: Any) -> None
def get_all_tokens(self) -> Sequence[Token]
Extracts all tokens at once, until the end of the script. A sequence of tokens obtained this way will always end with a token of type TokenType.EOF.
Raises
TokenizeEofError: If EOF was hit unexpectedly.
TokenizeError: If a generic issue happened while tokenizing.
def handle_consume_rest(self, tok: Token) -> None
def handle_consume_rest_consume(self, tok: Token) -> None
def handle_consume_rest_counting(self, tok: Token) -> None
def handle_consume_rest_off(self, tok: Token) -> None
def next_n_chars(self, n: int) -> str
Unconditionally consume N characters and return them.
def next_token(self) -> Token
Extract the next token. If the tokenizing is finished, this will return a Token of type TokenType.EOF.
Raises
TokenizeEofError: If EOF was hit unexpectedly.
TokenizeError: If a generic issue happened while tokenizing.
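A minimal sketch of pulling tokens one at a time instead of using get_all_tokens():

from scrolls.ast.ast_constants import TokenType
from scrolls.ast.tokenizer import Tokenizer

tokenizer = Tokenizer("print hello; print world")
while True:
    tok = tokenizer.next_token()
    print(tok)
    if tok.type == TokenType.EOF:
        break  # calling next_token() again after EOF raises TokenizeEofError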
def set_comments_enable(self, en: bool) -> None
Set whether comments are enabled. If disabled, the comment character will be ignored, and anything that would be a comment will be treated as ordinary code.
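An illustrative sketch of the effect; the comment sigil is imported rather than assumed to be a particular character:

from scrolls.ast.ast_constants import COMMENT_SIGIL
from scrolls.ast.tokenizer import Tokenizer

script = f"print hello {COMMENT_SIGIL} trailing note"
with_comments = Tokenizer(script)       # the comment is skipped by next_token()
no_comments = Tokenizer(script)
no_comments.set_comments_enable(False)  # sigil and note become string literals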
def set_consume_rest_all(self, consume_all: bool) -> None
Set whether CONSUME_REST consumes until EOF. Defaults to False.
If False, CONSUME_REST will stop on block open/close, and command separators.
If True, CONSUME_REST will not stop until EOF.
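An illustrative sketch of the difference, using a hypothetical "say" trigger that consumes the rest of a command immediately:

from scrolls.ast.tokenizer import Tokenizer

script = "say first line\nsay second line"

default = Tokenizer(script, consume_rest_triggers={"say": 0})
# CONSUME_REST stops at the newline: "first line" is one literal, and the
# second command is tokenized normally afterwards.

greedy = Tokenizer(script, consume_rest_triggers={"say": 0})
greedy.set_consume_rest_all(True)
# CONSUME_REST runs to EOF: "first line\nsay second line" becomes one literal.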
def set_newlines_separate_strings(self, en: bool) -> None
Set whether newlines separate string literals. This can only be modified if Tokenizer.set_single_char_token_enable() has been set to False, and will raise a ScrollError otherwise.
By default, when Tokenizer.set_single_char_token_enable() is set to False, newlines will instead be considered whitespace, and will separate strings without producing TokenType.COMMAND_SEP tokens.
To override this behavior, call this function with False. In this case, newlines will be rolled into string literals, and ONLY spaces and tabs will separate string literals.
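An illustrative sketch of the override, assuming single-character tokens have already been disabled:

from scrolls.ast.tokenizer import Tokenizer

tokenizer = Tokenizer("alpha beta\ngamma")
tokenizer.set_single_char_token_enable(False)   # newlines now act as whitespace
tokenizer.set_newlines_separate_strings(False)  # ...unless overridden like this
# With the override, "beta\ngamma" is read as one string literal. Calling
# set_newlines_separate_strings while single character tokens are still
# enabled raises scrolls.errors.ScrollError.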
def set_quoted_literals_enable(self, en: bool) -> None
Set whether quoted string literals are enabled. If disabled, quotes will be rolled into normal string token parsing.
For instance, if quoted literals are disabled, `"Hello World"` would be interpreted as `"Hello`, `World"`.
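An illustrative sketch of the two behaviors side by side:

from scrolls.ast.tokenizer import Tokenizer

quoted = Tokenizer('say "Hello World"')
# Default: the quoted phrase is one STRING_LITERAL with the value 'Hello World'.

unquoted = Tokenizer('say "Hello World"')
unquoted.set_quoted_literals_enable(False)
# Quotes are no longer special: the literals come through as '"Hello' and 'World"'.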
def set_single_char_token_enable(self, en: bool) -> None
Set whether single character tokens should be parsed. This includes ALL token types except for TokenType.STRING_LITERAL and TokenType.COMMENT. Defaults to True.
If False, then all special characters that would otherwise be their own token will be rolled into string literals.
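An illustrative sketch of the effect on a command separator:

from scrolls.ast.tokenizer import Tokenizer

tokenizer = Tokenizer("print hello; print world")
tokenizer.set_single_char_token_enable(False)
# ";" no longer produces a COMMAND_SEP token; "hello;" is read as one string
# literal. With the default setting, ";" is emitted as its own COMMAND_SEP token.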
def try_consume_escape(self) -> Optional[str]