author     Owen Jacobson <owen@grimoire.ca>   2017-11-11 01:51:06 -0500
committer  Owen Jacobson <owen@grimoire.ca>   2017-11-11 15:42:13 -0500
commit     16d94a6e50eb81de9d9d438e1cce0746928597f3 (patch)
tree       e1cb628d34c49690128722a33cc1d19d7dcffb23
parent     e4fb8604aa2fc572a3aeeace1c32de7339d346b5 (diff)
Introduce input ports.
Ports are the lisp abstraction of files and streams. Actinide ports additionally guarantee a peek operation. This makes ``tokenize`` (now ``read_token``) callable as a lisp function, as it takes a port and reads one token from it.

This is a substantial refactoring. As most of the state is now captured by closures, it's no longer practical to test individual states as readily. However, the top-level tokenizer tests exercise the full state space.
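As a rough illustration (not part of the commit message; these calls use only the functions introduced in this change), reading tokens off a string port looks like this:

    from actinide.ports import string_to_input_port
    from actinide.tokenizer import read_token

    # read_token consumes exactly one token per call
    port = string_to_input_port('(+ 1 2)')
    read_token(port)   # => '('
    read_token(port)   # => '+'
    read_token(port)   # => '1'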
-rw-r--r--   README.rst                2
-rw-r--r--   actinide/ports.py        54
-rw-r--r--   actinide/tokenizer.py   283
-rwxr-xr-x   bin/actinide-repl         6
-rw-r--r--   tests/test_ports.py      29
-rw-r--r--   tests/test_tokenizer.py 360
-rw-r--r--   tests/tokens.py           4
7 files changed, 256 insertions, 482 deletions
diff --git a/README.rst b/README.rst
index 374af24..595c43e 100644
--- a/README.rst
+++ b/README.rst
@@ -67,7 +67,7 @@ Freestanding REPL
*****************
**Note: this section is presently incorrect - the ``actinide-repl`` command
-instead contains a test harness for the tokenizer.**
+doesn't exist.**
The Actinide interpreter can be started interactively using the
``actinide-repl`` command. In this mode, Actinide forms can be entered
diff --git a/actinide/ports.py b/actinide/ports.py
new file mode 100644
index 0000000..7748cd7
--- /dev/null
+++ b/actinide/ports.py
@@ -0,0 +1,54 @@
+import io
+
+# ## PORTS
+#
+# A port is a handle which characters can either be read from (an "input port")
+# or written to (an "output port").
+#
+# Actinide uses a very limited subset of the full Scheme ports system, and does
+# not support the creation of most kinds of port at runtime.
+
+# A port. Under the hood, this wraps a Python file-like object in character
+# mode, and guarantees support for peek and other operations needed by the
+# Actinide runtime.
+class Port(object):
+ def __init__(self, file):
+ self.file = file
+ self.peek_buffer = ''
+
+ # Read up to ``n`` characters from the port without consuming them.
+ def peek(self, n):
+ if not self.peek_buffer:
+ self.peek_buffer = self.file.read(n)
+ return self.peek_buffer
+
+ # Read up to ``n`` characters from the port, consuming them.
+ def read(self, n):
+ if self.peek_buffer:
+ read_result, self.peek_buffer = self.peek_buffer[:n], self.peek_buffer[n:]
+ return read_result
+ return self.file.read(n)
+
+ # Read all remaining input, consuming it.
+ def read_fully(self):
+ return self.peek_buffer + self.file.read()
+
+# Read at least 1 and up to ``n`` characters from a port, or the empty string at
+# end of input. This consumes them from the port: they are no longer available
+# to future peeks or reads. ``n`` must be strictly positive.
+def read(port, n):
+ return port.read(n)
+
+# Read all remaining input from a port, consuming it.
+def read_fully(port):
+ return port.read_fully()
+
+# Read at least 1 and up to ``n`` characters from a port, or the empty string at
+# end of input, without consuming them. They will be available on future peeks
+# and reads. ``n`` must be strictly positive.
+def peek(port, n):
+ return port.peek(n)
+
+# Create an input port from a string.
+def string_to_input_port(string):
+ return Port(io.StringIO(string))
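A minimal sketch of the peek/read contract the Port above provides (illustrative only, not part of the diff):

    from actinide.ports import string_to_input_port, peek, read, read_fully

    port = string_to_input_port('(foo)')
    peek(port, 1)     # => '(' -- looked at, not consumed
    read(port, 1)     # => '(' -- the same character, now consumed
    read_fully(port)  # => 'foo)'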
diff --git a/actinide/tokenizer.py b/actinide/tokenizer.py
index 9767033..a69d35d 100644
--- a/actinide/tokenizer.py
+++ b/actinide/tokenizer.py
@@ -1,3 +1,5 @@
+from .ports import read, peek
+
# ## TOKENIZATION
#
# The following code implements a state-machine-driven tokenizer which can
@@ -21,18 +23,14 @@
# special literals. (Strings are, technically, a kind of atom, but the lexer
# treats them specially due to their complexity.)
#
-# Internally, the tokenizer is a state machine which maintains two pieces of
-# state: the "lookahead" (holding data to feed to the next state transition
-# function) and the next state transition function. The top-level ``tokenize``
-# function acts as a trampoline, repeatedly calling ``next`` until input is
-# exhausted, yielding tokens any time ``next`` includes a token in its return
-# value.
+# Internally, the tokenizer is a state machine where each state (``next``) is a
+# function taking a port as input. The top-level ``read_token`` function acts as a
+# trampoline, repeatedly calling ``next`` until input is exhausted or until
+# ``next`` includes a token in its return value.
#
-# The various ``next`` functions take the current lookahead and the port,
-# perform whatever logic is needed (including, potentially, reading from the
-# port) to determine the next state, and return a 3-tuple of ``token`` (may be
-# ``None``), ``lookahead`` (which replaces the previous lookahead), and ``next``
-# (the new next state transition function).
+# The various ``next`` functions take the port, perform whatever logic is
+# needed to determine the next state, and return a 2-tuple of ``token`` (may be
+# ``None``) and ``next`` (the new next state transition function).
#
# This is heavily inspired by various tail-recursive approaches to tokenizing
# lisp streams. However, the host language does not guarantee tail call
@@ -46,50 +44,47 @@ class TokenError(Exception):
'''
pass
-# Tokenize a port, producing a generator that yields successive tokens as it's
-# advanced.
+# Read one token from a port.
#
# This is the top-level driver for the state machine that divides the underlying
-# input into tokens. It does no input handling itself, other than reading the
-# first character of the port: this calls the next state transition function to
-# determine what to do and how to change the lookahead.
+# input into tokens. It does no input handling itself: it calls the next state
+# transition function to determine what to do, and relies on that function to
+# perform any necessary input on the port.
#
# Initially, this is in the ``tokenize_any`` state, and exits once it reaches
-# the ``tokenize_eof`` state.
-def tokenize(port):
- lookahead, next = port.read(1), tokenize_any
+# the ``tokenize_eof`` state or once it reads a complete token.
+#
+# This never reads past the end of the current token, relying on ``peek`` to
+# determine whether it should continue reading from the port.
+def read_token(port):
+ next = tokenize_any
while next != tokenize_eof:
- token, lookahead, next = next(lookahead, port)
+ token, next = next(port)
if token is not None:
- yield token
+ return token
-# If the lookahead is exactly one character, this will correctly determine the
-# next token type and transition to that state without consuming input. This is
-# generally the correct state to transition to any time the next token is
-# unknown - for example, at the end of another token.
+# Looks ahead one character to determine what kind of token appears next in the
+# port. This is an appropriate state to transition to at any time
+# when the next token is not known, such as at the end of a token.
#
# This never produces a token directly. It can transition to the tokenizer state
-# for any token type, as well as to the trap state for EOF.
-def tokenize_any(lookahead, port):
+# for any token or non-token type, or to the trap state for EOF.
+def tokenize_any(port):
+ lookahead = peek_next(port)
if lookahead == '':
- return None, lookahead, tokenize_eof
+ return None, tokenize_eof
if lookahead == ';':
- return None, lookahead, tokenize_comment
+ return None, tokenize_comment
if lookahead in '()':
- return None, lookahead, tokenize_syntax
+ return None, tokenize_syntax
if lookahead in ' \t\n':
- return None, lookahead, tokenize_whitespace
- return None, lookahead, tokenize_atom
+ return None, tokenize_whitespace
+ return None, tokenize_atom
-# Special trap state. This never produces a token, and always transitions to
-# itself. The lookahead in this state is generally ``''``, and since this never
-# performs any further reads, it will remain that value indefinitely.
-#
-# The top-level parser exits in this situation by examining ``lookahead``, but
-# it's possible to reach this state from string literal tokenization or after a
-# comment.
-def tokenize_eof(lookahead, port):
- return None, lookahead, tokenize_eof
+# EOF trap state. This never produces a token, and always transitions to itself
+# without reading any input. The tokenizer cannot exit this state.
+def tokenize_eof(port):
+ return None, tokenize_eof
# Consumes one character at a time until it finds an end of line or runs out of
# input. This throws away comments entirely, at tokenization time, without
@@ -98,123 +93,135 @@ def tokenize_eof(lookahead, port):
# port, but never more than one character at a time.
#
# This never produces a token.
-def tokenize_comment(lookahead, port):
- next = port.read(1)
+
+# Consumes one character and throws it away, transitioning back to tokenize_any
+# once it encounters either an end of line or the end of the input. This
+# consumes comments and, as it never generates a token, discards them.
+def tokenize_comment(port):
+ next = read_next(port)
if next == '':
- return None, next, tokenize_any
+ return None, tokenize_any
if next == '\n':
- return None, next, tokenize_any
- return None, next, tokenize_comment
-
-# Generates the entire lookahead as a token. This is generally appropriate for
-# the ``(`` and ``)`` syntactic elements.
-#
-# The resulting lookahead will be the next character of input, and this always
-# dispatches back to ``tokenize_any`` so that the next token (if any) can be
-# determined.
-def tokenize_syntax(lookahead, port):
- return lookahead, port.read(1), tokenize_any
-
-# Consumes and ignores one character of input. This never produces a token, and
-# throws away the lookahead entirely. The resulting lookahead is the next
-# character of input.
-def tokenize_whitespace(lookahead, port):
- return None, port.read(1), tokenize_any
-
-# We've ruled out all non-atom tokens. If the lookahead is a string delimiter,
-# transitions to a state which tokenizes a single string literal; otherwise,
-# transitions to a state which consumes a single non-string atom. In both cases,
-# this leaves the lookahead alone, and generates no token.
-def tokenize_atom(lookahead, port):
+ return None, tokenize_any
+ return None, tokenize_comment
+
+# Consumes one character, returning it as a token, before transitioning back to
+# the ``tokenize_any`` state. This correctly tokenizes the ``(`` and ``)``
+# tokens if they are at the front of the port.
+def tokenize_syntax(port):
+ return read_next(port), tokenize_any
+
+# Consumes and ignores one character of input. This never produces a token. This
+# is appropriate for discarding whitespace in the port.
+def tokenize_whitespace(port):
+ read_next(port)
+ return None, tokenize_any
+
+# Looks ahead one character into the port to determine which kind of atom to
+# tokenize: if the input begins with a quote, tokenize a string literal;
+# otherwise, tokenize a non-string atom such as a symbol or numeric literal.
+# This never generates a token directly.
+def tokenize_atom(port):
+ lookahead = peek_next(port)
if lookahead == '"':
- return None, lookahead, tokenize_string
- return None, lookahead, tokenize_nonstring_atom
-
-# Consumes characters until it finds a character which cannot be part of a
-# non-string atom, or until it finds the end of input, accumulating them into a
-# single token. This is a heavily-overloaded token category, as it contains not
-# only Actinide symbols but also all non-String literals.
-#
-# While the tokenizer remains in this state, the lookahead accumulates the
-# characters of the token. When this matches a completed token, it produces a
-# Symbol token, and resets the lookahead back to a single read result containing
-# the next character of input.
-def tokenize_nonstring_atom(lookahead, port):
- next = port.read(1)
- if next == '':
- return lookahead, next, tokenize_any
- if next in '"(); \t\n':
- return lookahead, next, tokenize_any
- return None, lookahead + next, tokenize_nonstring_atom
+ return None, tokenize_string
+ return None, tokenize_nonstring_atom('')
+
+# A state factory returning states that build non-string atoms. The resulting
+# state family consumes characters until it finds a character which cannot be
+# part of a non-string atom, or until it finds the end of input, accumulating
+# them into a single token. When either of those cases arises, the resulting
+# state generates the accumulated token and returns to the ``tokenize_any``
+# state to prepare for the next token.
+def tokenize_nonstring_atom(state):
+ def tokenize_nonstring_atom_next(port):
+ next = peek_next(port)
+ if next == '':
+ return state, tokenize_any
+ if next in '"(); \t\n':
+ return state, tokenize_any
+ return None, tokenize_nonstring_atom(state + read_next(port))
+ return tokenize_nonstring_atom_next
# ### STRINGS
#
-# The following family of states handles string literals in the input stream.
-# String literals are fairly simple: they begin with a quote, contain arbitrary
-# characters other than a bare \ or ", and end with a quote. (Note that ``\n``
-# is not an escape sequence: unescaped newlines are permitted within string
-# literals.)
+# The following family of states and state factories handles string literals in
+# the input stream. String literals are fairly simple: they begin with a quote,
+# contain arbitrary characters other than a bare \ or ", and end with a quote.
+# (Note that ``\n`` is not an escape sequence: unescaped newlines are permitted
+# within string literals.)
#
-# These states use the lookahead to accumulate the characters of the string. On
-# transition back to ``tokenize_any``, the lookahead is always set back to a
-# single character. If, at any point, these states encounter EOF, they raise a
-# ``TokenError``: no legal token in Actinide begins with a quote mark and ends
-# with EOF.
+# These states accumulate the characters of the string. On transition back to
+# ``tokenize_any``, the accumulated characters are returned as a token. If, at
+# any point, these states encounter EOF or an invalid escape sequence, they
+# raise a ``TokenError``: no legal token in Actinide begins with a quote mark
+# and ends with EOF, and no legal token includes an invalid escape sequence.
#
# Because tokenization is only concerned with dividing the input into tokens,
# this machine *does not* strip quotes or replace escape sequences. On success,
# it generates a token containing the whole string literal, verbatim.
-# The lookahead is assumed to be the opening quote of a string, and discarded.
-# Read forwards one character to determine whether this is an empty string
-# literal or not, then proceed either to ``tokenize_string_end`` for an empty
-# string, or to ``tokenize_string_character`` for a non-empty string.
+# Reads the first character of a string literal, and looks ahead one character
+# to determine how the string proceeds so that it can transition to an
+# appropriate state.
#
-# This never yields a token. The lookahead is set to the characters of the
+# This never generates a token. It accumulates the characters of the
# string read so far.
-def tokenize_string(lookahead, port):
- next = port.read(1)
+def tokenize_string(port):
+ quote = read_next(port)
+ next = peek_next(port)
if next == '':
raise TokenError('Unclosed string literal')
if next == '\\':
- return None, lookahead + next, tokenize_escaped_string_character
+ return None, tokenize_escaped_string_character(quote + read_next(port))
if next == '"':
- return None, lookahead + next, tokenize_string_end
- return None, lookahead + next, tokenize_string_character
+ return None, tokenize_string_end(quote)
+ return None, tokenize_string_character(quote)
-# The lookahead contains the body of the string read so far. Reads forwards one
-# character to determine if the string continues, contains an escaped character,
-# or ends.
+# A state factory returning states which accumulate string characters. The
+# returned states look ahead one character to determine how to proceed, and read
+# one character under most circumstances.
#
# This never yields a token.
-def tokenize_string_character(lookahead, port):
- next = port.read(1)
- if next == '':
- raise TokenError('Unclosed string literal')
- if next == '\\':
- return None, lookahead + next, tokenize_escaped_string_character
- if next == '"':
- return None, lookahead + next, tokenize_string_end
- return None, lookahead + next, tokenize_string_character
+def tokenize_string_character(state):
+ def tokenize_string_character_next(port):
+ next = peek_next(port)
+ if next == '':
+ raise TokenError('Unclosed string literal')
+ if next == '\\':
+ return None, tokenize_escaped_string_character(state + read_next(port))
+ if next == '"':
+ return None, tokenize_string_end(state)
+ return None, tokenize_string_character(state + read_next(port))
+ return tokenize_string_character_next
-# The lookahead contains the body of the string so far. Reads forwards one
-# character to determine which, if any, escaped character to process: if it's
-# one we recognize, append it to the string, otherwise raise a TokenError.
+# A state factory returning states which only recognize valid string escaped
+# characters (``\\`` and ``"``). If they encounter a valid character, they
+# accumulate it onto the string being read and continue reading the string;
+# otherwise, they reject the string by raising a TokenError.
#
-# This never yields a token, and always dispatches back to
-# ``tokenize_string_character`` on a legal escape character.
-def tokenize_escaped_string_character(lookahead, port):
- next = port.read(1)
- if next == '':
- raise TokenError('Unclosed string literal')
- if next == '"':
- return None, lookahead + next, tokenize_string_character
- if next == '\\':
- return None, lookahead + next, tokenize_string_character
- raise TokenError(f"Invalid string escape '\\{next}'")
-
-# Package the lookahead (the full string body, de-escaped and without leading
-# and trailing quotes) up as a String token and return it, then transition back
-# to the ``tokenize_any`` state with a single read result in the lookahead.
-def tokenize_string_end(lookahead, port):
- return lookahead, port.read(1), tokenize_any
+# This never yields a token.
+def tokenize_escaped_string_character(state):
+ def tokenize_escaped_string_character_next(port):
+ next = read_next(port)
+ if next == '':
+ raise TokenError('Unclosed string literal')
+ if next in '\\"':
+ return None, tokenize_string_character(state + next)
+ raise TokenError(f"Invalid string escape '\\{next}'")
+ return tokenize_escaped_string_character_next
+
+# A state factory which terminates a string literal. These states read off the
+# closing quote mark and generate the accumulated string as a token before
+# transitioning back to the ``tokenize_any`` state.
+def tokenize_string_end(state):
+ def tokenize_string_end_next(port):
+ return state + read_next(port), tokenize_any
+ return tokenize_string_end_next
+
+def read_next(port):
+ return read(port, 1)
+
+def peek_next(port):
+ return peek(port, 1)
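Putting the pieces together, a hedged sketch of draining a port token by token; ``iterate_tokens`` is a hypothetical helper, mirroring the one defined inline in the tests below:

    from actinide.ports import string_to_input_port
    from actinide.tokenizer import read_token

    def iterate_tokens(port):
        # read_token returns None once the input is exhausted
        token = read_token(port)
        while token is not None:
            yield token
            token = read_token(port)

    source = '(define x "a\\"b") ; trailing comment'
    list(iterate_tokens(string_to_input_port(source)))
    # => ['(', 'define', 'x', '"a\\"b"', ')'] -- string literal kept verbatim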
diff --git a/bin/actinide-repl b/bin/actinide-repl
deleted file mode 100755
index 7909d36..0000000
--- a/bin/actinide-repl
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import actinide.tokenizer as at
-
-print(repr(list(at.tokenize(sys.stdin))))
diff --git a/tests/test_ports.py b/tests/test_ports.py
new file mode 100644
index 0000000..c2d1e06
--- /dev/null
+++ b/tests/test_ports.py
@@ -0,0 +1,29 @@
+from hypothesis import given
+from hypothesis.strategies import integers, text
+
+from actinide.ports import *
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_read(input, n):
+ port = string_to_input_port(input)
+ output = read(port, n)
+
+ assert input.startswith(output)
+ assert (len(output) == 0 and len(input) == 0) != (0 < len(output) <= n)
+ assert output + read_fully(port) == input
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_peek(input, n):
+ port = string_to_input_port(input)
+ output = peek(port, n)
+
+ assert input.startswith(output)
+ assert (len(output) == 0 and len(input) == 0) != (0 < len(output) <= n)
+ assert read_fully(port) == input
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_read_fully(input, n):
+ port = string_to_input_port(input)
+ output = read_fully(port)
+
+ assert output == input
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 76a07a9..5c0ddea 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -3,354 +3,40 @@ from hypothesis.strategies import just, text, characters, from_regex, one_of, tu
import io
from actinide.tokenizer import *
+from actinide.ports import *
-from .tokens import spaced_token_sequences
+from .tokens import spaced_token_sequences, tokens, nontokens
-class ReadablePort(io.StringIO):
- def __repr__(self):
- # Slightly friendlier debugging output
- return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
-
-# Many of the following tests proceed by cases, because the underlying behaviour
-# is too complex to treat as a uniform set of properties. The cases are meant to
-# be total, and in principle could be defined as a set of filters on the
-# ``text()`` generator that , combined, exhaust the possible outcomes of that
-# generator.
-#
-# Implementing the tests that way causes Hypothesis to generate a significant
-# number of examples that it then throws away without verifying, because
-# Hypothesis has no insight into filters to use when generating examples.
-# Instead, this test suite specifies generators per-case.
-
-# Cases for tokenize_any:
-
-# We test this a bit differently from the subsequent tokenizer states. Because
-# it's a pure routing state, we can generate lookahead, expected_state pairs and
-# check them in one pass, rather than testing each possible outcome separately.
-# In every case, the input is irrelevant: this state never reads.
-
-def next_token_states():
- return one_of(
- tuples(just(''), just(tokenize_eof)),
- tuples(just(';'), just(tokenize_comment)),
- tuples(sampled_from('()'), just(tokenize_syntax)),
- tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
- tuples(just('"'), just(tokenize_atom)),
- tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
- )
-
-@given(next_token_states(), text())
-def test_tokenize_any(lookahead_next, input):
- s, expected_state = lookahead_next
- port = ReadablePort(input)
- token, lookahead, next = tokenize_any(s, input)
-
- assert token is None
- assert lookahead == s
- assert next == expected_state
- assert port.tell() == 0
-
-# Since the previous test case is rigged for success, also verify that no input
-# causes tokenize_any to enter an unexpected state or to throw an exception.
-@given(text(), text())
-def test_tokenize_any_fuzz(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_any(s, input)
-
- assert token is None
- assert lookahead == s
- assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
- assert port.tell() == 0
-
-# Cases for tokenize_eof:
-
-# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
-# always returning to itself, and never generating a token.
-@given(text(), text())
-def test_tokenize_eof(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_eof(s, port)
-
- assert token is None
- assert lookahead == s
- assert next == tokenize_eof
- assert port.tell() == 0
-
-# Cases for tokenize_comment:
-
-# * any lookahead, one or more characters beginning with a non-newline as input:
-# tokenize_comment continues the current comment, throwing away one character
-# of input, without generating a token.
-@given(text(), from_regex(r'^[^\n].*'))
-def test_tokenize_comment_continues(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_comment(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == input[0]
- assert next == tokenize_comment
-
-# * any lookahead, one or more characters beginning with a newline as input, and
-# * any lookahead, empty input:
-# tokenize_comment concludes the current comment and prepares for the next
-# token, without generating a token.
-@given(text(), just('') | from_regex(r'^\n.*'))
-def test_tokenize_comment_ends(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_comment(s, port)
-
- assert token is None
- assert port.tell() == (1 if input else 0)
- assert lookahead == (input[0] if input else '')
- assert next == tokenize_any
-
-# Cases for tokenize_syntax:
-
-# * any lookahead, any input: generate the lookahead as a Syntax token and
-# transition back to tokenize_any to prepare for the next token, with one
-# character of lookahead ready to go.
-@given(text(), text())
-def test_tokenize_syntax(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_syntax(s, port)
-
- assert token == s
- assert port.tell() == (1 if input else 0)
- assert lookahead == (input[0] if input else '')
- assert next == tokenize_any
-
-# Cases for test_tokenize_whitespace:
-
-# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
-# transition back to tokenize_any to prepare for the next token, with one
-# character of lookahead ready to go, without generating a token.
-@given(text(), text())
-def test_tokenize_whitespace(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_whitespace(s, port)
-
- assert token is None
- assert port.tell() == (1 if input else 0)
- assert lookahead == (input[0] if input else '')
- assert next == tokenize_any
-
-# Cases for tokenize_nonstring_atom:
-
-# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
-# comment delimiter, or a string literal: accumulate one character of input
-# onto the lookahead, then transition back to tokenize_symbol to process the
-# next character of input, without generating a token.
-@given(text(), from_regex(r'^[^ \n\t();"].*'))
-def test_tokenize_nonstring_atom_continues(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_nonstring_atom(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_nonstring_atom
-
-# * any lookahead, a non-empty input beginning with whitespace, syntax, a
-# comment delimiter, or a string literal, and
-# * any lookahead, empty input:
-# generate the accumulated input as a Symbol token, then transition back to tokenize_any with one character of lookahead ready to go.
-@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
-def test_tokenize_tokenize_nonstring_atom_ends(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_nonstring_atom(s, port)
-
- assert token == s
- assert port.tell() == (1 if input else 0)
- assert lookahead == (input[0] if input else '')
- assert next == tokenize_any
-
-# And now, the _worst_ part of the state machine. Cases for tokenize_string:
-
-# * any lookahead, a non-empty input not beginning with a string delimiter:
-# begin a non-empty string by transitioning to the tokenize_string_character
-# state with one character of lookahead, without generating a token.
-@given(text(), from_regex(r'^[^"].*'))
-def test_tokenize_string_continues(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_character
-
-# * any lookahad, a non-empty input beginning with a string delimiter: terminate
-# an empty string by transitioning to the tokenize_string_end state with an
-# *empty* lookahead, without generating a token.
-@given(text(), from_regex(r'^["].*'))
-def test_tokenize_string_empty(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_end
-
-# * any lookahead, empty input: emit a tokenization error, as we've encountered
-# EOF inside of a string.
-@given(text(), just(''))
-def test_tokenize_string_eof(s, input):
- try:
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string(s, port)
-
- assert False # must raise
- except TokenError:
- assert port.tell() == 0
-
-# Cases for tokenize_string_character:
-
-# * any lookahead, any non-empty input not beginning with a string delimiter or
-# escape character: append one character of input to the lookahead, then
-# continue in the tokenize_string_character state without generating a token.
-@given(text(), from_regex(r'^[^\\"].*'))
-def test_tokenize_string_character_continues(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string_character(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_character
-
-# * any lookahead, any non-empty input which begins with an escape character:
-# leave the lookahead unchanged, but transition to the
-# tokenize_escaped_string_character state to determine which escape character
-# we're dealing with, without emitting a token.
-@given(text(), from_regex(r'^[\\].*'))
-def test_tokenize_string_character_begins_escape(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string_character(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_escaped_string_character
-
-# * any lookahead, any non-empty input which begins with a string delimiter:
-# we're at the end of a string. Transition to the tokenize_string_end state
-# with the current lookahead, without generating a token.
-@given(text(), from_regex(r'^["].*'))
-def test_tokenize_string_character_ends(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string_character(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_end
-
-# * any lookahead, empty input: emit a tokenization error, as we've encountered
-# EOF inside of a string literal.
-@given(text(), just(''))
-def test_tokenize_string_character_eof(s, input):
- try:
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string_character(s, port)
-
- assert False # must raise
- except TokenError:
- assert input == ''
- assert port.tell() == 0
-
-# Cases for tokenize_escaped_string:
-
-# * any lookahead, any non-empty input beginning with a legal string escaped
-# character: de-escape the first character of the input, append the result to
-# the lookahead, then transition back to the tokenize_string_character state.
-@given(text(), from_regex(r'^["\\].*'))
-def test_tokenize_escaped_string_character_valid(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_escaped_string_character(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_character
-
-# * any lookahead, any non-empty input not beginning with a legal string escaped
-# character: emit a tokenization error, we've found an invalid string escape.
-@given(text(), from_regex(r'^[^"\\].*'))
-def test_tokenize_escaped_string_character_invalid(s, input):
- try:
- port = ReadablePort(input)
- token, lookahead, next = tokenize_escaped_string_character(s, port)
-
- assert False # must raise
- except TokenError:
- assert port.tell() == 1
-
-# * any lookahead, empty input: emit a tokenization error, we've found an EOF
-# inside of a string literal.
-@given(text(), just(''))
-def test_tokenize_escaped_string_character_eof(s, input):
- try:
- port = ReadablePort(input)
- token, lookahead, next = tokenize_escaped_string_character(s, port)
-
- assert False # must raise
- except TokenError:
- assert port.tell() == 0
-
-# Cases for tokenize_string_end:
-
-# * any lookahead, any input: generate a String token from the lookahead, then
-# transition back to the tokenize_any state with one character of lookahead
-# ready to go.
-@given(text(), text())
-def test_tokenize_string_end(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string_end(s, port)
-
- assert token == s
- assert port.tell() == (1 if input else 0)
- assert lookahead == (input[0] if input else '')
- assert next == tokenize_any
-
-# Cases for tokenize_atom:
-
-# * lookahead containing a string delimiter, any input: found a string atom,
-# transition to the tokenize_string state without reading or generating a
-# token.
-@given(just('"'), text())
-def test_tokenize_atom_string(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_atom(s, port)
+# Cases for the tokenizer:
- assert token is None
- assert port.tell() == 0
- assert lookahead == s
- assert next == tokenize_string
+# * any single token: reads back that token.
+@given(tokens())
+def test_tokenizer_single_token(input):
+ port = string_to_input_port(input)
-# * lookahead containing something other than a string delimiter, any input:
-# found a nonstring atom, transition to the tokenize_nonstring_atom state
-# without reading or generating a token.
-@given(from_regex(r'^[^"]'), text())
-def test_tokenize_atom_nonstring(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_atom(s, port)
+ assert read_token(port) == input
- assert token is None
- assert port.tell() == 0
- assert lookahead == s
- assert next == tokenize_nonstring_atom
+# * any input guaranteed not to contain a token: reads back None, consuming the
+# whole input in the process.
+@given(nontokens())
+def test_tokenizer_no_token(input):
+ port = string_to_input_port(input)
-# Cases for the tokenizer:
+ assert read_token(port) is None
# * any sequence of separator-token pairs: if the pairs are coalesced into a
# single giant input, does the tokenizer recover the tokens?
@given(spaced_token_sequences())
-def test_tokenizer(spaced_tokens):
+def test_tokenizer_spaced_sequence(spaced_tokens):
input = ''.join(''.join(pair) for pair in spaced_tokens)
tokens = [token for (_, token) in spaced_tokens]
- port = ReadablePort(input)
+ port = string_to_input_port(input)
+ def iterate_read_token(port):
+ token = read_token(port)
+ while token is not None:
+ yield token
+ token = read_token(port)
+
+ assert list(iterate_read_token(port)) == tokens
- assert list(tokenize(port)) == tokens
diff --git a/tests/tokens.py b/tests/tokens.py
index 0027fb2..3eb58b8 100644
--- a/tests/tokens.py
+++ b/tests/tokens.py
@@ -48,6 +48,10 @@ def whitespace_characters():
def tokens():
return one_of(symbols(), strings(), open_parens(), close_parens())
+# Generates a string, possibly empty, which does not contain a token.
+def nontokens():
+ return one_of(whitespace(), comments(), just(''))
+
# Generates at least one character of whitespace.
def whitespace():
return text(whitespace_characters(), min_size=1)