| Field | Value | Date |
|---|---|---|
| author | Owen Jacobson <owen@grimoire.ca> | 2017-11-11 01:51:06 -0500 |
| committer | Owen Jacobson <owen@grimoire.ca> | 2017-11-11 15:42:13 -0500 |
| commit | 16d94a6e50eb81de9d9d438e1cce0746928597f3 (patch) | |
| tree | e1cb628d34c49690128722a33cc1d19d7dcffb23 | |
| parent | e4fb8604aa2fc572a3aeeace1c32de7339d346b5 (diff) | |
Introduce input ports.
Ports are the lisp abstraction of files and streams. Actinide ports additionally guarantee a peek operation.
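
To illustrate the peek guarantee, here is a minimal sketch of the contract the new port API is meant to provide (the functions come from ``actinide/ports.py``, added in this commit; the sample string and assertions are the editor's illustration, not part of the commit):

```python
# Illustration only: the peek/read contract of the new input ports.
# peek() returns upcoming characters without consuming them, so a later
# read() still sees the same characters.
from actinide.ports import string_to_input_port, peek, read, read_fully

port = string_to_input_port('(foo)')
assert peek(port, 1) == '('    # look ahead without consuming
assert peek(port, 1) == '('    # peeking again sees the same character
assert read(port, 1) == '('    # the peeked character is still available to read
assert read_fully(port) == 'foo)'
```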
This makes ``tokenize`` (now ``read_token``) callable as a lisp function: it
takes a port and reads one token from it. This is a substantial refactoring.
Because most of the tokenizer's state is now captured in closures, individual
states can no longer be tested as readily in isolation; however, the top-level
tokenizer tests exercise the full state space.
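
As a quick sketch of the new calling convention (based on the code added in this diff; the sample input and the expected output in the comment are the editor's illustration): ``read_token`` returns one token per call and ``None`` once the port is exhausted.

```python
# Illustration only: driving read_token by hand. Each call returns the next
# token from the port, or None once the input is exhausted.
from actinide.ports import string_to_input_port
from actinide.tokenizer import read_token

port = string_to_input_port('(define x "hi") ; trailing comment')
tokens = []
token = read_token(port)
while token is not None:
    tokens.append(token)
    token = read_token(port)

# Expected: ['(', 'define', 'x', '"hi"', ')'] -- string tokens keep their
# quotes; whitespace and comments never produce tokens.
print(tokens)
```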
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | README.rst | 2 |
| -rw-r--r-- | actinide/ports.py | 54 |
| -rw-r--r-- | actinide/tokenizer.py | 283 |
| -rwxr-xr-x | bin/actinide-repl | 6 |
| -rw-r--r-- | tests/test_ports.py | 29 |
| -rw-r--r-- | tests/test_tokenizer.py | 360 |
| -rw-r--r-- | tests/tokens.py | 4 |
7 files changed, 256 insertions, 482 deletions
diff --git a/README.rst b/README.rst
@@ -67,7 +67,7 @@ Freestanding REPL
 *****************
 
 **Note: this section is presently incorrect - the ``actinide-repl`` command
-instead contains a test harness for the tokenizer.**
+doesn't exist.**
 
 The Actinide interpreter can be started interactively using the
 ``actinide-repl`` command. In this mode, Actinide forms can be entered
diff --git a/actinide/ports.py b/actinide/ports.py
new file mode 100644
index 0000000..7748cd7
--- /dev/null
+++ b/actinide/ports.py
@@ -0,0 +1,54 @@
+import io
+
+# ## PORTS
+#
+# A port is a handle which characters can either be read from (an "input port")
+# or written to (an "output port").
+#
+# Actinide uses a very limited subset of the full Scheme ports system, and does
+# not support the creation of most kinds of port at runtime.
+
+# A port. Under the hood, this wraps a Python file-like object in character
+# mode, and guarantees support for peek and other operations needed by the
+# Actinide runtime.
+class Port(object):
+    def __init__(self, file):
+        self.file = file
+        self.peek_buffer = ''
+
+    # Read up to ``n`` bytes from the port without consuming them.
+    def peek(self, n):
+        if not self.peek_buffer:
+            self.peek_buffer = self.file.read(n)
+        return self.peek_buffer
+
+    # Read up to ``n`` bytes from the port, consuming them.
+    def read(self, n):
+        if self.peek_buffer:
+            read_result, self.peek_buffer = self.peek_buffer[:n], self.peek_buffer[n:]
+            return read_result
+        return self.file.read(n)
+
+    # Read all remaining input, consuming it.
+    def read_fully(self):
+        return self.peek_buffer + self.file.read()
+
+# Read at least 1 and up to ``n`` characters from a port. This consumes them
+# from the port: they are no longer available to future peeks or reads. ``n``
+# must be strictly positive.
+def read(port, n):
+    return port.read(n)
+
+# Read all remaining input from a port, consuming it.
+def read_fully(port):
+    return port.read_fully()
+
+# Read at least 1 and up to ``n`` characters from a port, without consuming
+# them. They will be available on future peeks and reads. ``n`` must be strictly
+# positive.
+def peek(port, n):
+    return port.peek(n)
+
+# Create an input port from a string.
+def string_to_input_port(string):
+    return Port(io.StringIO(string))
diff --git a/actinide/tokenizer.py b/actinide/tokenizer.py
index 9767033..a69d35d 100644
--- a/actinide/tokenizer.py
+++ b/actinide/tokenizer.py
@@ -1,3 +1,5 @@
+from .ports import read, peek
+
 # ## TOKENIZATION
 #
 # The following code implements a state-machine-driven tokenizer which can
@@ -21,18 +23,14 @@
 # special literals. (Strings are, technically, a kind of atom, but the lexer
 # treats them specially due to their complexity.)
 #
-# Internally, the tokenizer is a state machine which maintains two pieces of
-# state: the "lookahead" (holding data to feed to the next state transition
-# function) and the next state transition function. The top-level ``tokenize``
-# function acts as a trampoline, repeatedly calling ``next`` until input is
-# exhausted, yielding tokens any time ``next`` includes a token in its return
-# value.
+# Internally, the tokenizer is a state machine where each state (``next``) is a
+# function taking a port as input. The top-level ``tokenize`` function acts as a
+# trampoline, repeatedly calling ``next`` until input is exhausted or until
+# ``next`` includes a token in its return value.
 #
-# The various ``next`` functions take the current lookahead and the port,
-# perform whatever logic is needed (including, potentially, reading from the
-# port) to determine the next state, and return a 3-tuple of ``token`` (may be
-# ``None``), ``lookahead`` (which replaces the previous lookahead), and ``next``
-# (the new next state transition function).
+# The various ``next`` functions take the the port, perform whatever logic is
+# needed to determine the next state, and return a 2-tuple of ``token`` (may be
+# ``None``) and ``next`` (the new next state transition function).
 #
 # This is heavily inspired by various tail-recursive approaches to tokenizing
 # lisp streams. However, the host language does not guarantee tail call
@@ -46,50 +44,47 @@ class TokenError(Exception):
     '''
     pass
 
-# Tokenize a port, producing a generator that yields successive tokens as it's
-# advanced.
+# Read one token from a port.
 #
 # This is the top-level driver for the state machine that divides the underlying
-# input into tokens. It does no input handling itself, other than reading the
-# first character of the port: this calls the next state transition function to
-# determine what to do and how to change the lookahead.
+# input into tokens. It does no input handling itself: it calls the next state
+# transition function to determine what to do and how to change the lookahead,
+# and relies on that function to perform any necessary input on the port.
 #
 # Initially, this is in the ``tokenize_any`` state, and exits once it reaches
-# the ``tokenize_eof`` state.
-def tokenize(port):
-    lookahead, next = port.read(1), tokenize_any
+# the ``tokenize_eof`` state or once it reads a complete token.
+#
+# This never reads past the end of the current token, relying on ``peek`` to
+# determine whether it should continue reading from the port.
+def read_token(port):
+    next = tokenize_any
     while next != tokenize_eof:
-        token, lookahead, next = next(lookahead, port)
+        token, next = next(port)
         if token is not None:
-            yield token
+            return token
 
-# If the lookahead is exactly one character, this will correctly determine the
-# next token type and transition to that state without consuming input. This is
-# generally the correct state to transition to any time the next token is
-# unknown - for example, at the end of another token.
+# Looks ahead one character in the port to determine what kind of token appears
+# next in the port. This is an appropriate state to transition to at any time
+# when the next token is not known, such as at the end of a token.
 #
 # This never produces a token directly. It can transition to the tokenizer state
-# for any token type, as well as to the trap state for EOF.
-def tokenize_any(lookahead, port):
+# for any token type, for any non-token type, or the trap state for EOF.
+def tokenize_any(port):
+    lookahead = peek_next(port)
     if lookahead == '':
-        return None, lookahead, tokenize_eof
+        return None, tokenize_eof
     if lookahead == ';':
-        return None, lookahead, tokenize_comment
+        return None, tokenize_comment
     if lookahead in '()':
-        return None, lookahead, tokenize_syntax
+        return None, tokenize_syntax
     if lookahead in ' \t\n':
-        return None, lookahead, tokenize_whitespace
-    return None, lookahead, tokenize_atom
+        return None, tokenize_whitespace
+    return None, tokenize_atom
 
-# Special trap state. This never produces a token, and always transitions to
-# itself. The lookahead in this state is generally ``''``, and since this never
-# performs any further reads, it will remain that value indefinitely.
-#
-# The top-level parser exits in this situation by examining ``lookahead``, but
-# it's possible to reach this state from string literal tokenization or after a
-# comment.
-def tokenize_eof(lookahead, port):
-    return None, lookahead, tokenize_eof
+# EOF trap state This never produces a token, and always transitions to itself
+# without reading any input. The tokenizer cannot exit this state.
+def tokenize_eof(port):
+    return None, tokenize_eof
 
 # Consumes one character at a time until it finds an end of line or runs out of
 # input. This throws away comments entirely, at tokenization time, without
@@ -98,123 +93,135 @@ def tokenize_eof(lookahead, port):
 # port, but never more than one character at a time.
 #
 # This never produces a token.
-def tokenize_comment(lookahead, port):
-    next = port.read(1)
+
+# Consumes one character and throws it away, transitioning back to tokenize_any
+# once it encounters either an end of line or the end of the input. This
+# consumes commments, and as it never generates a token, discards them.
+def tokenize_comment(port):
+    next = read_next(port)
     if next == '':
-        return None, next, tokenize_any
+        return None, tokenize_any
    if next == '\n':
-        return None, next, tokenize_any
-    return None, next, tokenize_comment
-
-# Generates the entire lookahead as a token. This is generally appropriate for
-# the ``(`` and ``)`` syntactic elements.
-#
-# The resulting lookahead will be the next character of input, and this always
-# dispatches back to ``tokenize_any`` so that the next token (if any) can be
-# determined.
-def tokenize_syntax(lookahead, port):
-    return lookahead, port.read(1), tokenize_any
-
-# Consumes and ignores one character of input. This never produces a token, and
-# throws away the lookahead entirely. The resulting lookahead is the next
-# character of input.
-def tokenize_whitespace(lookahead, port):
-    return None, port.read(1), tokenize_any
-
-# We've ruled out all non-atom tokens. If the lookahead is a string delimiter,
-# transitions to a state which tokenizes a single string literal; otherwise,
-# transitions to a state which consumes a single non-string atom. In both cases,
-# this leaves the lookahead alone, and generates no token.
-def tokenize_atom(lookahead, port):
+        return None, tokenize_any
+    return None, tokenize_comment
+
+# Consumes one character, returning it as a token, before transitioning back to
+# the ``tokenize_any`` state. This correctly tokenizes the ``(`` and ``)``
+# tokens if they are at the front of the port.
+def tokenize_syntax(port):
+    return read_next(port), tokenize_any
+
+# Consumes and ignores one character of input. This never produces a token. This
+# is appropriate for discarding whitespace in the port.
+def tokenize_whitespace(port):
+    read_next(port)
+    return None, tokenize_any
+
+# Looks ahead one character into the port to determine which kind of atom to
+# tokenize: if the input begins with a quote, tokenize a string literal;
+# otherwise, tokenize a non-string atom such as a symbol or numeric literal.
+# This never generates a token directly.
+def tokenize_atom(port):
+    lookahead = peek_next(port)
     if lookahead == '"':
-        return None, lookahead, tokenize_string
-    return None, lookahead, tokenize_nonstring_atom
-
-# Consumes characters until it finds a character which cannot be part of a
-# non-string atom, or until it finds the end of input, accumulating them into a
-# single token. This is a heavily-overloaded token category, as it contains not
-# only Actinide symbols but also all non-String literals.
-#
-# While the tokenizer remains in this state, the lookahead accumulates the
-# characters of the token. When this matches a completed token, it produces a
-# Symbol token, and resets the lookahead back to a single read result containing
-# the next character of input.
-def tokenize_nonstring_atom(lookahead, port):
-    next = port.read(1)
-    if next == '':
-        return lookahead, next, tokenize_any
-    if next in '"(); \t\n':
-        return lookahead, next, tokenize_any
-    return None, lookahead + next, tokenize_nonstring_atom
+        return None, tokenize_string
+    return None, tokenize_nonstring_atom('')
+
+# A state factory returning states that build non-string atoms. The resulting
+# state family consumes characters until it finds a character which cannot be
+# part of a non-string atom, or until it finds the end of input, accumulating
+# them into a single token. When either of those cases arise, the resulting
+# state generates the accumulated token and returns to the ``tokenize_any``
+# state to prepare for the next token.
+def tokenize_nonstring_atom(state):
+    def tokenize_nonstring_atom_next(port):
+        next = peek_next(port)
+        if next == '':
+            return state, tokenize_any
+        if next in '"(); \t\n':
+            return state, tokenize_any
+        return None, tokenize_nonstring_atom(state + read_next(port))
+    return tokenize_nonstring_atom_next
 
 # ### STRINGS
 #
-# The following family of states handles string literals in the input stream.
-# String literals are fairly simple: they begin with a quote, contain arbitrary
-# characters other than a bare \ or ", and end with a quote. (Note that ``\n``
-# is not an escape sequence: unescaped newlines are permitted within string
-# literals.)
+# The following family of states and state factories handles string literals in
+# the input stream. String literals are fairly simple: they begin with a quote,
+# contain arbitrary characters other than a bare \ or ", and end with a quote.
+# (Note that ``\n`` is not an escape sequence: unescaped newlines are permitted
+# within string literals.)
 #
-# These states use the lookahead to accumulate the characters of the string. On
-# transition back to ``tokenize_any``, the lookahead is always set back to a
-# single character. If, at any point, these states encounter EOF, they raise a
-# ``TokenError``: no legal token in Actinide begins with a quote mark and ends
-# with EOF.
+# These states accumulate the characters of the string. On transition back to
+# ``tokenize_any``, the accumulated characters are returned as a token. If, at
+# any point, these states encounter EOF or an invalid escape sequence, they
+# raise a ``TokenError``: no legal token in Actinide begins with a quote mark
+# and ends with EOF, and no legal token includes an invalid escape sequence.
 #
 # Because tokenization is only concerned with dividing the input into tokens,
 # this machine *does not* strip quotes or replace escape sequences. On success,
 # it generates a token containing the whole the string literal, verbatim.
 
-# The lookahead is assumed to be the opening quote of a string, and discarded.
-# Read forwards one character to determine whether this is an empty string
-# literal or not, then proceed either to ``tokenize_string_end`` for an empty
-# string, or to ``tokenize_string_character`` for a non-empty string.
+# Reads the first character of a string literal, and looks ahead one character
+# to determine how the string proceeds so that it can transition to an
+# appropriate state.
 #
-# This never yields a token. The lookahead is set to the characters of the
+# This never generates a token. The lookahead is set to the characters of the
 # string read so far.
-def tokenize_string(lookahead, port):
-    next = port.read(1)
+def tokenize_string(port):
+    quote = read_next(port)
+    next = peek_next(port)
     if next == '':
         raise TokenError('Unclosed string literal')
     if next == '\\':
-        return None, lookahead + next, tokenize_escaped_string_character
+        return None, tokenize_escaped_string_character(quote + read_next(port))
     if next == '"':
-        return None, lookahead + next, tokenize_string_end
-    return None, lookahead + next, tokenize_string_character
+        return None, tokenize_string_end(quote)
+    return None, tokenize_string_character(quote)
 
-# The lookahead contains the body of the string read so far. Reads forwards one
-# character to determine if the string continues, contains an escaped character,
-# or ends.
+# A state factory returning states which accumulate string characters. The
+# returned states look ahead one character to determine how to proceed, and read
+# one token under most circumstances.
 #
 # This never yields a token.
-def tokenize_string_character(lookahead, port):
-    next = port.read(1)
-    if next == '':
-        raise TokenError('Unclosed string literal')
-    if next == '\\':
-        return None, lookahead + next, tokenize_escaped_string_character
-    if next == '"':
-        return None, lookahead + next, tokenize_string_end
-    return None, lookahead + next, tokenize_string_character
+def tokenize_string_character(state):
+    def tokenize_string_character_next(port):
+        next = peek_next(port)
+        if next == '':
+            raise TokenError('Unclosed string literal')
+        if next == '\\':
+            return None, tokenize_escaped_string_character(state + read_next(port))
+        if next == '"':
+            return None, tokenize_string_end(state)
+        return None, tokenize_string_character(state + read_next(port))
+    return tokenize_string_character_next
 
-# The lookahead contains the body of the string so far. Reads forwards one
-# character to determine which, if any, escaped character to process: if it's
-# one we recognize, append it to the string, otherwise raise a TokenError.
+# A state factory returning states which only recognize valid string escaped
+# characters (``\\`` and ``"``). If they encounter a valid character, they
+# accumulate it onto the string being read and continue reading the string;
+# otherwise, they reject the string by raising a TokenError.
 #
-# This never yields a token, and always dispatches back to
-# ``tokenize_string_character`` on a legal escape character.
-def tokenize_escaped_string_character(lookahead, port):
-    next = port.read(1)
-    if next == '':
-        raise TokenError('Unclosed string literal')
-    if next == '"':
-        return None, lookahead + next, tokenize_string_character
-    if next == '\\':
-        return None, lookahead + next, tokenize_string_character
-    raise TokenError(f"Invalid string escape '\\{next}'")
-
-# Package the lookahead (the full string body, de-escaped and without leading
-# and trailing quotes) up as a String token and return it, then transition back
-# to the ``tokenize_any`` state with a single read result in the lookahead.
-def tokenize_string_end(lookahead, port):
-    return lookahead, port.read(1), tokenize_any
+# This never yields a token.
+def tokenize_escaped_string_character(state):
+    def tokenize_escaped_string_character_next(port):
+        next = read_next(port)
+        print(f'Esc: state={repr(state)} next={repr(next)} peek={repr(peek_next(port))}')
+        if next == '':
+            raise TokenError('Unclosed string literal')
+        if next in '\\"':
+            return None, tokenize_string_character(state + next)
+        raise TokenError(f"Invalid string escape '\\{next}'")
+    return tokenize_escaped_string_character_next
+
+# A state factory which terminates a string literal. These states read off the
+# closing quote mark, and generates the accumulated string as a token before
+# transitioning back to the ``tokenize_any`` state.
+def tokenize_string_end(state):
+    def tokenize_string_end_next(port):
+        return state + read_next(port), tokenize_any
+    return tokenize_string_end_next
+
+def read_next(port):
+    return read(port, 1)
+
+def peek_next(port):
+    return peek(port, 1)
diff --git a/bin/actinide-repl b/bin/actinide-repl
deleted file mode 100755
index 7909d36..0000000
--- a/bin/actinide-repl
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import actinide.tokenizer as at
-
-print(repr(list(at.tokenize(sys.stdin))))
diff --git a/tests/test_ports.py b/tests/test_ports.py
new file mode 100644
index 0000000..c2d1e06
--- /dev/null
+++ b/tests/test_ports.py
@@ -0,0 +1,29 @@
+from hypothesis import given
+from hypothesis.strategies import integers, text
+
+from actinide.ports import *
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_read(input, n):
+    port = string_to_input_port(input)
+    output = read(port, n)
+
+    assert input.startswith(output)
+    assert (len(output) == 0 and len(input) == 0) != (0 < len(output) <= n)
+    assert output + read_fully(port) == input
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_peek(input, n):
+    port = string_to_input_port(input)
+    output = peek(port, n)
+
+    assert input.startswith(output)
+    assert (len(output) == 0 and len(input) == 0) != (0 < len(output) <= n)
+    assert read_fully(port) == input
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_read_fully(input, n):
+    port = string_to_input_port(input)
+    output = read_fully(port)
+
+    assert output == input
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 76a07a9..5c0ddea 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -3,354 +3,40 @@ from hypothesis.strategies import just, text, characters, from_regex, one_of, tu
 import io
 
 from actinide.tokenizer import *
+from actinide.ports import *
 
-from .tokens import spaced_token_sequences
+from .tokens import spaced_token_sequences, tokens, nontokens
 
-class ReadablePort(io.StringIO):
-    def __repr__(self):
-        # Slightly friendlier debugging output
-        return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
-
-# Many of the following tests proceed by cases, because the underlying behaviour
-# is too complex to treat as a uniform set of properties. The cases are meant to
-# be total, and in principle could be defined as a set of filters on the
-# ``text()`` generator that , combined, exhaust the possible outcomes of that
-# generator.
-#
-# Implementing the tests that way causes Hypothesis to generate a significant
-# number of examples that it then throws away without verifying, because
-# Hypothesis has no insight into filters to use when generating examples.
-# Instead, this test suite specifies generators per-case.
-
-# Cases for tokenize_any:
-
-# We test this a bit differently from the subsequent tokenizer states. Because
-# it's a pure routing state, we can generate lookahead, expected_state pairs and
-# check them in one pass, rather than testing each possible outcome separately.
-# In every case, the input is irrelevant: this state never reads.
-
-def next_token_states():
-    return one_of(
-        tuples(just(''), just(tokenize_eof)),
-        tuples(just(';'), just(tokenize_comment)),
-        tuples(sampled_from('()'), just(tokenize_syntax)),
-        tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
-        tuples(just('"'), just(tokenize_atom)),
-        tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
-    )
-
-@given(next_token_states(), text())
-def test_tokenize_any(lookahead_next, input):
-    s, expected_state = lookahead_next
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_any(s, input)
-
-    assert token is None
-    assert lookahead == s
-    assert next == expected_state
-    assert port.tell() == 0
-
-# Since the previous test case is rigged for success, also verify that no input
-# causes tokenize_any to enter an unexpected state or to throw an exception.
-@given(text(), text())
-def test_tokenize_any_fuzz(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_any(s, input)
-
-    assert token is None
-    assert lookahead == s
-    assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
-    assert port.tell() == 0
-
-# Cases for tokenize_eof:
-
-# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
-# always returning to itself, and never generating a token.
-@given(text(), text())
-def test_tokenize_eof(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_eof(s, port)
-
-    assert token is None
-    assert lookahead == s
-    assert next == tokenize_eof
-    assert port.tell() == 0
-
-# Cases for tokenize_comment:
-
-# * any lookahead, one or more characters beginning with a non-newline as input:
-# tokenize_comment continues the current comment, throwing away one character
-# of input, without generating a token.
-@given(text(), from_regex(r'^[^\n].*'))
-def test_tokenize_comment_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_comment(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == input[0]
-    assert next == tokenize_comment
-
-# * any lookahead, one or more characters beginning with a newline as input, and
-# * any lookahead, empty input:
-# tokenize_comment concludes the current comment and prepares for the next
-# token, without generating a token.
-@given(text(), just('') | from_regex(r'^\n.*'))
-def test_tokenize_comment_ends(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_comment(s, port)
-
-    assert token is None
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for tokenize_syntax:
-
-# * any lookahead, any input: generate the lookahead as a Syntax token and
-# transition back to tokenize_any to prepare for the next token, with one
-# character of lookahead ready to go.
-@given(text(), text())
-def test_tokenize_syntax(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_syntax(s, port)
-
-    assert token == s
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for test_tokenize_whitespace:
-
-# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
-# transition back to tokenize_any to prepare for the next token, with one
-# character of lookahead ready to go, without generating a token.
-@given(text(), text())
-def test_tokenize_whitespace(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_whitespace(s, port)
-
-    assert token is None
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for tokenize_nonstring_atom:
-
-# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
-# comment delimiter, or a string literal: accumulate one character of input
-# onto the lookahead, then transition back to tokenize_symbol to process the
-# next character of input, without generating a token.
-@given(text(), from_regex(r'^[^ \n\t();"].*'))
-def test_tokenize_nonstring_atom_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_nonstring_atom(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_nonstring_atom
-
-# * any lookahead, a non-empty input beginning with whitespace, syntax, a
-# comment delimiter, or a string literal, and
-# * any lookahead, empty input:
-# generate the accumulated input as a Symbol token, then transition back to tokenize_any with one character of lookahead ready to go.
-@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
-def test_tokenize_tokenize_nonstring_atom_ends(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_nonstring_atom(s, port)
-
-    assert token == s
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# And now, the _worst_ part of the state machine. Cases for tokenize_string:
-
-# * any lookahead, a non-empty input not beginning with a string delimiter:
-# begin a non-empty string by transitioning to the tokenize_string_character
-# state with one character of lookahead, without generating a token.
-@given(text(), from_regex(r'^[^"].*'))
-def test_tokenize_string_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_character
-
-# * any lookahad, a non-empty input beginning with a string delimiter: terminate
-# an empty string by transitioning to the tokenize_string_end state with an
-# *empty* lookahead, without generating a token.
-@given(text(), from_regex(r'^["].*'))
-def test_tokenize_string_empty(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_end
-
-# * any lookahead, empty input: emit a tokenization error, as we've encountered
-# EOF inside of a string.
-@given(text(), just(''))
-def test_tokenize_string_eof(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_string(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert port.tell() == 0
-
-# Cases for tokenize_string_character:
-
-# * any lookahead, any non-empty input not beginning with a string delimiter or
-# escape character: append one character of input to the lookahead, then
-# continue in the tokenize_string_character state without generating a token.
-@given(text(), from_regex(r'^[^\\"].*'))
-def test_tokenize_string_character_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_character
-
-# * any lookahead, any non-empty input which begins with an escape character:
-# leave the lookahead unchanged, but transition to the
-# tokenize_escaped_string_character state to determine which escape character
-# we're dealing with, without emitting a token.
-@given(text(), from_regex(r'^[\\].*'))
-def test_tokenize_string_character_begins_escape(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_escaped_string_character
-
-# * any lookahead, any non-empty input which begins with a string delimiter:
-# we're at the end of a string. Transition to the tokenize_string_end state
-# with the current lookahead, without generating a token.
-@given(text(), from_regex(r'^["].*'))
-def test_tokenize_string_character_ends(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_end
-
-# * any lookahead, empty input: emit a tokenization error, as we've encountered
-# EOF inside of a string literal.
-@given(text(), just(''))
-def test_tokenize_string_character_eof(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_string_character(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert input == ''
-        assert port.tell() == 0
-
-# Cases for tokenize_escaped_string:
-
-# * any lookahead, any non-empty input beginning with a legal string escaped
-# character: de-escape the first character of the input, append the result to
-# the lookahead, then transition back to the tokenize_string_character state.
-@given(text(), from_regex(r'^["\\].*'))
-def test_tokenize_escaped_string_character_valid(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_escaped_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_character
-
-# * any lookahead, any non-empty input not beginning with a legal string escaped
-# character: emit a tokenization error, we've found an invalid string escape.
-@given(text(), from_regex(r'^[^"\\].*'))
-def test_tokenize_escaped_string_character_invalid(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_escaped_string_character(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert port.tell() == 1
-
-# * any lookahead, empty input: emit a tokenization error, we've found an EOF
-# inside of a string literal.
-@given(text(), just(''))
-def test_tokenize_escaped_string_character_eof(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_escaped_string_character(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert port.tell() == 0
-
-# Cases for tokenize_string_end:
-
-# * any lookahead, any input: generate a String token from the lookahead, then
-# transition back to the tokenize_any state with one character of lookahead
-# ready to go.
-@given(text(), text())
-def test_tokenize_string_end(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_end(s, port)
-
-    assert token == s
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for tokenize_atom:
-
-# * lookahead containing a string delimiter, any input: found a string atom,
-# transition to the tokenize_string state without reading or generating a
-# token.
-@given(just('"'), text())
-def test_tokenize_atom_string(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_atom(s, port)
+# Cases for the tokenizer:
 
-    assert token is None
-    assert port.tell() == 0
-    assert lookahead == s
-    assert next == tokenize_string
+# * any single token: reads back that token.
+@given(tokens())
+def test_tokenizer_single_token(input):
+    port = string_to_input_port(input)
 
-# * lookahead containing something other than a string delimiter, any input:
-# found a nonstring atom, transition to the tokenize_nonstring_atom state
-# without reading or generating a token.
-@given(from_regex(r'^[^"]'), text())
-def test_tokenize_atom_nonstring(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_atom(s, port)
+    assert read_token(port) == input
 
-    assert token is None
-    assert port.tell() == 0
-    assert lookahead == s
-    assert next == tokenize_nonstring_atom
+# * any input guaranteed not to contain a token: reads back None, consuming the
+# whole input in the process.
+@given(nontokens())
+def test_tokenizer_no_token(input):
+    port = string_to_input_port(input)
 
-# Cases for the tokenizer:
+    assert read_token(port) == None
 
 # * any sequence of separator-token pairs: if the pairs are coalesced into a
 # single giant input, does the tokenizer recover the tokens?
 @given(spaced_token_sequences())
-def test_tokenizer(spaced_tokens):
+def test_tokenizer_spaced_sequence(spaced_tokens):
     input = ''.join(''.join(pair) for pair in spaced_tokens)
     tokens = [token for (_, token) in spaced_tokens]
-    port = ReadablePort(input)
+    port = string_to_input_port(input)
+    def iterate_read_token(port):
+        token = read_token(port)
+        while token is not None:
+            yield token
+            token = read_token(port)
+
+    assert list(iterate_read_token(port)) == tokens
 
-    assert list(tokenize(port)) == tokens
diff --git a/tests/tokens.py b/tests/tokens.py
index 0027fb2..3eb58b8 100644
--- a/tests/tokens.py
+++ b/tests/tokens.py
@@ -48,6 +48,10 @@ def whitespace_characters():
 def tokens():
     return one_of(symbols(), strings(), open_parens(), close_parens())
 
+# Generates a string which may not be empty, but which does not contain a token.
+def nontokens():
+    return one_of(whitespace(), comments(), just(''))
+
 # Generates at least one character of whitespace.
 def whitespace():
     return text(whitespace_characters(), min_size=1)
