-rw-r--r--  actinide/tokenizer.py    114
-rw-r--r--  actinide/types.py         12
-rw-r--r--  tests/__init__.py          0
-rw-r--r--  tests/test_tokenizer.py  291
-rw-r--r--  tests/tokens.py           90
5 files changed, 382 insertions, 125 deletions
diff --git a/actinide/tokenizer.py b/actinide/tokenizer.py
index 8fb9d0a..9767033 100644
--- a/actinide/tokenizer.py
+++ b/actinide/tokenizer.py
@@ -1,5 +1,3 @@
-from .types import *
-
# ## TOKENIZATION
#
# The following code implements a state-machine-driven tokenizer which can
@@ -8,19 +6,20 @@ from .types import *
#
# * Comments: ``;`` followed by all bytes to EOF or to the end of the line.
#
-# * Strings: ``"`` through to the next unescaped ``"`` are read, de-escaped, and
-# returned. The sequences ``\"`` and ``\\`` are treated specially: the former
-# de-escapes to ``"``, and the latter to ``\``. An unclosed string literal or
-# an unknown escape sequence is a tokenization error.
-#
# * Open and close parens: ``(`` and ``)`` are returned as freestanding tokens.
#
# * Whitespace: Space, horizontal tab, and newline characters are discarded
# during tokenization.
#
-# * Symbols: Any sequence of characters not included in one of the above classes
+# * Strings: ``"`` through to the next unescaped ``"`` are read and returned.
+#   A ``\`` within the string begins an escape sequence, and may only be
+#   followed by ``"`` or ``\``. An unclosed string literal or an unknown escape
+#   sequence is a tokenization error.
+#
+# * Atoms: Any sequence of characters not included in one of the above classes
# is read and returned as a single token. This includes words, numbers, and
-# special literals.
+#   special literals. (Strings are, technically, a kind of atom, but the
+#   tokenizer treats them specially due to their complexity.)
#
# Internally, the tokenizer is a state machine which maintains two pieces of
# state: the "lookahead" (holding data to feed to the next state transition
@@ -34,6 +33,11 @@ from .types import *
# port) to determine the next state, and return a 3-tuple of ``token`` (may be
# ``None``), ``lookahead`` (which replaces the previous lookahead), and ``next``
# (the new next state transition function).
+#
+# This is heavily inspired by various tail-recursive approaches to tokenizing
+# Lisp streams. However, the host language does not guarantee tail-call
+# optimization, so we use an explicit trampoline function to drive the state
+# machine instead of having each state call the next directly.
class TokenError(Exception):
'''
@@ -47,20 +51,20 @@ class TokenError(Exception):
#
# This is the top-level driver for the state machine that divides the underlying
# input into tokens. It does no input handling itself, other than reading the
-# first character of the port: so long as the lookahead is non-empty, this calls
-# the next state transition function to determine what to do and how to change
-# the lookahead.
+# first character of the port: this calls the next state transition function to
+# determine what to do and how to change the lookahead.
#
-# Initially, this is in the ``tokenize_any`` state.
+# Initially, this is in the ``tokenize_any`` state, and exits once it reaches
+# the ``tokenize_eof`` state.
def tokenize(port):
lookahead, next = port.read(1), tokenize_any
- while len(lookahead) > 0:
+ while next != tokenize_eof:
token, lookahead, next = next(lookahead, port)
if token is not None:
yield token
-# If the lookahead is exactly one read result, this will correctly determine the
-# next token type and return that state without consuming input. This is
+# If the lookahead is exactly one character, this will correctly determine the
+# next token type and transition to that state without consuming input. This is
# generally the correct state to transition to any time the next token is
# unknown - for example, at the end of another token.
#
@@ -75,9 +79,7 @@ def tokenize_any(lookahead, port):
return None, lookahead, tokenize_syntax
if lookahead in ' \t\n':
return None, lookahead, tokenize_whitespace
- if lookahead == '"':
- return None, lookahead, tokenize_string
- return None, lookahead, tokenize_symbol
+ return None, lookahead, tokenize_atom
# Special trap state. This never produces a token, and always transitions to
# itself. The lookahead in this state is generally ``''``, and since this never
@@ -89,8 +91,8 @@ def tokenize_any(lookahead, port):
def tokenize_eof(lookahead, port):
return None, lookahead, tokenize_eof
-# Consumes one read result at a time until it finds an end of line or runs out
-# of input. This throws away comments entirely, at parse time, without
+# Consumes one character at a time until it finds an end of line or runs out of
+# input. This throws away comments entirely, at tokenization time, without
# considering whether the comment content can be separated into tokens. As this
# scans the comment, the lookahead will be set to successive characters from the
# port, but never more than one character at a time.
@@ -104,51 +106,64 @@ def tokenize_comment(lookahead, port):
return None, next, tokenize_any
return None, next, tokenize_comment
-# Consumes the lookahead and packages it up as a Syntax token. This is generally
-# appropriate for the ``(`` and ``)`` syntactic elements.
+# Generates the entire lookahead as a token. This is generally appropriate for
+# the ``(`` and ``)`` syntactic elements.
#
# The resulting lookahead will be the next character of input, and this always
# dispatches back to ``tokenize_any`` so that the next token (if any) can be
# determined.
def tokenize_syntax(lookahead, port):
- return Syntax(lookahead), port.read(1), tokenize_any
+ return lookahead, port.read(1), tokenize_any
-# Consumes and ignores whitespace in the input. This never produces a token, and
+# Consumes and ignores one character of input. This never produces a token, and
# throws away the lookahead entirely. The resulting lookahead is the next
# character of input.
def tokenize_whitespace(lookahead, port):
return None, port.read(1), tokenize_any
-# Consumes characters until it finds a character which cannot be part of a token
-# or until it finds the end of input, accumulating them into a single Symbol
-# token. This is a heavily-overloaded token category, as it contains not only
-# Actinide symbols but also all non-String literals.
+# We've ruled out all non-atom tokens. If the lookahead is a string delimiter,
+# transitions to a state which tokenizes a single string literal; otherwise,
+# transitions to a state which consumes a single non-string atom. In both cases,
+# this leaves the lookahead alone, and generates no token.
+def tokenize_atom(lookahead, port):
+ if lookahead == '"':
+ return None, lookahead, tokenize_string
+ return None, lookahead, tokenize_nonstring_atom
+
+# Consumes characters until it finds a character which cannot be part of a
+# non-string atom, or until it finds the end of input, accumulating them into a
+# single token. This is a heavily-overloaded token category, as it contains not
+# only Actinide symbols but also all non-String literals.
#
# While the tokenizer remains in this state, the lookahead accumulates the
-# characters of the token. When this matches a completed token, it produces a
-# Symbol token, and resets the lookahead back to a single read result containing
-# the next character of input.
+# characters of the token. When this matches a completed token, it emits the
+# accumulated lookahead as a token, and resets the lookahead back to a single
+# read result containing the next character of input.
-def tokenize_symbol(lookahead, port):
+def tokenize_nonstring_atom(lookahead, port):
next = port.read(1)
if next == '':
- return Symbol(lookahead), next, tokenize_any
+ return lookahead, next, tokenize_any
if next in '"(); \t\n':
- return Symbol(lookahead), next, tokenize_any
- return None, lookahead + next, tokenize_symbol
+ return lookahead, next, tokenize_any
+ return None, lookahead + next, tokenize_nonstring_atom
# ### STRINGS
#
-# The following states handle string literals in the input stream. String
-# literals are fairly simple: they begin with a quote, contain arbitrary
-# characters other than a bare \ or ", and end with a quote. The sequences
-# ``\\`` and ``\"`` are de-escaped by removing the leading backslash and
-# included in the resulting string.
+# The following family of states handles string literals in the input stream.
+# String literals are fairly simple: they begin with a quote, contain arbitrary
+# characters other than a bare \ or ", and end with a quote. (Note that ``\n``
+# is not an escape sequence: unescaped newlines are permitted within string
+# literals.)
#
# These states use the lookahead to accumulate the characters of the string. On
# transition back to ``tokenize_any``, the lookahead is always set back to a
# single character. If, at any point, these states encounter EOF, they raise a
# ``TokenError``: no legal token in Actinide begins with a quote mark and ends
# with EOF.
+#
+# Because tokenization is only concerned with dividing the input into tokens,
+# this machine *does not* strip quotes or replace escape sequences. On success,
+# it generates a token containing the whole string literal, verbatim.
-# The lookahead is assumed to be the opening quote of a string, and discarded.
+# The lookahead is assumed to be the opening quote of a string, and retained as
+# the start of the accumulated literal.
# Read forwards one character to determine whether this is an empty string
@@ -161,9 +176,11 @@ def tokenize_string(lookahead, port):
next = port.read(1)
if next == '':
raise TokenError('Unclosed string literal')
+ if next == '\\':
+ return None, lookahead + next, tokenize_escaped_string_character
if next == '"':
- return None, '', tokenize_string_end
- return None, next, tokenize_string_character
+ return None, lookahead + next, tokenize_string_end
+ return None, lookahead + next, tokenize_string_character
# The lookahead contains the body of the string read so far. Reads forwards one
# character to determine if the string continues, contains an escaped character,
@@ -175,15 +192,14 @@ def tokenize_string_character(lookahead, port):
if next == '':
raise TokenError('Unclosed string literal')
if next == '\\':
- return None, lookahead, tokenize_escaped_string_character
+ return None, lookahead + next, tokenize_escaped_string_character
if next == '"':
- return None, lookahead, tokenize_string_end
+ return None, lookahead + next, tokenize_string_end
return None, lookahead + next, tokenize_string_character
# The lookahead contains the body of the string so far. Reads forwards one
# character to determine which, if any, escaped character to process: if it's
-# one we recognize, de-escape it and append it to the string, otherwise raise a
-# TokenError.
+# one we recognize, append it to the string; otherwise, raise a TokenError.
#
# This never yields a token, and always dispatches back to
# ``tokenize_string_character`` on a legal escape character.
@@ -191,14 +207,14 @@ def tokenize_escaped_string_character(lookahead, port):
next = port.read(1)
if next == '':
raise TokenError('Unclosed string literal')
- if next == 'n':
- return None, lookahead + '\n', tokenize_string_character
+ if next == '"':
+ return None, lookahead + next, tokenize_string_character
if next == '\\':
- return None, lookahead + '\\', tokenize_string_character
- raise TokenError(f"Unknown string escape '\{next}'")
+ return None, lookahead + next, tokenize_string_character
+ raise TokenError(f"Invalid string escape '\\{next}'")
-# Package the lookahead (the full string body, de-escaped and without leading
-# and trailing quotes) up as a String token and return it, then transition back
-# to the ``tokenize_any`` state with a single read result in the lookahead.
+# Emits the lookahead (the full string literal, verbatim, with its quotes and
+# escape sequences intact) as a token, then transitions back to the
+# ``tokenize_any`` state with a single read result in the lookahead.
def tokenize_string_end(lookahead, port):
- return String(lookahead), port.read(1), tokenize_any
+ return lookahead, port.read(1), tokenize_any
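As a quick orientation for the change above, here is a minimal usage sketch (not part of the patch). It assumes the package is importable as ``actinide.tokenizer``, as the paths above suggest, and that anything with a ``read(1)`` method works as a port, so ``io.StringIO`` is enough:

import io
from actinide.tokenizer import tokenize

# Any object with a read(1) method can serve as a port.
port = io.StringIO('(display "hi\\"there") ; trailing comment')

# After this change every token is a plain str. String literals come back
# verbatim, quotes and escape sequences included; de-escaping is left to a
# later stage.
print(list(tokenize(port)))
# Expected, per the comments above: ['(', 'display', '"hi\\"there"', ')']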
diff --git a/actinide/types.py b/actinide/types.py
deleted file mode 100644
index 2b618f4..0000000
--- a/actinide/types.py
+++ /dev/null
@@ -1,12 +0,0 @@
-__all__ = ['String', 'Symbol', 'Syntax']
-
-# ## REPRESENTATIONS
-#
-# The following defines specify the Python representations of various Actinide
-# types.
-
-String = str
-class Symbol(str):
- pass
-class Syntax(str):
- pass
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/__init__.py
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index a300eb2..76a07a9 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,17 +1,71 @@
-from hypothesis import given, settings, HealthCheck
-from hypothesis.strategies import text, from_regex
+from hypothesis import given, settings, HealthCheck, event
+from hypothesis.strategies import just, text, characters, from_regex, one_of, tuples, sampled_from
import io
from actinide.tokenizer import *
+from .tokens import spaced_token_sequences
+
class ReadablePort(io.StringIO):
def __repr__(self):
# Slightly friendlier debugging output
return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
-def not_(f):
- return lambda *args, **kwargs: not f(*args, **kwargs)
+# Many of the following tests proceed by cases, because the underlying behaviour
+# is too complex to treat as a uniform set of properties. The cases are meant to
+# be total, and in principle could be defined as a set of filters on the
+# ``text()`` generator that, combined, exhaust the possible outcomes of that
+# generator.
+#
+# Implementing the tests that way causes Hypothesis to generate a significant
+# number of examples that it then throws away unverified, because Hypothesis
+# has no insight into the filters when generating candidate examples.
+# Instead, this test suite specifies generators per-case.
+
+# Cases for tokenize_any:
+
+# We test this a bit differently from the subsequent tokenizer states. Because
+# it's a pure routing state, we can generate lookahead, expected_state pairs and
+# check them in one pass, rather than testing each possible outcome separately.
+# In every case, the input is irrelevant: this state never reads.
+
+def next_token_states():
+ return one_of(
+ tuples(just(''), just(tokenize_eof)),
+ tuples(just(';'), just(tokenize_comment)),
+ tuples(sampled_from('()'), just(tokenize_syntax)),
+ tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
+ tuples(just('"'), just(tokenize_atom)),
+ tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
+ )
+
+@given(next_token_states(), text())
+def test_tokenize_any(lookahead_next, input):
+ s, expected_state = lookahead_next
+ port = ReadablePort(input)
+    token, lookahead, next = tokenize_any(s, port)
+
+ assert token is None
+ assert lookahead == s
+ assert next == expected_state
+ assert port.tell() == 0
+# Since the previous test case is rigged for success, also verify that no input
+# causes tokenize_any to enter an unexpected state or to throw an exception.
+@given(text(), text())
+def test_tokenize_any_fuzz(s, input):
+ port = ReadablePort(input)
+    token, lookahead, next = tokenize_any(s, port)
+
+ assert token is None
+ assert lookahead == s
+ assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
+ assert port.tell() == 0
+
+# Cases for tokenize_eof:
+
+# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
+# always returning to itself, and never generating a token.
@given(text(), text())
def test_tokenize_eof(s, input):
port = ReadablePort(input)
@@ -22,12 +76,12 @@ def test_tokenize_eof(s, input):
assert next == tokenize_eof
assert port.tell() == 0
-def comment_continues(text):
- if text == '':
- return False
- return text[0] != '\n'
+# Cases for tokenize_comment:
-@given(text(), text().filter(comment_continues))
+# * any lookahead, one or more characters beginning with a non-newline as input:
+# tokenize_comment continues the current comment, throwing away one character
+# of input, without generating a token.
+@given(text(), from_regex(r'^[^\n].*'))
def test_tokenize_comment_continues(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_comment(s, port)
@@ -37,7 +91,11 @@ def test_tokenize_comment_continues(s, input):
assert lookahead == input[0]
assert next == tokenize_comment
-@given(text(), text().filter(not_(comment_continues)))
+# * any lookahead, one or more characters beginning with a newline as input, and
+# * any lookahead, empty input:
+# tokenize_comment concludes the current comment and prepares for the next
+# token, without generating a token.
+@given(text(), just('') | from_regex(r'^\n.*'))
def test_tokenize_comment_ends(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_comment(s, port)
@@ -47,17 +105,26 @@ def test_tokenize_comment_ends(s, input):
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
+# Cases for tokenize_syntax:
+
+# * any lookahead, any input: generate the lookahead as a token and
+# transition back to tokenize_any to prepare for the next token, with one
+# character of lookahead ready to go.
@given(text(), text())
def test_tokenize_syntax(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_syntax(s, port)
- assert token == Syntax(s)
- assert isinstance(token, Syntax)
+ assert token == s
assert port.tell() == (1 if input else 0)
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
+# Cases for tokenize_whitespace:
+
+# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
+# transition back to tokenize_any to prepare for the next token, with one
+# character of lookahead ready to go, without generating a token.
@given(text(), text())
def test_tokenize_whitespace(s, input):
port = ReadablePort(input)
@@ -68,67 +135,82 @@ def test_tokenize_whitespace(s, input):
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
-def symbol_continues(text):
- if text == '':
- return False
- return text[0] not in ' \n\t();"'
+# Cases for tokenize_nonstring_atom:
-@given(text(), text().filter(symbol_continues))
-def test_tokenize_symbol_continues(s, input):
+# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
+#   comment delimiter, or a string delimiter: accumulate one character of input
+#   onto the lookahead, then transition back to tokenize_nonstring_atom to
+#   process the next character of input, without generating a token.
+@given(text(), from_regex(r'^[^ \n\t();"].*'))
+def test_tokenize_nonstring_atom_continues(s, input):
port = ReadablePort(input)
- token, lookahead, next = tokenize_symbol(s, port)
+ token, lookahead, next = tokenize_nonstring_atom(s, port)
assert token is None
assert port.tell() == 1
assert lookahead == s + input[0]
- assert next == tokenize_symbol
-
-@given(text(), text().filter(not_(symbol_continues)))
-def test_tokenize_symbol_ends(s, input):
+ assert next == tokenize_nonstring_atom
+
+# * any lookahead, a non-empty input beginning with whitespace, syntax, a
+#   comment delimiter, or a string delimiter, and
+# * any lookahead, empty input:
+#   generate the accumulated lookahead as a token, then transition back to
+#   tokenize_any with one character of lookahead ready to go.
+@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
+def test_tokenize_nonstring_atom_ends(s, input):
port = ReadablePort(input)
- token, lookahead, next = tokenize_symbol(s, port)
+ token, lookahead, next = tokenize_nonstring_atom(s, port)
- assert token == Symbol(s)
- assert isinstance(token, Symbol)
+ assert token == s
assert port.tell() == (1 if input else 0)
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
-def string_continues(text):
- if text == '':
- return False
- return not text[0] == '"'
+# And now, the _worst_ part of the state machine. Cases for tokenize_string:
-@given(text(), text().filter(string_continues))
+# * any lookahead, a non-empty input not beginning with a string delimiter:
+#   begin a non-empty string by transitioning to the tokenize_string_character
+#   state, appending one character of input to the lookahead, without
+#   generating a token.
+@given(text(), from_regex(r'^[^"].*'))
def test_tokenize_string_continues(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string(s, port)
assert token is None
assert port.tell() == 1
- assert lookahead == input[0]
+ assert lookahead == s + input[0]
assert next == tokenize_string_character
-@given(text(), text().filter(not_(string_continues)))
-def test_tokenize_string_ends(s, input):
+# * any lookahead, a non-empty input beginning with a string delimiter:
+#   terminate an empty string by transitioning to the tokenize_string_end
+#   state, appending the closing quote to the lookahead, without generating a
+#   token.
+@given(text(), from_regex(r'^["].*'))
+def test_tokenize_string_empty(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_end
+
+# * any lookahead, empty input: emit a tokenization error, as we've encountered
+# EOF inside of a string.
+@given(text(), just(''))
+def test_tokenize_string_eof(s, input):
try:
port = ReadablePort(input)
token, lookahead, next = tokenize_string(s, port)
- assert token is None
- assert port.tell() == 1
- assert lookahead == ''
- assert next == tokenize_string_end
+ assert False # must raise
except TokenError:
- assert input == ''
assert port.tell() == 0
-def is_escape(text):
- if text == '':
- return False
- return text[0] == '\\'
+# Cases for tokenize_string_character:
-@given(text(), text().filter(string_continues).filter(not_(is_escape)))
+# * any lookahead, any non-empty input not beginning with a string delimiter or
+# escape character: append one character of input to the lookahead, then
+# continue in the tokenize_string_character state without generating a token.
+@given(text(), from_regex(r'^[^\\"].*'))
def test_tokenize_string_character_continues(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string_character(s, port)
@@ -138,56 +220,137 @@ def test_tokenize_string_character_continues(s, input):
assert lookahead == s + input[0]
assert next == tokenize_string_character
-# Using from_regex() rather than text() because searching randomly for strings
-# that start with a specific character is far, _far_ too slow. (It often fails
-# to find any examples.) I _think_ this preserves the property that this group
-# of three tests are exhaustive, but it's not as obvious as it would be if I
-# could use text() here.
-@given(text(), from_regex(r'\\.*').filter(string_continues).filter(is_escape))
+# * any lookahead, any non-empty input which begins with an escape character:
+#   append the escape character to the lookahead, then transition to the
+#   tokenize_escaped_string_character state to determine which escape sequence
+#   we're dealing with, without emitting a token.
+@given(text(), from_regex(r'^[\\].*'))
def test_tokenize_string_character_begins_escape(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string_character(s, port)
assert token is None
assert port.tell() == 1
- assert lookahead == s
+ assert lookahead == s + input[0]
assert next == tokenize_escaped_string_character
-@given(text(), text().filter(not_(string_continues)))
+# * any lookahead, any non-empty input which begins with a string delimiter:
+#   we're at the end of a string. Transition to the tokenize_string_end state,
+#   appending the closing quote to the lookahead, without generating a token.
+@given(text(), from_regex(r'^["].*'))
def test_tokenize_string_character_ends(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_end
+
+# * any lookahead, empty input: emit a tokenization error, as we've encountered
+# EOF inside of a string literal.
+@given(text(), just(''))
+def test_tokenize_string_character_eof(s, input):
try:
port = ReadablePort(input)
token, lookahead, next = tokenize_string_character(s, port)
- assert token is None
- assert port.tell() == 1
- assert lookahead == s
- assert next == tokenize_string_end
+ assert False # must raise
except TokenError:
assert input == ''
assert port.tell() == 0
-@given(text(), text())
-def test_tokenize_escaped_string_character(s, input):
+# Cases for tokenize_escaped_string_character:
+
+# * any lookahead, any non-empty input beginning with a legal string escape
+#   character: append the first character of the input to the lookahead
+#   verbatim, then transition back to the tokenize_string_character state.
+@given(text(), from_regex(r'^["\\].*'))
+def test_tokenize_escaped_string_character_valid(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_escaped_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_character
+
+# * any lookahead, any non-empty input not beginning with a legal string escape
+#   character: emit a tokenization error, as we've found an invalid string
+#   escape.
+@given(text(), from_regex(r'^[^"\\].*'))
+def test_tokenize_escaped_string_character_invalid(s, input):
try:
port = ReadablePort(input)
token, lookahead, next = tokenize_escaped_string_character(s, port)
- assert token is None
+ assert False # must raise
+ except TokenError:
assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_character
+
+# * any lookahead, empty input: emit a tokenization error, as we've encountered
+#   EOF inside of a string literal.
+@given(text(), just(''))
+def test_tokenize_escaped_string_character_eof(s, input):
+ try:
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_escaped_string_character(s, port)
+
+ assert False # must raise
except TokenError:
- assert input == '' or input[0] not in '\\n'
- assert port.tell() == (1 if input else 0)
+ assert port.tell() == 0
+
+# Cases for tokenize_string_end:
+
+# * any lookahead, any input: generate the lookahead as a token, then
+#   transition back to the tokenize_any state with one character of lookahead
+#   ready to go.
@given(text(), text())
def test_tokenize_string_end(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string_end(s, port)
assert token == s
- assert isinstance(token, String)
assert port.tell() == (1 if input else 0)
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
+
+# Cases for tokenize_atom:
+
+# * lookahead containing a string delimiter, any input: found a string atom,
+# transition to the tokenize_string state without reading or generating a
+# token.
+@given(just('"'), text())
+def test_tokenize_atom_string(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_atom(s, port)
+
+ assert token is None
+ assert port.tell() == 0
+ assert lookahead == s
+ assert next == tokenize_string
+
+# * lookahead containing something other than a string delimiter, any input:
+# found a nonstring atom, transition to the tokenize_nonstring_atom state
+# without reading or generating a token.
+@given(from_regex(r'^[^"]'), text())
+def test_tokenize_atom_nonstring(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_atom(s, port)
+
+ assert token is None
+ assert port.tell() == 0
+ assert lookahead == s
+ assert next == tokenize_nonstring_atom
+
+# Cases for the tokenizer:
+
+# * any sequence of separator-token pairs: if the pairs are coalesced into a
+# single giant input, does the tokenizer recover the tokens?
+@given(spaced_token_sequences())
+def test_tokenizer(spaced_tokens):
+ input = ''.join(''.join(pair) for pair in spaced_tokens)
+ tokens = [token for (_, token) in spaced_tokens]
+
+ port = ReadablePort(input)
+
+ assert list(tokenize(port)) == tokens
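To make the round-trip property above concrete, here is a hand-written instance of the kind of example ``spaced_token_sequences()`` draws (illustrative values only, not part of the patch):

import io
from actinide.tokenizer import tokenize

spaced_tokens = [
    ('', '('),            # parens need no separator
    (' ', 'define'),      # symbols always get a separator in front
    (' ', 'greeting'),
    ('; doc\n', '"hi"'),  # a comment also works as a separator
    ('', ')'),
]
input = ''.join(sep + token for sep, token in spaced_tokens)   # '( define greeting; doc\n"hi")'
tokens = [token for _, token in spaced_tokens]

assert list(tokenize(io.StringIO(input))) == tokens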
diff --git a/tests/tokens.py b/tests/tokens.py
new file mode 100644
index 0000000..0027fb2
--- /dev/null
+++ b/tests/tokens.py
@@ -0,0 +1,90 @@
+from hypothesis.strategies import just, one_of, characters, text, lists, tuples
+from hypothesis.strategies import composite, recursive
+
+# Generators for token families
+
+# Generates the `(` token.
+def open_parens():
+ return just('(')
+
+# Generates the ')' token.
+def close_parens():
+ return just(')')
+
+# Generates characters that are legal, unescaped, inside of a string.
+def string_bare_characters():
+ return characters(blacklist_characters='\\"')
+
+# Generates legal string escape sequences.
+def string_escaped_characters():
+ return one_of(just('"'), just('\\')).map(lambda c: '\\' + c)
+
+# Generates single-character string representations, including escapes.
+def string_characters():
+ return one_of(string_bare_characters(), string_escaped_characters())
+
+# Generates arbitrary string bodies (strings, without leading or trailing
+# quotes)
+def string_body():
+ return text(string_characters())
+
+# Generates legal strings.
+def strings():
+ return tuples(just('"'), string_body(), just('"')).map(lambda t: ''.join(t))
+
+# Generates characters which are legal within a symbol.
+def symbol_characters():
+ return characters(blacklist_characters=' \t\n();"')
+
+# Generates legal symbols.
+def symbols():
+ return text(symbol_characters(), min_size=1)
+
+# Generates single whitespace characters.
+def whitespace_characters():
+ return one_of(just('\n'), just(' '), just('\t'))
+
+# Generates a single token.
+def tokens():
+ return one_of(symbols(), strings(), open_parens(), close_parens())
+
+# Generates at least one character of whitespace.
+def whitespace():
+ return text(whitespace_characters(), min_size=1)
+
+# Generates characters which can legally appear inside of a comment (anything
+# but a newline).
+def comment_characters():
+ return characters(blacklist_characters='\n')
+
+# Generates a (possibly-empty) comment, terminated with a trailing newline.
+def comments():
+ return tuples(just(';'), text(comment_characters()), just('\n')).map(lambda t: ''.join(t))
+
+# Generates sequences which can be inserted between arbitrary pairs of tokens
+# without changing their meaning.
+def intertokens():
+ return one_of(comments(), whitespace())
+
+# Generates a pair such that the second element is a token, and concatenating
+# the elements produces a string that tokenizes to exactly the second element.
+def spaced_tokens():
+ def spaced(strategy):
+ return tuples(intertokens(), strategy)
+ def unspaced(strategy):
+ return tuples(one_of(just(''), intertokens()), strategy)
+ def spaced_symbols():
+ return spaced(symbols())
+ def spaced_strings():
+ return unspaced(strings())
+ def spaced_open_parens():
+ return unspaced(open_parens())
+ def spaced_close_parens():
+ return unspaced(close_parens())
+
+ return one_of(spaced_symbols(), spaced_strings(), spaced_open_parens(), spaced_close_parens())
+
+# Generates a list of pairs as per spaced_tokens().
+def spaced_token_sequences():
+ return lists(spaced_tokens())
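For a quick sanity check of these generators from a REPL, ``Strategy.example()`` works (illustrative only; Hypothesis discourages ``example()`` inside tests, and the values shown are merely possible draws):

from tests.tokens import strings, symbols, comments, spaced_token_sequences

print(strings().example())                # e.g. "a\"b" -- quotes and escapes kept verbatim
print(symbols().example())                # e.g. foo or 1234
print(comments().example())               # e.g. ; some text, ending in a newline
print(spaced_token_sequences().example()) # e.g. [(' ', 'foo'), ('', '(')]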