Diffstat (limited to 'tests/test_tokenizer.py')
-rw-r--r--  tests/test_tokenizer.py  291
1 file changed, 227 insertions(+), 64 deletions(-)
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index a300eb2..76a07a9 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,17 +1,71 @@
-from hypothesis import given, settings, HealthCheck
-from hypothesis.strategies import text, from_regex
+from hypothesis import given, settings, HealthCheck, event
+from hypothesis.strategies import just, text, characters, from_regex, one_of, tuples, sampled_from
import io
from actinide.tokenizer import *
+from .tokens import spaced_token_sequences
+
class ReadablePort(io.StringIO):
def __repr__(self):
# Slightly friendlier debugging output
return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
-def not_(f):
- return lambda *args, **kwargs: not f(*args, **kwargs)
+# Many of the following tests proceed by cases, because the underlying behaviour
+# is too complex to treat as a uniform set of properties. The cases are meant to
+# be total, and in principle could be defined as a set of filters on the
+# ``text()`` generator that, combined, exhaust the possible outcomes of that
+# generator.
+#
+# Implementing the tests that way causes Hypothesis to generate a significant
+# number of examples that it then throws away unverified, because Hypothesis
+# cannot see inside a filter and use it to guide example generation.
+# Instead, this test suite specifies generators per-case.
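+
+# For illustration (hypothetical names, not used by any test below): the
+# filter-based form of the "comment continues" input forces Hypothesis to
+# discard every generated example that fails the predicate, whereas the
+# equivalent from_regex() strategy only ever produces inputs that satisfy it.
+comment_continues_filtered = text().filter(lambda t: t != '' and t[0] != '\n')
+comment_continues_regex = from_regex(r'^[^\n].*')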
+
+# Cases for tokenize_any:
+
+# We test this a bit differently from the subsequent tokenizer states. Because
+# it's a pure routing state, we can generate (lookahead, expected_state) pairs and
+# check them in one pass, rather than testing each possible outcome separately.
+# In every case, the input is irrelevant: this state never reads.
+
+def next_token_states():
+ return one_of(
+ tuples(just(''), just(tokenize_eof)),
+ tuples(just(';'), just(tokenize_comment)),
+ tuples(sampled_from('()'), just(tokenize_syntax)),
+ tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
+ tuples(just('"'), just(tokenize_atom)),
+ tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
+ )
+
+@given(next_token_states(), text())
+def test_tokenize_any(lookahead_next, input):
+ s, expected_state = lookahead_next
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_any(s, port)
+
+ assert token is None
+ assert lookahead == s
+ assert next == expected_state
+ assert port.tell() == 0
+# Since the previous test case is rigged for success, also verify that no input
+# causes tokenize_any to enter an unexpected state or to throw an exception.
+@given(text(), text())
+def test_tokenize_any_fuzz(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_any(s, port)
+
+ assert token is None
+ assert lookahead == s
+ assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
+ assert port.tell() == 0
+
+# Cases for tokenize_eof:
+
+# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
+# always returning to itself, and never generating a token.
@given(text(), text())
def test_tokenize_eof(s, input):
port = ReadablePort(input)
@@ -22,12 +76,12 @@ def test_tokenize_eof(s, input):
assert next == tokenize_eof
assert port.tell() == 0
-def comment_continues(text):
- if text == '':
- return False
- return text[0] != '\n'
+# Cases for tokenize_comment:
-@given(text(), text().filter(comment_continues))
+# * any lookahead, one or more characters beginning with a non-newline as input:
+# tokenize_comment continues the current comment, throwing away one character
+# of input, without generating a token.
+@given(text(), from_regex(r'^[^\n].*'))
def test_tokenize_comment_continues(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_comment(s, port)
@@ -37,7 +91,11 @@ def test_tokenize_comment_continues(s, input):
assert lookahead == input[0]
assert next == tokenize_comment
-@given(text(), text().filter(not_(comment_continues)))
+# * any lookahead, one or more characters beginning with a newline as input, and
+# * any lookahead, empty input:
+# tokenize_comment concludes the current comment and prepares for the next
+# token, without generating a token.
+@given(text(), just('') | from_regex(r'^\n.*'))
def test_tokenize_comment_ends(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_comment(s, port)
@@ -47,17 +105,26 @@ def test_tokenize_comment_ends(s, input):
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
+# Cases for tokenize_syntax:
+
+# * any lookahead, any input: generate the lookahead as a Syntax token and
+# transition back to tokenize_any to prepare for the next token, with one
+# character of lookahead ready to go.
@given(text(), text())
def test_tokenize_syntax(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_syntax(s, port)
- assert token == Syntax(s)
- assert isinstance(token, Syntax)
+ assert token == s
assert port.tell() == (1 if input else 0)
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
+# Cases for tokenize_whitespace:
+
+# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
+# transition back to tokenize_any to prepare for the next token, with one
+# character of lookahead ready to go, without generating a token.
@given(text(), text())
def test_tokenize_whitespace(s, input):
port = ReadablePort(input)
@@ -68,67 +135,82 @@ def test_tokenize_whitespace(s, input):
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
-def symbol_continues(text):
- if text == '':
- return False
- return text[0] not in ' \n\t();"'
+# Cases for tokenize_nonstring_atom:
-@given(text(), text().filter(symbol_continues))
-def test_tokenize_symbol_continues(s, input):
+# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
+# comment delimiter, or a string literal: accumulate one character of input
+# onto the lookahead, then transition back to tokenize_nonstring_atom to
+# process the next character of input, without generating a token.
+@given(text(), from_regex(r'^[^ \n\t();"].*'))
+def test_tokenize_nonstring_atom_continues(s, input):
port = ReadablePort(input)
- token, lookahead, next = tokenize_symbol(s, port)
+ token, lookahead, next = tokenize_nonstring_atom(s, port)
assert token is None
assert port.tell() == 1
assert lookahead == s + input[0]
- assert next == tokenize_symbol
-
-@given(text(), text().filter(not_(symbol_continues)))
-def test_tokenize_symbol_ends(s, input):
+ assert next == tokenize_nonstring_atom
+
+# * any lookahead, a non-empty input beginning with whitespace, syntax, a
+# comment delimiter, or a string literal, and
+# * any lookahead, empty input:
+# generate the accumulated lookahead as a Symbol token, then transition back
+# to tokenize_any with one character of lookahead ready to go.
+@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
+def test_tokenize_nonstring_atom_ends(s, input):
port = ReadablePort(input)
- token, lookahead, next = tokenize_symbol(s, port)
+ token, lookahead, next = tokenize_nonstring_atom(s, port)
- assert token == Symbol(s)
- assert isinstance(token, Symbol)
+ assert token == s
assert port.tell() == (1 if input else 0)
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
-def string_continues(text):
- if text == '':
- return False
- return not text[0] == '"'
+# And now, the _worst_ part of the state machine. Cases for tokenize_string:
-@given(text(), text().filter(string_continues))
+# * any lookahead, a non-empty input not beginning with a string delimiter:
+# begin a non-empty string by transitioning to the tokenize_string_character
+# state with one character of lookahead, without generating a token.
+@given(text(), from_regex(r'^[^"].*'))
def test_tokenize_string_continues(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string(s, port)
assert token is None
assert port.tell() == 1
- assert lookahead == input[0]
+ assert lookahead == s + input[0]
assert next == tokenize_string_character
-@given(text(), text().filter(not_(string_continues)))
-def test_tokenize_string_ends(s, input):
+# * any lookahead, a non-empty input beginning with a string delimiter:
+# terminate an empty string by transitioning to the tokenize_string_end state
+# with the closing delimiter appended to the lookahead, without generating a
+# token.
+@given(text(), from_regex(r'^["].*'))
+def test_tokenize_string_empty(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_end
+
+# * any lookahead, empty input: emit a tokenization error, as we've encountered
+# EOF inside of a string.
+@given(text(), just(''))
+def test_tokenize_string_eof(s, input):
try:
port = ReadablePort(input)
token, lookahead, next = tokenize_string(s, port)
- assert token is None
- assert port.tell() == 1
- assert lookahead == ''
- assert next == tokenize_string_end
+ assert False # must raise
except TokenError:
- assert input == ''
assert port.tell() == 0
-def is_escape(text):
- if text == '':
- return False
- return text[0] == '\\'
+# Cases for tokenize_string_character:
-@given(text(), text().filter(string_continues).filter(not_(is_escape)))
+# * any lookahead, any non-empty input not beginning with a string delimiter or
+# escape character: append one character of input to the lookahead, then
+# continue in the tokenize_string_character state without generating a token.
+@given(text(), from_regex(r'^[^\\"].*'))
def test_tokenize_string_character_continues(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string_character(s, port)
@@ -138,56 +220,137 @@ def test_tokenize_string_character_continues(s, input):
assert lookahead == s + input[0]
assert next == tokenize_string_character
-# Using from_regex() rather than text() because searching randomly for strings
-# that start with a specific character is far, _far_ too slow. (It often fails
-# to find any examples.) I _think_ this preserves the property that this group
-# of three tests are exhaustive, but it's not as obvious as it would be if I
-# could use text() here.
-@given(text(), from_regex(r'\\.*').filter(string_continues).filter(is_escape))
+# * any lookahead, any non-empty input which begins with an escape character:
+# append the escape character to the lookahead and transition to the
+# tokenize_escaped_string_character state to determine which escape sequence
+# we're dealing with, without emitting a token.
+@given(text(), from_regex(r'^[\\].*'))
def test_tokenize_string_character_begins_escape(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string_character(s, port)
assert token is None
assert port.tell() == 1
- assert lookahead == s
+ assert lookahead == s + input[0]
assert next == tokenize_escaped_string_character
-@given(text(), text().filter(not_(string_continues)))
+# * any lookahead, any non-empty input which begins with a string delimiter:
+# we're at the end of a string. Transition to the tokenize_string_end state
+# with the current lookahead, without generating a token.
+@given(text(), from_regex(r'^["].*'))
def test_tokenize_string_character_ends(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_end
+
+# * any lookahead, empty input: emit a tokenization error, as we've encountered
+# EOF inside of a string literal.
+@given(text(), just(''))
+def test_tokenize_string_character_eof(s, input):
try:
port = ReadablePort(input)
token, lookahead, next = tokenize_string_character(s, port)
- assert token is None
- assert port.tell() == 1
- assert lookahead == s
- assert next == tokenize_string_end
+ assert False # must raise
except TokenError:
assert input == ''
assert port.tell() == 0
-@given(text(), text())
-def test_tokenize_escaped_string_character(s, input):
+# Cases for tokenize_escaped_string_character:
+
+# * any lookahead, any non-empty input beginning with a legal string escape
+# character: de-escape the first character of the input, append the result to
+# the lookahead, then transition back to the tokenize_string_character state.
+@given(text(), from_regex(r'^["\\].*'))
+def test_tokenize_escaped_string_character_valid(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_escaped_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_character
+
+# * any lookahead, any non-empty input not beginning with a legal string escape
+# character: emit a tokenization error, as we've found an invalid string escape.
+@given(text(), from_regex(r'^[^"\\].*'))
+def test_tokenize_escaped_string_character_invalid(s, input):
try:
port = ReadablePort(input)
token, lookahead, next = tokenize_escaped_string_character(s, port)
- assert token is None
+ assert False # must raise
+ except TokenError:
assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_character
+
+# * any lookahead, empty input: emit a tokenization error, as we've found an
+# EOF inside of a string literal.
+@given(text(), just(''))
+def test_tokenize_escaped_string_character_eof(s, input):
+ try:
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_escaped_string_character(s, port)
+
+ assert False # must raise
except TokenError:
- assert input == '' or input[0] not in '\\n'
- assert port.tell() == (1 if input else 0)
+ assert port.tell() == 0
+
+# Cases for tokenize_string_end:
+# * any lookahead, any input: generate a String token from the lookahead, then
+# transition back to the tokenize_any state with one character of lookahead
+# ready to go.
@given(text(), text())
def test_tokenize_string_end(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string_end(s, port)
assert token == s
- assert isinstance(token, String)
assert port.tell() == (1 if input else 0)
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
+
+# Cases for tokenize_atom:
+
+# * lookahead containing a string delimiter, any input: found a string atom,
+# transition to the tokenize_string state without reading or generating a
+# token.
+@given(just('"'), text())
+def test_tokenize_atom_string(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_atom(s, port)
+
+ assert token is None
+ assert port.tell() == 0
+ assert lookahead == s
+ assert next == tokenize_string
+
+# * lookahead containing something other than a string delimiter, any input:
+# found a nonstring atom, transition to the tokenize_nonstring_atom state
+# without reading or generating a token.
+@given(from_regex(r'^[^"]'), text())
+def test_tokenize_atom_nonstring(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_atom(s, port)
+
+ assert token is None
+ assert port.tell() == 0
+ assert lookahead == s
+ assert next == tokenize_nonstring_atom
+
+# Cases for the tokenizer:
+
+# * any sequence of separator-token pairs: if the pairs are coalesced into a
+# single giant input, does the tokenizer recover the tokens?
+@given(spaced_token_sequences())
+def test_tokenizer(spaced_tokens):
+ input = ''.join(''.join(pair) for pair in spaced_tokens)
+ tokens = [token for (_, token) in spaced_tokens]
+
+ port = ReadablePort(input)
+
+ assert list(tokenize(port)) == tokens
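+
+# Note: spaced_token_sequences() is defined in tests/tokens.py and is not part
+# of this diff. As a rough sketch of its shape (illustrative only, not the
+# actual implementation), a strategy pairing a run of whitespace with a
+# nonstring atom would have the property the test above relies on: joining the
+# pairs yields an input from which the tokenizer can recover the atoms.
+#
+#     from hypothesis.strategies import lists
+#
+#     def whitespace_spaced_atoms():
+#         return lists(tuples(from_regex(r'\A[ \t\n]+\Z'),
+#                             from_regex(r'\A[^ \t\n();"]+\Z')))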