author    Owen Jacobson <owen@grimoire.ca>    2017-11-11 01:51:06 -0500
committer Owen Jacobson <owen@grimoire.ca>    2017-11-11 15:42:13 -0500
commit    16d94a6e50eb81de9d9d438e1cce0746928597f3 (patch)
tree      e1cb628d34c49690128722a33cc1d19d7dcffb23 /tests/test_tokenizer.py
parent    e4fb8604aa2fc572a3aeeace1c32de7339d346b5 (diff)
Introduce input ports.
Ports are the lisp abstraction of files and streams. Actinide ports additionally guarantee a peek operation. This makes ``tokenize`` (now ``read_token``) callable as a lisp function, as it takes a port and reads one token from it.

This is a substantial refactoring. As most of the state is now captured by closures, it's no longer practical to test individual states as readily. However, the top-level tokenizer tests exercise the full state space.
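As a sketch of the new entry point, the following drives ``read_token`` by hand against a string-backed port. It uses only names that appear in this commit (``string_to_input_port``, ``read_token``); the token spellings in the comment are illustrative assumptions, not output captured from this build.

    from actinide.ports import string_to_input_port
    from actinide.tokenizer import read_token

    # Build a port over an in-memory string, as the new tests do.
    port = string_to_input_port('(display "hi")')

    # read_token consumes exactly one token per call and, per the tests
    # below, returns None once no further token can be read.
    token = read_token(port)
    while token is not None:
        print(token)  # presumably '(', 'display', a string token, ')'
        token = read_token(port)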
Diffstat (limited to 'tests/test_tokenizer.py')
-rw-r--r-- tests/test_tokenizer.py | 360
1 file changed, 23 insertions(+), 337 deletions(-)
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 76a07a9..5c0ddea 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -3,354 +3,40 @@ from hypothesis.strategies import just, text, characters, from_regex, one_of, tu
import io
from actinide.tokenizer import *
+from actinide.ports import *
-from .tokens import spaced_token_sequences
+from .tokens import spaced_token_sequences, tokens, nontokens
-class ReadablePort(io.StringIO):
- def __repr__(self):
- # Slightly friendlier debugging output
- return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
-
-# Many of the following tests proceed by cases, because the underlying behaviour
-# is too complex to treat as a uniform set of properties. The cases are meant to
-# be total, and in principle could be defined as a set of filters on the
-# ``text()`` generator that, combined, exhaust the possible outcomes of that
-# generator.
-#
-# Implementing the tests that way causes Hypothesis to generate a significant
-# number of examples that it then throws away without verifying, because
-# Hypothesis has no insight into filters to use when generating examples.
-# Instead, this test suite specifies generators per-case.
-
-# Cases for tokenize_any:
-
-# We test this a bit differently from the subsequent tokenizer states. Because
-# it's a pure routing state, we can generate lookahead, expected_state pairs and
-# check them in one pass, rather than testing each possible outcome separately.
-# In every case, the input is irrelevant: this state never reads.
-
-def next_token_states():
- return one_of(
- tuples(just(''), just(tokenize_eof)),
- tuples(just(';'), just(tokenize_comment)),
- tuples(sampled_from('()'), just(tokenize_syntax)),
- tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
- tuples(just('"'), just(tokenize_atom)),
- tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
- )
-
-@given(next_token_states(), text())
-def test_tokenize_any(lookahead_next, input):
- s, expected_state = lookahead_next
- port = ReadablePort(input)
- token, lookahead, next = tokenize_any(s, port)
-
- assert token is None
- assert lookahead == s
- assert next == expected_state
- assert port.tell() == 0
-
-# Since the previous test case is rigged for success, also verify that no input
-# causes tokenize_any to enter an unexpected state or to throw an exception.
-@given(text(), text())
-def test_tokenize_any_fuzz(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_any(s, port)
-
- assert token is None
- assert lookahead == s
- assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
- assert port.tell() == 0
-
-# Cases for tokenize_eof:
-
-# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
-# always returning to itself, and never generating a token.
-@given(text(), text())
-def test_tokenize_eof(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_eof(s, port)
-
- assert token is None
- assert lookahead == s
- assert next == tokenize_eof
- assert port.tell() == 0
-
-# Cases for tokenize_comment:
-
-# * any lookahead, one or more characters beginning with a non-newline as input:
-# tokenize_comment continues the current comment, throwing away one character
-# of input, without generating a token.
-@given(text(), from_regex(r'^[^\n].*'))
-def test_tokenize_comment_continues(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_comment(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == input[0]
- assert next == tokenize_comment
-
-# * any lookahead, one or more characters beginning with a newline as input, and
-# * any lookahead, empty input:
-# tokenize_comment concludes the current comment and prepares for the next
-# token, without generating a token.
-@given(text(), just('') | from_regex(r'^\n.*'))
-def test_tokenize_comment_ends(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_comment(s, port)
-
- assert token is None
- assert port.tell() == (1 if input else 0)
- assert lookahead == (input[0] if input else '')
- assert next == tokenize_any
-
-# Cases for tokenize_syntax:
-
-# * any lookahead, any input: generate the lookahead as a Syntax token and
-# transition back to tokenize_any to prepare for the next token, with one
-# character of lookahead ready to go.
-@given(text(), text())
-def test_tokenize_syntax(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_syntax(s, port)
-
- assert token == s
- assert port.tell() == (1 if input else 0)
- assert lookahead == (input[0] if input else '')
- assert next == tokenize_any
-
-# Cases for tokenize_whitespace:
-
-# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
-# transition back to tokenize_any to prepare for the next token, with one
-# character of lookahead ready to go, without generating a token.
-@given(text(), text())
-def test_tokenize_whitespace(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_whitespace(s, port)
-
- assert token is None
- assert port.tell() == (1 if input else 0)
- assert lookahead == (input[0] if input else '')
- assert next == tokenize_any
-
-# Cases for tokenize_nonstring_atom:
-
-# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
-# comment delimiter, or a string literal: accumulate one character of input
-# onto the lookahead, then transition back to tokenize_nonstring_atom to
-# process the next character of input, without generating a token.
-@given(text(), from_regex(r'^[^ \n\t();"].*'))
-def test_tokenize_nonstring_atom_continues(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_nonstring_atom(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_nonstring_atom
-
-# * any lookahead, a non-empty input beginning with whitespace, syntax, a
-# comment delimiter, or a string literal, and
-# * any lookahead, empty input:
-# generate the accumulated input as a Symbol token, then transition back to
-# tokenize_any with one character of lookahead ready to go.
-@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
-def test_tokenize_nonstring_atom_ends(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_nonstring_atom(s, port)
-
- assert token == s
- assert port.tell() == (1 if input else 0)
- assert lookahead == (input[0] if input else '')
- assert next == tokenize_any
-
-# And now, the _worst_ part of the state machine. Cases for tokenize_string:
-
-# * any lookahead, a non-empty input not beginning with a string delimiter:
-# begin a non-empty string by transitioning to the tokenize_string_character
-# state with one character of lookahead, without generating a token.
-@given(text(), from_regex(r'^[^"].*'))
-def test_tokenize_string_continues(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_character
-
-# * any lookahead, a non-empty input beginning with a string delimiter:
-# terminate an empty string by appending the delimiter to the lookahead and
-# transitioning to the tokenize_string_end state, without generating a token.
-@given(text(), from_regex(r'^["].*'))
-def test_tokenize_string_empty(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_end
-
-# * any lookahead, empty input: emit a tokenization error, as we've encountered
-# EOF inside of a string.
-@given(text(), just(''))
-def test_tokenize_string_eof(s, input):
- port = ReadablePort(input)
- try:
- token, lookahead, next = tokenize_string(s, port)
-
- assert False # must raise
- except TokenError:
- assert port.tell() == 0
-
-# Cases for tokenize_string_character:
-
-# * any lookahead, any non-empty input not beginning with a string delimiter or
-# escape character: append one character of input to the lookahead, then
-# continue in the tokenize_string_character state without generating a token.
-@given(text(), from_regex(r'^[^\\"].*'))
-def test_tokenize_string_character_continues(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string_character(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_character
-
-# * any lookahead, any non-empty input which begins with an escape character:
-# leave the lookahead unchanged, but transition to the
-# tokenize_escaped_string_character state to determine which escape character
-# we're dealing with, without emitting a token.
-@given(text(), from_regex(r'^[\\].*'))
-def test_tokenize_string_character_begins_escape(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string_character(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_escaped_string_character
-
-# * any lookahead, any non-empty input which begins with a string delimiter:
-# we're at the end of a string. Transition to the tokenize_string_end state
-# with the current lookahead, without generating a token.
-@given(text(), from_regex(r'^["].*'))
-def test_tokenize_string_character_ends(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string_character(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_end
-
-# * any lookahead, empty input: emit a tokenization error, as we've encountered
-# EOF inside of a string literal.
-@given(text(), just(''))
-def test_tokenize_string_character_eof(s, input):
- port = ReadablePort(input)
- try:
- token, lookahead, next = tokenize_string_character(s, port)
-
- assert False # must raise
- except TokenError:
- assert input == ''
- assert port.tell() == 0
-
-# Cases for tokenize_escaped_string_character:
-
-# * any lookahead, any non-empty input beginning with a legal string escape
-# character: de-escape the first character of the input, append the result to
-# the lookahead, then transition back to the tokenize_string_character state.
-@given(text(), from_regex(r'^["\\].*'))
-def test_tokenize_escaped_string_character_valid(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_escaped_string_character(s, port)
-
- assert token is None
- assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_character
-
-# * any lookahead, any non-empty input not beginning with a legal string escape
-# character: emit a tokenization error, as we've found an invalid string escape.
-@given(text(), from_regex(r'^[^"\\].*'))
-def test_tokenize_escaped_string_character_invalid(s, input):
- port = ReadablePort(input)
- try:
- token, lookahead, next = tokenize_escaped_string_character(s, port)
-
- assert False # must raise
- except TokenError:
- assert port.tell() == 1
-
-# * any lookahead, empty input: emit a tokenization error, as we've found an
-# EOF inside of a string literal.
-@given(text(), just(''))
-def test_tokenize_escaped_string_character_eof(s, input):
- port = ReadablePort(input)
- try:
- token, lookahead, next = tokenize_escaped_string_character(s, port)
-
- assert False # must raise
- except TokenError:
- assert port.tell() == 0
-
-# Cases for tokenize_string_end:
-
-# * any lookahead, any input: generate a String token from the lookahead, then
-# transition back to the tokenize_any state with one character of lookahead
-# ready to go.
-@given(text(), text())
-def test_tokenize_string_end(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_string_end(s, port)
-
- assert token == s
- assert port.tell() == (1 if input else 0)
- assert lookahead == (input[0] if input else '')
- assert next == tokenize_any
-
-# Cases for tokenize_atom:
-
-# * lookahead containing a string delimiter, any input: found a string atom,
-# transition to the tokenize_string state without reading or generating a
-# token.
-@given(just('"'), text())
-def test_tokenize_atom_string(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_atom(s, port)
+# Cases for the tokenizer:
- assert token is None
- assert port.tell() == 0
- assert lookahead == s
- assert next == tokenize_string
+# * any single token: reads back that token.
+@given(tokens())
+def test_tokenizer_single_token(input):
+ port = string_to_input_port(input)
-# * lookahead containing something other than a string delimiter, any input:
-# found a nonstring atom, transition to the tokenize_nonstring_atom state
-# without reading or generating a token.
-@given(from_regex(r'^[^"]'), text())
-def test_tokenize_atom_nonstring(s, input):
- port = ReadablePort(input)
- token, lookahead, next = tokenize_atom(s, port)
+ assert read_token(port) == input
- assert token is None
- assert port.tell() == 0
- assert lookahead == s
- assert next == tokenize_nonstring_atom
+# * any input guaranteed not to contain a token: reads back None, consuming the
+# whole input in the process.
+@given(nontokens())
+def test_tokenizer_no_token(input):
+ port = string_to_input_port(input)
-# Cases for the tokenizer:
+ assert read_token(port) is None
# * any sequence of separator-token pairs: if the pairs are coalesced into a
# single giant input, does the tokenizer recover the tokens?
@given(spaced_token_sequences())
-def test_tokenizer(spaced_tokens):
+def test_tokenizer_spaced_sequence(spaced_tokens):
input = ''.join(''.join(pair) for pair in spaced_tokens)
tokens = [token for (_, token) in spaced_tokens]
- port = ReadablePort(input)
+ port = string_to_input_port(input)
+ def iterate_read_token(port):
+ token = read_token(port)
+ while token is not None:
+ yield token
+ token = read_token(port)
+
+ assert list(iterate_read_token(port)) == tokens
- assert list(tokenize(port)) == tokens
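For reference, every deleted per-state test above exercises one protocol: a state is a function that takes the current lookahead and a port and returns a ``(token, lookahead, next_state)`` triple, with ``tokenize_eof`` as the terminal trap state. A driver loop in the spirit of the removed ``tokenize`` generator might look like this sketch; the priming read and the loop itself are reconstructions from the assertions above, not this commit's code.

    def tokenize(port):
        # Prime one character of lookahead; tokenize_any routes on the
        # lookahead alone and never reads. (The priming read is an
        # assumption; the io.StringIO-backed test ports support read(1).)
        lookahead, state = port.read(1), tokenize_any
        while state is not tokenize_eof:
            # Each state returns (token, lookahead, next_state); token
            # is None unless this step completed a token.
            token, lookahead, state = state(lookahead, port)
            if token is not None:
                yield token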