| field | value | date |
|---|---|---|
| author | Owen Jacobson <owen@grimoire.ca> | 2017-11-11 01:51:06 -0500 |
| committer | Owen Jacobson <owen@grimoire.ca> | 2017-11-11 15:42:13 -0500 |
| commit | 16d94a6e50eb81de9d9d438e1cce0746928597f3 (patch) | |
| tree | e1cb628d34c49690128722a33cc1d19d7dcffb23 /tests | |
| parent | e4fb8604aa2fc572a3aeeace1c32de7339d346b5 (diff) | |
Introduce input ports.
Ports are the lisp abstraction of files and streams. Actinide ports additionally guarantee a peek operation.
This makes ``tokenize`` (now ``read_token``) callable as a lisp function, as it
takes a port and reads one token from it. This is a substantial refactoring.
As most of the state is now captured by closures, it's no longer practical to
test individual tokenizer states in isolation. However, the top-level tokenizer
tests exercise the full state space.
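
As a rough illustration of the API these tests pin down, the sketch below exercises the port primitives and the tokenizer entry point. It is an inferred example, not part of the change: it assumes ``string_to_input_port``, ``read``, ``peek``, and ``read_fully`` live in ``actinide.ports`` and ``read_token`` in ``actinide.tokenizer``, as the test imports suggest, and that tokens come back as plain strings.

```python
# Hypothetical usage sketch inferred from the tests in this commit; module
# layout and the token representation (plain strings) are assumptions.
from actinide.ports import string_to_input_port, read, peek, read_fully
from actinide.tokenizer import read_token

# Port primitives: peek never consumes input, read does, read_fully drains
# whatever remains.
port = string_to_input_port('(+ 1 2)')
assert peek(port, 1) == '('
assert read(port, 1) == '('
assert read_fully(port) == '+ 1 2)'

# The tokenizer entry point reads one token per call from a port.
tok_port = string_to_input_port('(+ 1 2) ; comment')
assert read_token(tok_port) == '('
assert read_token(tok_port) == '+'
```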
Diffstat (limited to 'tests')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | tests/test_ports.py | 29 |
| -rw-r--r-- | tests/test_tokenizer.py | 360 |
| -rw-r--r-- | tests/tokens.py | 4 |

3 files changed, 56 insertions, 337 deletions
diff --git a/tests/test_ports.py b/tests/test_ports.py
new file mode 100644
index 0000000..c2d1e06
--- /dev/null
+++ b/tests/test_ports.py
@@ -0,0 +1,29 @@
+from hypothesis import given
+from hypothesis.strategies import integers, text
+
+from actinide.ports import *
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_read(input, n):
+    port = string_to_input_port(input)
+    output = read(port, n)
+
+    assert input.startswith(output)
+    assert (len(output) == 0 and len(input) == 0) != (0 < len(output) <= n)
+    assert output + read_fully(port) == input
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_peek(input, n):
+    port = string_to_input_port(input)
+    output = peek(port, n)
+
+    assert input.startswith(output)
+    assert (len(output) == 0 and len(input) == 0) != (0 < len(output) <= n)
+    assert read_fully(port) == input
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_read_fully(input, n):
+    port = string_to_input_port(input)
+    output = read_fully(port)
+
+    assert output == input
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 76a07a9..5c0ddea 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -3,354 +3,40 @@ from hypothesis.strategies import just, text, characters, from_regex, one_of, tu
 import io
 from actinide.tokenizer import *
+from actinide.ports import *
-from .tokens import spaced_token_sequences
+from .tokens import spaced_token_sequences, tokens, nontokens
-class ReadablePort(io.StringIO):
-    def __repr__(self):
-        # Slightly friendlier debugging output
-        return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
-
-# Many of the following tests proceed by cases, because the underlying behaviour
-# is too complex to treat as a uniform set of properties. The cases are meant to
-# be total, and in principle could be defined as a set of filters on the
-# ``text()`` generator that , combined, exhaust the possible outcomes of that
-# generator.
-#
-# Implementing the tests that way causes Hypothesis to generate a significant
-# number of examples that it then throws away without verifying, because
-# Hypothesis has no insight into filters to use when generating examples.
-# Instead, this test suite specifies generators per-case.
-
-# Cases for tokenize_any:
-
-# We test this a bit differently from the subsequent tokenizer states. Because
-# it's a pure routing state, we can generate lookahead, expected_state pairs and
-# check them in one pass, rather than testing each possible outcome separately.
-# In every case, the input is irrelevant: this state never reads.
-
-def next_token_states():
-    return one_of(
-        tuples(just(''), just(tokenize_eof)),
-        tuples(just(';'), just(tokenize_comment)),
-        tuples(sampled_from('()'), just(tokenize_syntax)),
-        tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
-        tuples(just('"'), just(tokenize_atom)),
-        tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
-    )
-
-@given(next_token_states(), text())
-def test_tokenize_any(lookahead_next, input):
-    s, expected_state = lookahead_next
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_any(s, input)
-
-    assert token is None
-    assert lookahead == s
-    assert next == expected_state
-    assert port.tell() == 0
-
-# Since the previous test case is rigged for success, also verify that no input
-# causes tokenize_any to enter an unexpected state or to throw an exception.
-@given(text(), text())
-def test_tokenize_any_fuzz(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_any(s, input)
-
-    assert token is None
-    assert lookahead == s
-    assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
-    assert port.tell() == 0
-
-# Cases for tokenize_eof:
-
-# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
-# always returning to itself, and never generating a token.
-@given(text(), text())
-def test_tokenize_eof(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_eof(s, port)
-
-    assert token is None
-    assert lookahead == s
-    assert next == tokenize_eof
-    assert port.tell() == 0
-
-# Cases for tokenize_comment:
-
-# * any lookahead, one or more characters beginning with a non-newline as input:
-# tokenize_comment continues the current comment, throwing away one character
-# of input, without generating a token.
-@given(text(), from_regex(r'^[^\n].*'))
-def test_tokenize_comment_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_comment(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == input[0]
-    assert next == tokenize_comment
-
-# * any lookahead, one or more characters beginning with a newline as input, and
-# * any lookahead, empty input:
-# tokenize_comment concludes the current comment and prepares for the next
-# token, without generating a token.
-@given(text(), just('') | from_regex(r'^\n.*'))
-def test_tokenize_comment_ends(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_comment(s, port)
-
-    assert token is None
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for tokenize_syntax:
-
-# * any lookahead, any input: generate the lookahead as a Syntax token and
-# transition back to tokenize_any to prepare for the next token, with one
-# character of lookahead ready to go.
-@given(text(), text())
-def test_tokenize_syntax(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_syntax(s, port)
-
-    assert token == s
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for test_tokenize_whitespace:
-
-# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
-# transition back to tokenize_any to prepare for the next token, with one
-# character of lookahead ready to go, without generating a token.
-@given(text(), text())
-def test_tokenize_whitespace(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_whitespace(s, port)
-
-    assert token is None
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for tokenize_nonstring_atom:
-
-# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
-# comment delimiter, or a string literal: accumulate one character of input
-# onto the lookahead, then transition back to tokenize_symbol to process the
-# next character of input, without generating a token.
-@given(text(), from_regex(r'^[^ \n\t();"].*'))
-def test_tokenize_nonstring_atom_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_nonstring_atom(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_nonstring_atom
-
-# * any lookahead, a non-empty input beginning with whitespace, syntax, a
-# comment delimiter, or a string literal, and
-# * any lookahead, empty input:
-# generate the accumulated input as a Symbol token, then transition back to tokenize_any with one character of lookahead ready to go.
-@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
-def test_tokenize_tokenize_nonstring_atom_ends(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_nonstring_atom(s, port)
-
-    assert token == s
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# And now, the _worst_ part of the state machine. Cases for tokenize_string:
-
-# * any lookahead, a non-empty input not beginning with a string delimiter:
-# begin a non-empty string by transitioning to the tokenize_string_character
-# state with one character of lookahead, without generating a token.
-@given(text(), from_regex(r'^[^"].*'))
-def test_tokenize_string_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_character
-
-# * any lookahad, a non-empty input beginning with a string delimiter: terminate
-# an empty string by transitioning to the tokenize_string_end state with an
-# *empty* lookahead, without generating a token.
-@given(text(), from_regex(r'^["].*'))
-def test_tokenize_string_empty(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_end
-
-# * any lookahead, empty input: emit a tokenization error, as we've encountered
-# EOF inside of a string.
-@given(text(), just(''))
-def test_tokenize_string_eof(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_string(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert port.tell() == 0
-
-# Cases for tokenize_string_character:
-
-# * any lookahead, any non-empty input not beginning with a string delimiter or
-# escape character: append one character of input to the lookahead, then
-# continue in the tokenize_string_character state without generating a token.
-@given(text(), from_regex(r'^[^\\"].*'))
-def test_tokenize_string_character_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_character
-
-# * any lookahead, any non-empty input which begins with an escape character:
-# leave the lookahead unchanged, but transition to the
-# tokenize_escaped_string_character state to determine which escape character
-# we're dealing with, without emitting a token.
-@given(text(), from_regex(r'^[\\].*'))
-def test_tokenize_string_character_begins_escape(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_escaped_string_character
-
-# * any lookahead, any non-empty input which begins with a string delimiter:
-# we're at the end of a string. Transition to the tokenize_string_end state
-# with the current lookahead, without generating a token.
-@given(text(), from_regex(r'^["].*'))
-def test_tokenize_string_character_ends(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_end
-
-# * any lookahead, empty input: emit a tokenization error, as we've encountered
-# EOF inside of a string literal.
-@given(text(), just(''))
-def test_tokenize_string_character_eof(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_string_character(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert input == ''
-        assert port.tell() == 0
-
-# Cases for tokenize_escaped_string:
-
-# * any lookahead, any non-empty input beginning with a legal string escaped
-# character: de-escape the first character of the input, append the result to
-# the lookahead, then transition back to the tokenize_string_character state.
-@given(text(), from_regex(r'^["\\].*'))
-def test_tokenize_escaped_string_character_valid(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_escaped_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_character
-
-# * any lookahead, any non-empty input not beginning with a legal string escaped
-# character: emit a tokenization error, we've found an invalid string escape.
-@given(text(), from_regex(r'^[^"\\].*'))
-def test_tokenize_escaped_string_character_invalid(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_escaped_string_character(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert port.tell() == 1
-
-# * any lookahead, empty input: emit a tokenization error, we've found an EOF
-# inside of a string literal.
-@given(text(), just(''))
-def test_tokenize_escaped_string_character_eof(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_escaped_string_character(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert port.tell() == 0
-
-# Cases for tokenize_string_end:
-
-# * any lookahead, any input: generate a String token from the lookahead, then
-# transition back to the tokenize_any state with one character of lookahead
-# ready to go.
-@given(text(), text())
-def test_tokenize_string_end(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_end(s, port)
-
-    assert token == s
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for tokenize_atom:
-
-# * lookahead containing a string delimiter, any input: found a string atom,
-# transition to the tokenize_string state without reading or generating a
-# token.
-@given(just('"'), text())
-def test_tokenize_atom_string(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_atom(s, port)
+# Cases for the tokenizer:
-    assert token is None
-    assert port.tell() == 0
-    assert lookahead == s
-    assert next == tokenize_string
+# * any single token: reads back that token.
+@given(tokens())
+def test_tokenizer_single_token(input):
+    port = string_to_input_port(input)
-# * lookahead containing something other than a string delimiter, any input:
-# found a nonstring atom, transition to the tokenize_nonstring_atom state
-# without reading or generating a token.
-@given(from_regex(r'^[^"]'), text())
-def test_tokenize_atom_nonstring(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_atom(s, port)
+    assert read_token(port) == input
-    assert token is None
-    assert port.tell() == 0
-    assert lookahead == s
-    assert next == tokenize_nonstring_atom
+# * any input guaranteed not to contain a token: reads back None, consuming the
+# whole input in the process.
+@given(nontokens())
+def test_tokenizer_no_token(input):
+    port = string_to_input_port(input)
-# Cases for the tokenizer:
+    assert read_token(port) == None
 # * any sequence of separator-token pairs: if the pairs are coalesced into a
 # single giant input, does the tokenizer recover the tokens?
 @given(spaced_token_sequences())
-def test_tokenizer(spaced_tokens):
+def test_tokenizer_spaced_sequence(spaced_tokens):
     input = ''.join(''.join(pair) for pair in spaced_tokens)
     tokens = [token for (_, token) in spaced_tokens]
-    port = ReadablePort(input)
+    port = string_to_input_port(input)
+    def iterate_read_token(port):
+        token = read_token(port)
+        while token is not None:
+            yield token
+            token = read_token(port)
+
+    assert list(iterate_read_token(port)) == tokens
-    assert list(tokenize(port)) == tokens
diff --git a/tests/tokens.py b/tests/tokens.py
index 0027fb2..3eb58b8 100644
--- a/tests/tokens.py
+++ b/tests/tokens.py
@@ -48,6 +48,10 @@ def whitespace_characters():
 def tokens():
     return one_of(symbols(), strings(), open_parens(), close_parens())
 
+# Generates a string which may not be empty, but which does not contain a token.
+def nontokens():
+    return one_of(whitespace(), comments(), just(''))
+
 # Generates at least one character of whitespace.
 def whitespace():
     return text(whitespace_characters(), min_size=1)
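
Since ``read_token`` signals end of input by returning ``None`` rather than raising, draining a port is a small loop, as in the ``iterate_read_token`` helper in the new test above. A standalone version of that pattern, under the same module-layout assumptions as the earlier sketch:

```python
# Same assumptions as the earlier sketch: string_to_input_port in
# actinide.ports, read_token in actinide.tokenizer, tokens as plain strings.
from actinide.ports import string_to_input_port
from actinide.tokenizer import read_token

def iter_tokens(port):
    # Yield tokens until read_token reports end of input with None.
    token = read_token(port)
    while token is not None:
        yield token
        token = read_token(port)

print(list(iter_tokens(string_to_input_port('(define x 42) ; answer'))))
# e.g. ['(', 'define', 'x', '42', ')']
```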
