From e4fb8604aa2fc572a3aeeace1c32de7339d346b5 Mon Sep 17 00:00:00 2001
From: Owen Jacobson
Date: Fri, 10 Nov 2017 01:25:30 -0500
Subject: Testing fixes.

* Add a top-level test that roundtrips sequences of tokens. (This found a
  real bug. Thanks, Hypothesis!)

* Remove type conversion from the tokenizer. This simplifies the code, and
  makes testing considerably easier.

* Fix some bugs in string literal parsing (again: Thanks, Hypothesis!)

Document the test cases, and the case-by-case strategy, better. This also
involved prying apart some tests that cover multiple cases.

Stop treating empty strings as if they were EOFs. (Thanks, Hypothesis!)

fixup! Stop treating empty strings as if they were EOFs. (Thanks, Hypothesis!)

Remove type conversion from the tokenizer.

It turns out that this made the tokenizer harder to test, because it was
doing too many things. The tokenizer now _only_ divides the input port into
tokens, without parsing or converting those tokens.

Fix up tests.
---
 tests/__init__.py       |   0
 tests/test_tokenizer.py | 291 +++++++++++++++++++++++++++++++++++++-----------
 tests/tokens.py         |  90 +++++++++++++++
 3 files changed, 317 insertions(+), 64 deletions(-)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/tokens.py

diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index a300eb2..76a07a9 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,17 +1,71 @@
-from hypothesis import given, settings, HealthCheck
-from hypothesis.strategies import text, from_regex
+from hypothesis import given, settings, HealthCheck, event
+from hypothesis.strategies import just, text, characters, from_regex, one_of, tuples, sampled_from
 
 import io
 
 from actinide.tokenizer import *
 
+from .tokens import spaced_token_sequences
+
 class ReadablePort(io.StringIO):
     def __repr__(self):
         # Slightly friendlier debugging output
         return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
 
-def not_(f):
-    return lambda *args, **kwargs: not f(*args, **kwargs)
+# Many of the following tests proceed by cases, because the underlying behaviour
+# is too complex to treat as a uniform set of properties. The cases are meant to
+# be total, and in principle could be defined as a set of filters on the
+# ``text()`` generator that, combined, exhaust the possible outcomes of that
+# generator.
+#
+# Implementing the tests that way causes Hypothesis to generate a significant
+# number of examples that it then throws away without verifying, because
+# Hypothesis has no insight into filters to use when generating examples.
+# Instead, this test suite specifies generators per-case.
+
+# Cases for tokenize_any:
+
+# We test this a bit differently from the subsequent tokenizer states. Because
+# it's a pure routing state, we can generate lookahead, expected_state pairs and
+# check them in one pass, rather than testing each possible outcome separately.
+# In every case, the input is irrelevant: this state never reads.
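+#
+# For orientation: every state function exercised below follows the same
+# calling convention,
+#
+#     token, lookahead, next_state = state(lookahead, port)
+#
+# where ``token`` is a completed token (or None), ``lookahead`` is text carried
+# into the next state, and ``next_state`` is the state to run next. A driver
+# loop would look roughly like this (a sketch for orientation only, not
+# necessarily the real tokenize()):
+#
+#     state, lookahead = tokenize_any, port.read(1)
+#     while state is not tokenize_eof:
+#         token, lookahead, state = state(lookahead, port)
+#         if token is not None:
+#             yield token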
+
+def next_token_states():
+    return one_of(
+        tuples(just(''), just(tokenize_eof)),
+        tuples(just(';'), just(tokenize_comment)),
+        tuples(sampled_from('()'), just(tokenize_syntax)),
+        tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
+        tuples(just('"'), just(tokenize_atom)),
+        tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
+    )
+
+@given(next_token_states(), text())
+def test_tokenize_any(lookahead_next, input):
+    s, expected_state = lookahead_next
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_any(s, port)
+
+    assert token is None
+    assert lookahead == s
+    assert next == expected_state
+    assert port.tell() == 0
+
+# Since the previous test case is rigged for success, also verify that no input
+# causes tokenize_any to enter an unexpected state or to throw an exception.
+@given(text(), text())
+def test_tokenize_any_fuzz(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_any(s, port)
+
+    assert token is None
+    assert lookahead == s
+    assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
+    assert port.tell() == 0
+
+# Cases for tokenize_eof:
+
+# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
+#   always returning to itself, and never generating a token.
 @given(text(), text())
 def test_tokenize_eof(s, input):
     port = ReadablePort(input)
@@ -22,12 +76,12 @@ def test_tokenize_eof(s, input):
     assert next == tokenize_eof
     assert port.tell() == 0
 
-def comment_continues(text):
-    if text == '':
-        return False
-    return text[0] != '\n'
+# Cases for tokenize_comment:
 
-@given(text(), text().filter(comment_continues))
+# * any lookahead, one or more characters beginning with a non-newline as input:
+#   tokenize_comment continues the current comment, throwing away one character
+#   of input, without generating a token.
+@given(text(), from_regex(r'^[^\n].*'))
 def test_tokenize_comment_continues(s, input):
     port = ReadablePort(input)
     token, lookahead, next = tokenize_comment(s, port)
@@ -37,7 +91,11 @@ def test_tokenize_comment_continues(s, input):
     assert lookahead == input[0]
     assert next == tokenize_comment
 
-@given(text(), text().filter(not_(comment_continues)))
+# * any lookahead, one or more characters beginning with a newline as input, and
+# * any lookahead, empty input:
+#   tokenize_comment concludes the current comment and prepares for the next
+#   token, without generating a token.
+@given(text(), just('') | from_regex(r'^\n.*'))
 def test_tokenize_comment_ends(s, input):
     port = ReadablePort(input)
     token, lookahead, next = tokenize_comment(s, port)
@@ -47,17 +105,26 @@ def test_tokenize_comment_ends(s, input):
     assert lookahead == (input[0] if input else '')
     assert next == tokenize_any
 
+# Cases for tokenize_syntax:
+
+# * any lookahead, any input: generate the lookahead as a syntax token (a bare
+#   string, now that the tokenizer no longer converts tokens) and transition
+#   back to tokenize_any to prepare for the next token, with one character of
+#   lookahead ready to go.
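+#   For example, with lookahead '(' and input 'foo', the state should emit the
+#   token '(' and leave 'f' behind as the lookahead for tokenize_any.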
 @given(text(), text())
 def test_tokenize_syntax(s, input):
     port = ReadablePort(input)
     token, lookahead, next = tokenize_syntax(s, port)
 
-    assert token == Syntax(s)
-    assert isinstance(token, Syntax)
+    assert token == s
     assert port.tell() == (1 if input else 0)
     assert lookahead == (input[0] if input else '')
     assert next == tokenize_any
 
+# Cases for tokenize_whitespace:
+
+# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
+#   transition back to tokenize_any to prepare for the next token, with one
+#   character of lookahead ready to go, without generating a token.
 @given(text(), text())
 def test_tokenize_whitespace(s, input):
     port = ReadablePort(input)
@@ -68,67 +135,82 @@ def test_tokenize_whitespace(s, input):
     assert lookahead == (input[0] if input else '')
     assert next == tokenize_any
 
-def symbol_continues(text):
-    if text == '':
-        return False
-    return text[0] not in ' \n\t();"'
+# Cases for tokenize_nonstring_atom:
 
-@given(text(), text().filter(symbol_continues))
-def test_tokenize_symbol_continues(s, input):
+# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
+#   comment delimiter, or a string literal: accumulate one character of input
+#   onto the lookahead, then transition back to tokenize_nonstring_atom to
+#   process the next character of input, without generating a token.
+@given(text(), from_regex(r'^[^ \n\t();"].*'))
+def test_tokenize_nonstring_atom_continues(s, input):
     port = ReadablePort(input)
-    token, lookahead, next = tokenize_symbol(s, port)
+    token, lookahead, next = tokenize_nonstring_atom(s, port)
 
     assert token is None
     assert port.tell() == 1
     assert lookahead == s + input[0]
-    assert next == tokenize_symbol
-
-@given(text(), text().filter(not_(symbol_continues)))
-def test_tokenize_symbol_ends(s, input):
+    assert next == tokenize_nonstring_atom
+
+# * any lookahead, a non-empty input beginning with whitespace, syntax, a
+#   comment delimiter, or a string literal, and
+# * any lookahead, empty input:
+#   generate the accumulated lookahead as a symbol token, then transition back
+#   to tokenize_any with one character of lookahead ready to go.
+@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
+def test_tokenize_nonstring_atom_ends(s, input):
     port = ReadablePort(input)
-    token, lookahead, next = tokenize_symbol(s, port)
+    token, lookahead, next = tokenize_nonstring_atom(s, port)
 
-    assert token == Symbol(s)
-    assert isinstance(token, Symbol)
+    assert token == s
     assert port.tell() == (1 if input else 0)
     assert lookahead == (input[0] if input else '')
     assert next == tokenize_any
 
-def string_continues(text):
-    if text == '':
-        return False
-    return not text[0] == '"'
+# And now, the _worst_ part of the state machine. Cases for tokenize_string:
 
-@given(text(), text().filter(string_continues))
+# * any lookahead, a non-empty input not beginning with a string delimiter:
+#   begin a non-empty string by appending one character of input to the
+#   lookahead and transitioning to the tokenize_string_character state, without
+#   generating a token.
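+#   For example, with lookahead '"' (as handed over by tokenize_atom) and input
+#   'hi"', the lookahead becomes '"h' and tokenize_string_character takes over.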
+@given(text(), from_regex(r'^[^"].*')) def test_tokenize_string_continues(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string(s, port) assert token is None assert port.tell() == 1 - assert lookahead == input[0] + assert lookahead == s + input[0] assert next == tokenize_string_character -@given(text(), text().filter(not_(string_continues))) -def test_tokenize_string_ends(s, input): +# * any lookahad, a non-empty input beginning with a string delimiter: terminate +# an empty string by transitioning to the tokenize_string_end state with an +# *empty* lookahead, without generating a token. +@given(text(), from_regex(r'^["].*')) +def test_tokenize_string_empty(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_string(s, port) + + assert token is None + assert port.tell() == 1 + assert lookahead == s + input[0] + assert next == tokenize_string_end + +# * any lookahead, empty input: emit a tokenization error, as we've encountered +# EOF inside of a string. +@given(text(), just('')) +def test_tokenize_string_eof(s, input): try: port = ReadablePort(input) token, lookahead, next = tokenize_string(s, port) - assert token is None - assert port.tell() == 1 - assert lookahead == '' - assert next == tokenize_string_end + assert False # must raise except TokenError: - assert input == '' assert port.tell() == 0 -def is_escape(text): - if text == '': - return False - return text[0] == '\\' +# Cases for tokenize_string_character: -@given(text(), text().filter(string_continues).filter(not_(is_escape))) +# * any lookahead, any non-empty input not beginning with a string delimiter or +# escape character: append one character of input to the lookahead, then +# continue in the tokenize_string_character state without generating a token. +@given(text(), from_regex(r'^[^\\"].*')) def test_tokenize_string_character_continues(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string_character(s, port) @@ -138,56 +220,137 @@ def test_tokenize_string_character_continues(s, input): assert lookahead == s + input[0] assert next == tokenize_string_character -# Using from_regex() rather than text() because searching randomly for strings -# that start with a specific character is far, _far_ too slow. (It often fails -# to find any examples.) I _think_ this preserves the property that this group -# of three tests are exhaustive, but it's not as obvious as it would be if I -# could use text() here. -@given(text(), from_regex(r'\\.*').filter(string_continues).filter(is_escape)) +# * any lookahead, any non-empty input which begins with an escape character: +# leave the lookahead unchanged, but transition to the +# tokenize_escaped_string_character state to determine which escape character +# we're dealing with, without emitting a token. +@given(text(), from_regex(r'^[\\].*')) def test_tokenize_string_character_begins_escape(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string_character(s, port) assert token is None assert port.tell() == 1 - assert lookahead == s + assert lookahead == s + input[0] assert next == tokenize_escaped_string_character -@given(text(), text().filter(not_(string_continues))) +# * any lookahead, any non-empty input which begins with a string delimiter: +# we're at the end of a string. Transition to the tokenize_string_end state +# with the current lookahead, without generating a token. 
+@given(text(), from_regex(r'^["].*')) def test_tokenize_string_character_ends(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_string_character(s, port) + + assert token is None + assert port.tell() == 1 + assert lookahead == s + input[0] + assert next == tokenize_string_end + +# * any lookahead, empty input: emit a tokenization error, as we've encountered +# EOF inside of a string literal. +@given(text(), just('')) +def test_tokenize_string_character_eof(s, input): try: port = ReadablePort(input) token, lookahead, next = tokenize_string_character(s, port) - assert token is None - assert port.tell() == 1 - assert lookahead == s - assert next == tokenize_string_end + assert False # must raise except TokenError: assert input == '' assert port.tell() == 0 -@given(text(), text()) -def test_tokenize_escaped_string_character(s, input): +# Cases for tokenize_escaped_string: + +# * any lookahead, any non-empty input beginning with a legal string escaped +# character: de-escape the first character of the input, append the result to +# the lookahead, then transition back to the tokenize_string_character state. +@given(text(), from_regex(r'^["\\].*')) +def test_tokenize_escaped_string_character_valid(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_escaped_string_character(s, port) + + assert token is None + assert port.tell() == 1 + assert lookahead == s + input[0] + assert next == tokenize_string_character + +# * any lookahead, any non-empty input not beginning with a legal string escaped +# character: emit a tokenization error, we've found an invalid string escape. +@given(text(), from_regex(r'^[^"\\].*')) +def test_tokenize_escaped_string_character_invalid(s, input): try: port = ReadablePort(input) token, lookahead, next = tokenize_escaped_string_character(s, port) - assert token is None + assert False # must raise + except TokenError: assert port.tell() == 1 - assert lookahead == s + input[0] - assert next == tokenize_string_character + +# * any lookahead, empty input: emit a tokenization error, we've found an EOF +# inside of a string literal. +@given(text(), just('')) +def test_tokenize_escaped_string_character_eof(s, input): + try: + port = ReadablePort(input) + token, lookahead, next = tokenize_escaped_string_character(s, port) + + assert False # must raise except TokenError: - assert input == '' or input[0] not in '\\n' - assert port.tell() == (1 if input else 0) + assert port.tell() == 0 + +# Cases for tokenize_string_end: +# * any lookahead, any input: generate a String token from the lookahead, then +# transition back to the tokenize_any state with one character of lookahead +# ready to go. @given(text(), text()) def test_tokenize_string_end(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string_end(s, port) assert token == s - assert isinstance(token, String) assert port.tell() == (1 if input else 0) assert lookahead == (input[0] if input else '') assert next == tokenize_any + +# Cases for tokenize_atom: + +# * lookahead containing a string delimiter, any input: found a string atom, +# transition to the tokenize_string state without reading or generating a +# token. 
+@given(just('"'), text())
+def test_tokenize_atom_string(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_atom(s, port)
+
+    assert token is None
+    assert port.tell() == 0
+    assert lookahead == s
+    assert next == tokenize_string
+
+# * lookahead containing something other than a string delimiter, any input:
+#   found a nonstring atom, transition to the tokenize_nonstring_atom state
+#   without reading or generating a token.
+@given(from_regex(r'^[^"]'), text())
+def test_tokenize_atom_nonstring(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_atom(s, port)
+
+    assert token is None
+    assert port.tell() == 0
+    assert lookahead == s
+    assert next == tokenize_nonstring_atom
+
+# Cases for the tokenizer:
+
+# * any sequence of separator-token pairs: if the pairs are coalesced into a
+#   single giant input, does the tokenizer recover the tokens?
+@given(spaced_token_sequences())
+def test_tokenizer(spaced_tokens):
+    input = ''.join(''.join(pair) for pair in spaced_tokens)
+    tokens = [token for (_, token) in spaced_tokens]
+
+    port = ReadablePort(input)
+
+    assert list(tokenize(port)) == tokens
diff --git a/tests/tokens.py b/tests/tokens.py
new file mode 100644
index 0000000..0027fb2
--- /dev/null
+++ b/tests/tokens.py
@@ -0,0 +1,90 @@
+from hypothesis.strategies import just, one_of, characters, text, lists, tuples
+from hypothesis.strategies import composite, recursive
+
+# Generators for token families
+
+# Generates the '(' token.
+def open_parens():
+    return just('(')
+
+# Generates the ')' token.
+def close_parens():
+    return just(')')
+
+# Generates characters that are legal, unescaped, inside of a string.
+def string_bare_characters():
+    return characters(blacklist_characters='\\"')
+
+# Generates legal string escape sequences.
+def string_escaped_characters():
+    return one_of(just('"'), just('\\')).map(lambda c: '\\' + c)
+
+# Generates single-character string representations, including escapes.
+def string_characters():
+    return one_of(string_bare_characters(), string_escaped_characters())
+
+# Generates arbitrary string bodies (strings, without the leading or trailing
+# quotes).
+def string_body():
+    return text(string_characters())
+
+# Generates legal strings.
+def strings():
+    return tuples(just('"'), string_body(), just('"')).map(lambda t: ''.join(t))
+
+# Generates characters which are legal within a symbol.
+def symbol_characters():
+    return characters(blacklist_characters=' \t\n();"')
+
+# Generates legal symbols.
+def symbols():
+    return text(symbol_characters(), min_size=1)
+
+# Generates single whitespace characters.
+def whitespace_characters():
+    return one_of(just('\n'), just(' '), just('\t'))
+
+# Generates a single token.
+def tokens():
+    return one_of(symbols(), strings(), open_parens(), close_parens())
+
+# Generates at least one character of whitespace.
+def whitespace():
+    return text(whitespace_characters(), min_size=1)
+
+# Generates characters which can legally appear inside of a comment (anything
+# but a newline).
+def comment_characters():
+    return characters(blacklist_characters='\n')
+
+# Generates a (possibly-empty) comment, terminated with a trailing newline.
+def comments():
+    return tuples(just(';'), text(comment_characters()), just('\n')).map(lambda t: ''.join(t))
+
+# Generates sequences which can be inserted between arbitrary pairs of tokens
+# without changing their meaning.
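+# A single draw is either a run of whitespace (e.g. ' \t ') or a comment
+# (e.g. '; ignored\n'), both of which the tokenizer is expected to skip.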
+def intertokens():
+    return one_of(comments(), whitespace())
+
+# Generates a pair such that the second element is a token, and joining the
+# elements with an empty string produces a string that tokenizes to the second
+# element.
+def spaced_tokens():
+    def spaced(strategy):
+        return tuples(intertokens(), strategy)
+    def unspaced(strategy):
+        return tuples(one_of(just(''), intertokens()), strategy)
+    def spaced_symbols():
+        return spaced(symbols())
+    def spaced_strings():
+        return unspaced(strings())
+    def spaced_open_parens():
+        return unspaced(open_parens())
+    def spaced_close_parens():
+        return unspaced(close_parens())
+
+    return one_of(spaced_symbols(), spaced_strings(), spaced_open_parens(), spaced_close_parens())
+
+# Generates a list of pairs as per spaced_tokens().
+def spaced_token_sequences():
+    return lists(spaced_tokens())
--
cgit v1.2.3