| | | |
|---|---|---|
| author | Owen Jacobson <owen@grimoire.ca> | 2017-11-10 01:25:30 -0500 |
| committer | Owen Jacobson <owen@grimoire.ca> | 2017-11-11 01:16:01 -0500 |
| commit | e4fb8604aa2fc572a3aeeace1c32de7339d346b5 (patch) | |
| tree | 9f58493ab73ada22943bf009b3a910c3236dca8d | |
| parent | f33c395f833567b665d14fe0c577799605e8091e (diff) | |
Testing fixes.
* Add a top-level test that roundtrips sequences of tokens. (This found a real bug. Thanks, Hypothesis!)
* Remove type conversion from the tokenizer. This simplifies the code, and makes testing considerably easier.
* Fix some bugs in string literal parsing (again: Thanks, Hypothesis!)
Document the test cases, and the case-by-case strategy, better.
This also involved prying apart some tests that cover multiple cases.
Stop treating empty strings as if they were EOFs. (Thanks, Hypothesis!)
fixup! Stop treating empty strings as if they were EOFs. (Thanks, Hypothesis!)
Remove type conversion from the tokenizer.
It turns out that the type conversion made the tokenizer harder to test, because the tokenizer was doing too many things at once. The tokenizer now _only_ divides the input port into tokens, without parsing or converting those tokens.
Fix up tests for fuck
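
For orientation, here is a minimal sketch of what the reworked tokenizer is expected to produce for a small input, based on the diff below. The input program is made up; the module path and behaviour follow from `actinide/tokenizer.py` as changed in this commit.

```python
# Tokens come back as plain strings, comments and whitespace are discarded,
# and string literals are returned verbatim -- quotes and escape sequences
# included. No type conversion happens in the tokenizer any more.
import io
from actinide.tokenizer import tokenize

port = io.StringIO('(display "a \\"quoted\\" word") ; prints something\n')
print(list(tokenize(port)))
# -> ['(', 'display', '"a \\"quoted\\" word"', ')']
```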
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | actinide/tokenizer.py | 114 |
| -rw-r--r-- | actinide/types.py | 12 |
| -rw-r--r-- | tests/__init__.py | 0 |
| -rw-r--r-- | tests/test_tokenizer.py | 291 |
| -rw-r--r-- | tests/tokens.py | 90 |

5 files changed, 382 insertions, 125 deletions
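
The rewritten tokenizer (diff below) describes itself as a trampoline-driven state machine: each state function returns the next state rather than calling it, and a small loop drives the transitions, because Python does not guarantee tail-call optimization. As a rough, standalone illustration of that pattern only, here is a toy version with invented state names and deliberately simplistic token rules; it is not the project's code.

```python
import io

# A toy trampoline: state functions never call each other. Each returns
# (token, lookahead, next_state), and this loop drives the machine. This
# mirrors the shape of tokenize()/tokenize_any() in the diff below.
def drive(port):
    lookahead, state = port.read(1), scan_any
    while state is not done:
        token, lookahead, state = state(lookahead, port)
        if token is not None:
            yield token

def done(lookahead, port):       # trap state: never reads, never emits
    return None, lookahead, done

def scan_any(lookahead, port):   # route on one character of lookahead
    if lookahead == '':
        return None, lookahead, done
    if lookahead.isspace():
        return None, port.read(1), scan_any
    return None, lookahead, scan_word

def scan_word(lookahead, port):  # accumulate characters into the lookahead
    nxt = port.read(1)
    if nxt == '' or nxt.isspace():
        return lookahead, nxt, scan_any
    return None, lookahead + nxt, scan_word

print(list(drive(io.StringIO("two words"))))  # -> ['two', 'words']
```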
diff --git a/actinide/tokenizer.py b/actinide/tokenizer.py index 8fb9d0a..9767033 100644 --- a/actinide/tokenizer.py +++ b/actinide/tokenizer.py @@ -1,5 +1,3 @@ -from .types import * - # ## TOKENIZATION # # The following code implements a state-machine-driven tokenizer which can @@ -8,19 +6,20 @@ from .types import * # # * Comments: ``;`` followed by all bytes to EOF or to the end of the line. # -# * Strings: ``"`` through to the next unescaped ``"`` are read, de-escaped, and -# returned. The sequences ``\"`` and ``\\`` are treated specially: the former -# de-escapes to ``"``, and the latter to ``\``. An unclosed string literal or -# an unknown escape sequence is a tokenization error. -# # * Open and close parens: ``(`` and ``)`` are returned as freestanding tokens. # # * Whitespace: Space, horizontal tab, and newline characters are discarded # during tokenization. # -# * Symbols: Any sequence of characters not included in one of the above classes +# * Strings: ``"`` through to the next unescaped ``"`` are read and returned. +# Sequences within the string beginning with ``\`` indicate an escape, and may +# only be followed by the character ``"`` or ``\``. An unclosed string literal +# or an unknown escape sequence is a tokenization error. +# +# * Atoms: Any sequence of characters not included in one of the above classes # is read and returned as a single token. This includes words, numbers, and -# special literals. +# special literals. (Strings are, technically, a kind of atom, but the lexer +# treats them specially due to their complexity.) # # Internally, the tokenizer is a state machine which maintains two pieces of # state: the "lookahead" (holding data to feed to the next state transition @@ -34,6 +33,11 @@ from .types import * # port) to determine the next state, and return a 3-tuple of ``token`` (may be # ``None``), ``lookahead`` (which replaces the previous lookahead), and ``next`` # (the new next state transition function). +# +# This is heavily inspired by various tail-recursive approaches to tokenizing +# lisp streams. However, the host language does not guarantee tail call +# optimizations, so we use an explicit trampoline function to drive the state +# machine instead of calling each parser directly. class TokenError(Exception): ''' @@ -47,20 +51,20 @@ class TokenError(Exception): # # This is the top-level driver for the state machine that divides the underlying # input into tokens. It does no input handling itself, other than reading the -# first character of the port: so long as the lookahead is non-empty, this calls -# the next state transition function to determine what to do and how to change -# the lookahead. +# first character of the port: this calls the next state transition function to +# determine what to do and how to change the lookahead. # -# Initially, this is in the ``tokenize_any`` state. +# Initially, this is in the ``tokenize_any`` state, and exits once it reaches +# the ``tokenize_eof`` state. def tokenize(port): lookahead, next = port.read(1), tokenize_any - while len(lookahead) > 0: + while next != tokenize_eof: token, lookahead, next = next(lookahead, port) if token is not None: yield token -# If the lookahead is exactly one read result, this will correctly determine the -# next token type and return that state without consuming input. This is +# If the lookahead is exactly one character, this will correctly determine the +# next token type and transition to that state without consuming input. 
This is # generally the correct state to transition to any time the next token is # unknown - for example, at the end of another token. # @@ -75,9 +79,7 @@ def tokenize_any(lookahead, port): return None, lookahead, tokenize_syntax if lookahead in ' \t\n': return None, lookahead, tokenize_whitespace - if lookahead == '"': - return None, lookahead, tokenize_string - return None, lookahead, tokenize_symbol + return None, lookahead, tokenize_atom # Special trap state. This never produces a token, and always transitions to # itself. The lookahead in this state is generally ``''``, and since this never @@ -89,8 +91,8 @@ def tokenize_any(lookahead, port): def tokenize_eof(lookahead, port): return None, lookahead, tokenize_eof -# Consumes one read result at a time until it finds an end of line or runs out -# of input. This throws away comments entirely, at parse time, without +# Consumes one character at a time until it finds an end of line or runs out of +# input. This throws away comments entirely, at tokenization time, without # considering whether the comment content can be separated into tokens. As this # scans the comment, the lookahead will be set to successive characters from the # port, but never more than one character at a time. @@ -104,51 +106,64 @@ def tokenize_comment(lookahead, port): return None, next, tokenize_any return None, next, tokenize_comment -# Consumes the lookahead and packages it up as a Syntax token. This is generally -# appropriate for the ``(`` and ``)`` syntactic elements. +# Generates the entire lookahead as a token. This is generally appropriate for +# the ``(`` and ``)`` syntactic elements. # # The resulting lookahead will be the next character of input, and this always # dispatches back to ``tokenize_any`` so that the next token (if any) can be # determined. def tokenize_syntax(lookahead, port): - return Syntax(lookahead), port.read(1), tokenize_any + return lookahead, port.read(1), tokenize_any -# Consumes and ignores whitespace in the input. This never produces a token, and +# Consumes and ignores one character of input. This never produces a token, and # throws away the lookahead entirely. The resulting lookahead is the next # character of input. def tokenize_whitespace(lookahead, port): return None, port.read(1), tokenize_any -# Consumes characters until it finds a character which cannot be part of a token -# or until it finds the end of input, accumulating them into a single Symbol -# token. This is a heavily-overloaded token category, as it contains not only -# Actinide symbols but also all non-String literals. +# We've ruled out all non-atom tokens. If the lookahead is a string delimiter, +# transitions to a state which tokenizes a single string literal; otherwise, +# transitions to a state which consumes a single non-string atom. In both cases, +# this leaves the lookahead alone, and generates no token. +def tokenize_atom(lookahead, port): + if lookahead == '"': + return None, lookahead, tokenize_string + return None, lookahead, tokenize_nonstring_atom + +# Consumes characters until it finds a character which cannot be part of a +# non-string atom, or until it finds the end of input, accumulating them into a +# single token. This is a heavily-overloaded token category, as it contains not +# only Actinide symbols but also all non-String literals. # # While the tokenizer remains in this state, the lookahead accumulates the # characters of the token. 
When this matches a completed token, it produces a # Symbol token, and resets the lookahead back to a single read result containing # the next character of input. -def tokenize_symbol(lookahead, port): +def tokenize_nonstring_atom(lookahead, port): next = port.read(1) if next == '': - return Symbol(lookahead), next, tokenize_any + return lookahead, next, tokenize_any if next in '"(); \t\n': - return Symbol(lookahead), next, tokenize_any - return None, lookahead + next, tokenize_symbol + return lookahead, next, tokenize_any + return None, lookahead + next, tokenize_nonstring_atom # ### STRINGS # -# The following states handle string literals in the input stream. String -# literals are fairly simple: they begin with a quote, contain arbitrary -# characters other than a bare \ or ", and end with a quote. The sequences -# ``\\`` and ``\"`` are de-escaped by removing the leading backslash and -# included in the resulting string. +# The following family of states handles string literals in the input stream. +# String literals are fairly simple: they begin with a quote, contain arbitrary +# characters other than a bare \ or ", and end with a quote. (Note that ``\n`` +# is not an escape sequence: unescaped newlines are permitted within string +# literals.) # # These states use the lookahead to accumulate the characters of the string. On # transition back to ``tokenize_any``, the lookahead is always set back to a # single character. If, at any point, these states encounter EOF, they raise a # ``TokenError``: no legal token in Actinide begins with a quote mark and ends # with EOF. +# +# Because tokenization is only concerned with dividing the input into tokens, +# this machine *does not* strip quotes or replace escape sequences. On success, +# it generates a token containing the whole the string literal, verbatim. # The lookahead is assumed to be the opening quote of a string, and discarded. # Read forwards one character to determine whether this is an empty string @@ -161,9 +176,11 @@ def tokenize_string(lookahead, port): next = port.read(1) if next == '': raise TokenError('Unclosed string literal') + if next == '\\': + return None, lookahead + next, tokenize_escaped_string_character if next == '"': - return None, '', tokenize_string_end - return None, next, tokenize_string_character + return None, lookahead + next, tokenize_string_end + return None, lookahead + next, tokenize_string_character # The lookahead contains the body of the string read so far. Reads forwards one # character to determine if the string continues, contains an escaped character, @@ -175,15 +192,14 @@ def tokenize_string_character(lookahead, port): if next == '': raise TokenError('Unclosed string literal') if next == '\\': - return None, lookahead, tokenize_escaped_string_character + return None, lookahead + next, tokenize_escaped_string_character if next == '"': - return None, lookahead, tokenize_string_end + return None, lookahead + next, tokenize_string_end return None, lookahead + next, tokenize_string_character # The lookahead contains the body of the string so far. Reads forwards one # character to determine which, if any, escaped character to process: if it's -# one we recognize, de-escape it and append it to the string, otherwise raise a -# TokenError. +# one we recognize, append it to the string, otherwise raise a TokenError. # # This never yields a token, and always dispatches back to # ``tokenize_string_character`` on a legal escape character. 
@@ -191,14 +207,14 @@ def tokenize_escaped_string_character(lookahead, port): next = port.read(1) if next == '': raise TokenError('Unclosed string literal') - if next == 'n': - return None, lookahead + '\n', tokenize_string_character + if next == '"': + return None, lookahead + next, tokenize_string_character if next == '\\': - return None, lookahead + '\\', tokenize_string_character - raise TokenError(f"Unknown string escape '\{next}'") + return None, lookahead + next, tokenize_string_character + raise TokenError(f"Invalid string escape '\\{next}'") # Package the lookahead (the full string body, de-escaped and without leading # and trailing quotes) up as a String token and return it, then transition back # to the ``tokenize_any`` state with a single read result in the lookahead. def tokenize_string_end(lookahead, port): - return String(lookahead), port.read(1), tokenize_any + return lookahead, port.read(1), tokenize_any diff --git a/actinide/types.py b/actinide/types.py deleted file mode 100644 index 2b618f4..0000000 --- a/actinide/types.py +++ /dev/null @@ -1,12 +0,0 @@ -__all__ = ['String', 'Symbol', 'Syntax'] - -# ## REPRESENTATIONS -# -# The following defines specify the Python representations of various Actinide -# types. - -String = str -class Symbol(str): - pass -class Syntax(str): - pass diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tests/__init__.py diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index a300eb2..76a07a9 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,17 +1,71 @@ -from hypothesis import given, settings, HealthCheck -from hypothesis.strategies import text, from_regex +from hypothesis import given, settings, HealthCheck, event +from hypothesis.strategies import just, text, characters, from_regex, one_of, tuples, sampled_from import io from actinide.tokenizer import * +from .tokens import spaced_token_sequences + class ReadablePort(io.StringIO): def __repr__(self): # Slightly friendlier debugging output return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})" -def not_(f): - return lambda *args, **kwargs: not f(*args, **kwargs) +# Many of the following tests proceed by cases, because the underlying behaviour +# is too complex to treat as a uniform set of properties. The cases are meant to +# be total, and in principle could be defined as a set of filters on the +# ``text()`` generator that , combined, exhaust the possible outcomes of that +# generator. +# +# Implementing the tests that way causes Hypothesis to generate a significant +# number of examples that it then throws away without verifying, because +# Hypothesis has no insight into filters to use when generating examples. +# Instead, this test suite specifies generators per-case. + +# Cases for tokenize_any: + +# We test this a bit differently from the subsequent tokenizer states. Because +# it's a pure routing state, we can generate lookahead, expected_state pairs and +# check them in one pass, rather than testing each possible outcome separately. +# In every case, the input is irrelevant: this state never reads. 
+ +def next_token_states(): + return one_of( + tuples(just(''), just(tokenize_eof)), + tuples(just(';'), just(tokenize_comment)), + tuples(sampled_from('()'), just(tokenize_syntax)), + tuples(sampled_from(' \t\n'), just(tokenize_whitespace)), + tuples(just('"'), just(tokenize_atom)), + tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)), + ) + +@given(next_token_states(), text()) +def test_tokenize_any(lookahead_next, input): + s, expected_state = lookahead_next + port = ReadablePort(input) + token, lookahead, next = tokenize_any(s, input) + + assert token is None + assert lookahead == s + assert next == expected_state + assert port.tell() == 0 +# Since the previous test case is rigged for success, also verify that no input +# causes tokenize_any to enter an unexpected state or to throw an exception. +@given(text(), text()) +def test_tokenize_any_fuzz(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_any(s, input) + + assert token is None + assert lookahead == s + assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom) + assert port.tell() == 0 + +# Cases for tokenize_eof: + +# * any lookahead, any input: tokenize_eof is a trap state performing no reads, +# always returning to itself, and never generating a token. @given(text(), text()) def test_tokenize_eof(s, input): port = ReadablePort(input) @@ -22,12 +76,12 @@ def test_tokenize_eof(s, input): assert next == tokenize_eof assert port.tell() == 0 -def comment_continues(text): - if text == '': - return False - return text[0] != '\n' +# Cases for tokenize_comment: -@given(text(), text().filter(comment_continues)) +# * any lookahead, one or more characters beginning with a non-newline as input: +# tokenize_comment continues the current comment, throwing away one character +# of input, without generating a token. +@given(text(), from_regex(r'^[^\n].*')) def test_tokenize_comment_continues(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_comment(s, port) @@ -37,7 +91,11 @@ def test_tokenize_comment_continues(s, input): assert lookahead == input[0] assert next == tokenize_comment -@given(text(), text().filter(not_(comment_continues))) +# * any lookahead, one or more characters beginning with a newline as input, and +# * any lookahead, empty input: +# tokenize_comment concludes the current comment and prepares for the next +# token, without generating a token. +@given(text(), just('') | from_regex(r'^\n.*')) def test_tokenize_comment_ends(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_comment(s, port) @@ -47,17 +105,26 @@ def test_tokenize_comment_ends(s, input): assert lookahead == (input[0] if input else '') assert next == tokenize_any +# Cases for tokenize_syntax: + +# * any lookahead, any input: generate the lookahead as a Syntax token and +# transition back to tokenize_any to prepare for the next token, with one +# character of lookahead ready to go. 
@given(text(), text()) def test_tokenize_syntax(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_syntax(s, port) - assert token == Syntax(s) - assert isinstance(token, Syntax) + assert token == s assert port.tell() == (1 if input else 0) assert lookahead == (input[0] if input else '') assert next == tokenize_any +# Cases for test_tokenize_whitespace: + +# * any lookahead, any input: throw away the presumed-whitespace lookahead, then +# transition back to tokenize_any to prepare for the next token, with one +# character of lookahead ready to go, without generating a token. @given(text(), text()) def test_tokenize_whitespace(s, input): port = ReadablePort(input) @@ -68,67 +135,82 @@ def test_tokenize_whitespace(s, input): assert lookahead == (input[0] if input else '') assert next == tokenize_any -def symbol_continues(text): - if text == '': - return False - return text[0] not in ' \n\t();"' +# Cases for tokenize_nonstring_atom: -@given(text(), text().filter(symbol_continues)) -def test_tokenize_symbol_continues(s, input): +# * any lookahead, any non-empty input not beginning with whitespace, syntax, a +# comment delimiter, or a string literal: accumulate one character of input +# onto the lookahead, then transition back to tokenize_symbol to process the +# next character of input, without generating a token. +@given(text(), from_regex(r'^[^ \n\t();"].*')) +def test_tokenize_nonstring_atom_continues(s, input): port = ReadablePort(input) - token, lookahead, next = tokenize_symbol(s, port) + token, lookahead, next = tokenize_nonstring_atom(s, port) assert token is None assert port.tell() == 1 assert lookahead == s + input[0] - assert next == tokenize_symbol - -@given(text(), text().filter(not_(symbol_continues))) -def test_tokenize_symbol_ends(s, input): + assert next == tokenize_nonstring_atom + +# * any lookahead, a non-empty input beginning with whitespace, syntax, a +# comment delimiter, or a string literal, and +# * any lookahead, empty input: +# generate the accumulated input as a Symbol token, then transition back to tokenize_any with one character of lookahead ready to go. +@given(text(), just('') | from_regex(r'^[ \n\t();"].*')) +def test_tokenize_tokenize_nonstring_atom_ends(s, input): port = ReadablePort(input) - token, lookahead, next = tokenize_symbol(s, port) + token, lookahead, next = tokenize_nonstring_atom(s, port) - assert token == Symbol(s) - assert isinstance(token, Symbol) + assert token == s assert port.tell() == (1 if input else 0) assert lookahead == (input[0] if input else '') assert next == tokenize_any -def string_continues(text): - if text == '': - return False - return not text[0] == '"' +# And now, the _worst_ part of the state machine. Cases for tokenize_string: -@given(text(), text().filter(string_continues)) +# * any lookahead, a non-empty input not beginning with a string delimiter: +# begin a non-empty string by transitioning to the tokenize_string_character +# state with one character of lookahead, without generating a token. 
+@given(text(), from_regex(r'^[^"].*')) def test_tokenize_string_continues(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string(s, port) assert token is None assert port.tell() == 1 - assert lookahead == input[0] + assert lookahead == s + input[0] assert next == tokenize_string_character -@given(text(), text().filter(not_(string_continues))) -def test_tokenize_string_ends(s, input): +# * any lookahad, a non-empty input beginning with a string delimiter: terminate +# an empty string by transitioning to the tokenize_string_end state with an +# *empty* lookahead, without generating a token. +@given(text(), from_regex(r'^["].*')) +def test_tokenize_string_empty(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_string(s, port) + + assert token is None + assert port.tell() == 1 + assert lookahead == s + input[0] + assert next == tokenize_string_end + +# * any lookahead, empty input: emit a tokenization error, as we've encountered +# EOF inside of a string. +@given(text(), just('')) +def test_tokenize_string_eof(s, input): try: port = ReadablePort(input) token, lookahead, next = tokenize_string(s, port) - assert token is None - assert port.tell() == 1 - assert lookahead == '' - assert next == tokenize_string_end + assert False # must raise except TokenError: - assert input == '' assert port.tell() == 0 -def is_escape(text): - if text == '': - return False - return text[0] == '\\' +# Cases for tokenize_string_character: -@given(text(), text().filter(string_continues).filter(not_(is_escape))) +# * any lookahead, any non-empty input not beginning with a string delimiter or +# escape character: append one character of input to the lookahead, then +# continue in the tokenize_string_character state without generating a token. +@given(text(), from_regex(r'^[^\\"].*')) def test_tokenize_string_character_continues(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string_character(s, port) @@ -138,56 +220,137 @@ def test_tokenize_string_character_continues(s, input): assert lookahead == s + input[0] assert next == tokenize_string_character -# Using from_regex() rather than text() because searching randomly for strings -# that start with a specific character is far, _far_ too slow. (It often fails -# to find any examples.) I _think_ this preserves the property that this group -# of three tests are exhaustive, but it's not as obvious as it would be if I -# could use text() here. -@given(text(), from_regex(r'\\.*').filter(string_continues).filter(is_escape)) +# * any lookahead, any non-empty input which begins with an escape character: +# leave the lookahead unchanged, but transition to the +# tokenize_escaped_string_character state to determine which escape character +# we're dealing with, without emitting a token. +@given(text(), from_regex(r'^[\\].*')) def test_tokenize_string_character_begins_escape(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string_character(s, port) assert token is None assert port.tell() == 1 - assert lookahead == s + assert lookahead == s + input[0] assert next == tokenize_escaped_string_character -@given(text(), text().filter(not_(string_continues))) +# * any lookahead, any non-empty input which begins with a string delimiter: +# we're at the end of a string. Transition to the tokenize_string_end state +# with the current lookahead, without generating a token. 
+@given(text(), from_regex(r'^["].*')) def test_tokenize_string_character_ends(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_string_character(s, port) + + assert token is None + assert port.tell() == 1 + assert lookahead == s + input[0] + assert next == tokenize_string_end + +# * any lookahead, empty input: emit a tokenization error, as we've encountered +# EOF inside of a string literal. +@given(text(), just('')) +def test_tokenize_string_character_eof(s, input): try: port = ReadablePort(input) token, lookahead, next = tokenize_string_character(s, port) - assert token is None - assert port.tell() == 1 - assert lookahead == s - assert next == tokenize_string_end + assert False # must raise except TokenError: assert input == '' assert port.tell() == 0 -@given(text(), text()) -def test_tokenize_escaped_string_character(s, input): +# Cases for tokenize_escaped_string: + +# * any lookahead, any non-empty input beginning with a legal string escaped +# character: de-escape the first character of the input, append the result to +# the lookahead, then transition back to the tokenize_string_character state. +@given(text(), from_regex(r'^["\\].*')) +def test_tokenize_escaped_string_character_valid(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_escaped_string_character(s, port) + + assert token is None + assert port.tell() == 1 + assert lookahead == s + input[0] + assert next == tokenize_string_character + +# * any lookahead, any non-empty input not beginning with a legal string escaped +# character: emit a tokenization error, we've found an invalid string escape. +@given(text(), from_regex(r'^[^"\\].*')) +def test_tokenize_escaped_string_character_invalid(s, input): try: port = ReadablePort(input) token, lookahead, next = tokenize_escaped_string_character(s, port) - assert token is None + assert False # must raise + except TokenError: assert port.tell() == 1 - assert lookahead == s + input[0] - assert next == tokenize_string_character + +# * any lookahead, empty input: emit a tokenization error, we've found an EOF +# inside of a string literal. +@given(text(), just('')) +def test_tokenize_escaped_string_character_eof(s, input): + try: + port = ReadablePort(input) + token, lookahead, next = tokenize_escaped_string_character(s, port) + + assert False # must raise except TokenError: - assert input == '' or input[0] not in '\\n' - assert port.tell() == (1 if input else 0) + assert port.tell() == 0 + +# Cases for tokenize_string_end: +# * any lookahead, any input: generate a String token from the lookahead, then +# transition back to the tokenize_any state with one character of lookahead +# ready to go. @given(text(), text()) def test_tokenize_string_end(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string_end(s, port) assert token == s - assert isinstance(token, String) assert port.tell() == (1 if input else 0) assert lookahead == (input[0] if input else '') assert next == tokenize_any + +# Cases for tokenize_atom: + +# * lookahead containing a string delimiter, any input: found a string atom, +# transition to the tokenize_string state without reading or generating a +# token. 
+@given(just('"'), text()) +def test_tokenize_atom_string(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_atom(s, port) + + assert token is None + assert port.tell() == 0 + assert lookahead == s + assert next == tokenize_string + +# * lookahead containing something other than a string delimiter, any input: +# found a nonstring atom, transition to the tokenize_nonstring_atom state +# without reading or generating a token. +@given(from_regex(r'^[^"]'), text()) +def test_tokenize_atom_nonstring(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_atom(s, port) + + assert token is None + assert port.tell() == 0 + assert lookahead == s + assert next == tokenize_nonstring_atom + +# Cases for the tokenizer: + +# * any sequence of separator-token pairs: if the pairs are coalesced into a +# single giant input, does the tokenizer recover the tokens? +@given(spaced_token_sequences()) +def test_tokenizer(spaced_tokens): + input = ''.join(''.join(pair) for pair in spaced_tokens) + tokens = [token for (_, token) in spaced_tokens] + + port = ReadablePort(input) + + assert list(tokenize(port)) == tokens diff --git a/tests/tokens.py b/tests/tokens.py new file mode 100644 index 0000000..0027fb2 --- /dev/null +++ b/tests/tokens.py @@ -0,0 +1,90 @@ +from hypothesis.strategies import just, one_of, characters, text, lists, tuples +from hypothesis.strategies import composite, recursive + +# Generators for token families + +# Generates the `(` token. +def open_parens(): + return just('(') + +# Generates the ')' token. +def close_parens(): + return just(')') + +# Generates characters that are legal, unescaped, inside of a string. +def string_bare_characters(): + return characters(blacklist_characters='\\"') + +# Generates legal string escape sequences. +def string_escaped_characters(): + return one_of(just('"'), just('\\')).map(lambda c: '\\' + c) + +# Generates single-character string representations, including escapes. +def string_characters(): + return one_of(string_bare_characters(), string_escaped_characters()) + +# Generates arbitrary string bodies (strings, without leading or trailing +# quotes) +def string_body(): + return text(string_characters()) + +# Generates legal strings. +def strings(): + return tuples(just('"'), string_body(), just('"')).map(lambda t: ''.join(t)) + +# Generates characters which are legal within a symbol. +def symbol_characters(): + return characters(blacklist_characters=' \t\n();"') + +# Generates legal symbols. +def symbols(): + return text(symbol_characters(), min_size=1) + +# Generates single whitespace characters. +def whitespace_characters(): + return one_of(just('\n'), just(' '), just('\t')) + +# Generates a single token. +def tokens(): + return one_of(symbols(), strings(), open_parens(), close_parens()) + +# Generates at least one character of whitespace. +def whitespace(): + return text(whitespace_characters(), min_size=1) + +# Generates characters which can legally appear inside of a comment (anything +# but a newline). +def comment_characters(): + return characters(blacklist_characters='\n') + +# Generates a (possibly-empty) comment, terminated with a trailing newline. +def comments(): + return tuples(just(';'), text(comment_characters()), just('\n')).map(lambda t: ''.join(t)) + +# Generates sequences which can be inserted between arbitrary pairs of tokens +# without changing their meaning. 
+def intertokens(): + return one_of(comments(), whitespace()) + +# Generate a pair such that the second element is a token, and joining the +# elements with an empty string produces a string that tokenizes to the second +# element. +def spaced_tokens(): + def spaced(strategy): + return tuples(intertokens(), strategy) + def unspaced(strategy): + return tuples(one_of(just(''), intertokens()), strategy) + def spaced_symbols(): + return spaced(symbols()) + def spaced_strings(): + return unspaced(strings()) + def spaced_open_parens(): + return unspaced(open_parens()) + def spaced_close_parens(): + return unspaced(close_parens()) + + return one_of(spaced_symbols(), spaced_strings(), spaced_open_parens(), spaced_close_parens()) + +# Generats a list of pairs as per spaced_token(). +def spaced_token_sequences(): + return lists(spaced_tokens()) |
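
Finally, to make the new string-literal rules concrete: only `\"` and `\\` are accepted as escapes, unescaped newlines are allowed inside a string, and a backslash followed by anything else, or EOF inside a string, raises `TokenError`. A couple of throwaway examples (inputs invented; the behaviour follows from `tokenize_string` and the related states in the diff above):

```python
# Illustrative only: what the string-handling states accept and reject.
import io
from actinide.tokenizer import tokenize, TokenError

print(list(tokenize(io.StringIO('"line one\nline two"'))))
# -> ['"line one\nline two"']   (an unescaped newline is legal in a string)

for bad in ('"\\n"', '"unterminated'):
    try:
        list(tokenize(io.StringIO(bad)))
    except TokenError as e:
        print(bad, '->', e)
# The first input uses backslash-n, which is not a recognized escape; the
# second hits EOF inside the string literal. Both raise TokenError.
```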
