-rw-r--r--   actinide/tokenizer.py   | 114
-rw-r--r--   actinide/types.py       |  12
-rw-r--r--   tests/__init__.py       |   0
-rw-r--r--   tests/test_tokenizer.py | 291
-rw-r--r--   tests/tokens.py         |  90
5 files changed, 382 insertions, 125 deletions
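The reworked tokenizer below is consumed as a plain generator over any object exposing a read(1) method. A minimal usage sketch (io.StringIO stands in for the port here, just as the test suite's ReadablePort does; the expected output follows from the state machine defined in the diff):

    import io
    from actinide.tokenizer import tokenize

    port = io.StringIO('(add 1 two) ; trailing comment')
    print(list(tokenize(port)))
    # Expected: ['(', 'add', '1', 'two', ')'] -- plain strings, with whitespace
    # and the comment discarded during tokenization.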
diff --git a/actinide/tokenizer.py b/actinide/tokenizer.py index 8fb9d0a..9767033 100644 --- a/actinide/tokenizer.py +++ b/actinide/tokenizer.py @@ -1,5 +1,3 @@ -from .types import * - # ## TOKENIZATION # # The following code implements a state-machine-driven tokenizer which can @@ -8,19 +6,20 @@ from .types import * # # * Comments: ``;`` followed by all bytes to EOF or to the end of the line. # -# * Strings: ``"`` through to the next unescaped ``"`` are read, de-escaped, and -# returned. The sequences ``\"`` and ``\\`` are treated specially: the former -# de-escapes to ``"``, and the latter to ``\``. An unclosed string literal or -# an unknown escape sequence is a tokenization error. -# # * Open and close parens: ``(`` and ``)`` are returned as freestanding tokens. # # * Whitespace: Space, horizontal tab, and newline characters are discarded # during tokenization. # -# * Symbols: Any sequence of characters not included in one of the above classes +# * Strings: ``"`` through to the next unescaped ``"`` are read and returned. +# Sequences within the string beginning with ``\`` indicate an escape, and may +# only be followed by the character ``"`` or ``\``. An unclosed string literal +# or an unknown escape sequence is a tokenization error. +# +# * Atoms: Any sequence of characters not included in one of the above classes # is read and returned as a single token. This includes words, numbers, and -# special literals. +# special literals. (Strings are, technically, a kind of atom, but the lexer +# treats them specially due to their complexity.) # # Internally, the tokenizer is a state machine which maintains two pieces of # state: the "lookahead" (holding data to feed to the next state transition @@ -34,6 +33,11 @@ from .types import * # port) to determine the next state, and return a 3-tuple of ``token`` (may be # ``None``), ``lookahead`` (which replaces the previous lookahead), and ``next`` # (the new next state transition function). +# +# This is heavily inspired by various tail-recursive approaches to tokenizing +# lisp streams. However, the host language does not guarantee tail call +# optimizations, so we use an explicit trampoline function to drive the state +# machine instead of calling each parser directly. class TokenError(Exception): ''' @@ -47,20 +51,20 @@ class TokenError(Exception): # # This is the top-level driver for the state machine that divides the underlying # input into tokens. It does no input handling itself, other than reading the -# first character of the port: so long as the lookahead is non-empty, this calls -# the next state transition function to determine what to do and how to change -# the lookahead. +# first character of the port: this calls the next state transition function to +# determine what to do and how to change the lookahead. # -# Initially, this is in the ``tokenize_any`` state. +# Initially, this is in the ``tokenize_any`` state, and exits once it reaches +# the ``tokenize_eof`` state. def tokenize(port): lookahead, next = port.read(1), tokenize_any - while len(lookahead) > 0: + while next != tokenize_eof: token, lookahead, next = next(lookahead, port) if token is not None: yield token -# If the lookahead is exactly one read result, this will correctly determine the -# next token type and return that state without consuming input. This is +# If the lookahead is exactly one character, this will correctly determine the +# next token type and transition to that state without consuming input. 
This is # generally the correct state to transition to any time the next token is # unknown - for example, at the end of another token. # @@ -75,9 +79,7 @@ def tokenize_any(lookahead, port): return None, lookahead, tokenize_syntax if lookahead in ' \t\n': return None, lookahead, tokenize_whitespace - if lookahead == '"': - return None, lookahead, tokenize_string - return None, lookahead, tokenize_symbol + return None, lookahead, tokenize_atom # Special trap state. This never produces a token, and always transitions to # itself. The lookahead in this state is generally ``''``, and since this never @@ -89,8 +91,8 @@ def tokenize_any(lookahead, port): def tokenize_eof(lookahead, port): return None, lookahead, tokenize_eof -# Consumes one read result at a time until it finds an end of line or runs out -# of input. This throws away comments entirely, at parse time, without +# Consumes one character at a time until it finds an end of line or runs out of +# input. This throws away comments entirely, at tokenization time, without # considering whether the comment content can be separated into tokens. As this # scans the comment, the lookahead will be set to successive characters from the # port, but never more than one character at a time. @@ -104,51 +106,64 @@ def tokenize_comment(lookahead, port): return None, next, tokenize_any return None, next, tokenize_comment -# Consumes the lookahead and packages it up as a Syntax token. This is generally -# appropriate for the ``(`` and ``)`` syntactic elements. +# Generates the entire lookahead as a token. This is generally appropriate for +# the ``(`` and ``)`` syntactic elements. # # The resulting lookahead will be the next character of input, and this always # dispatches back to ``tokenize_any`` so that the next token (if any) can be # determined. def tokenize_syntax(lookahead, port): - return Syntax(lookahead), port.read(1), tokenize_any + return lookahead, port.read(1), tokenize_any -# Consumes and ignores whitespace in the input. This never produces a token, and +# Consumes and ignores one character of input. This never produces a token, and # throws away the lookahead entirely. The resulting lookahead is the next # character of input. def tokenize_whitespace(lookahead, port): return None, port.read(1), tokenize_any -# Consumes characters until it finds a character which cannot be part of a token -# or until it finds the end of input, accumulating them into a single Symbol -# token. This is a heavily-overloaded token category, as it contains not only -# Actinide symbols but also all non-String literals. +# We've ruled out all non-atom tokens. If the lookahead is a string delimiter, +# transitions to a state which tokenizes a single string literal; otherwise, +# transitions to a state which consumes a single non-string atom. In both cases, +# this leaves the lookahead alone, and generates no token. +def tokenize_atom(lookahead, port): + if lookahead == '"': + return None, lookahead, tokenize_string + return None, lookahead, tokenize_nonstring_atom + +# Consumes characters until it finds a character which cannot be part of a +# non-string atom, or until it finds the end of input, accumulating them into a +# single token. This is a heavily-overloaded token category, as it contains not +# only Actinide symbols but also all non-String literals. # # While the tokenizer remains in this state, the lookahead accumulates the # characters of the token. 
When this matches a completed token, it produces a # Symbol token, and resets the lookahead back to a single read result containing # the next character of input. -def tokenize_symbol(lookahead, port): +def tokenize_nonstring_atom(lookahead, port): next = port.read(1) if next == '': - return Symbol(lookahead), next, tokenize_any + return lookahead, next, tokenize_any if next in '"(); \t\n': - return Symbol(lookahead), next, tokenize_any - return None, lookahead + next, tokenize_symbol + return lookahead, next, tokenize_any + return None, lookahead + next, tokenize_nonstring_atom # ### STRINGS # -# The following states handle string literals in the input stream. String -# literals are fairly simple: they begin with a quote, contain arbitrary -# characters other than a bare \ or ", and end with a quote. The sequences -# ``\\`` and ``\"`` are de-escaped by removing the leading backslash and -# included in the resulting string. +# The following family of states handles string literals in the input stream. +# String literals are fairly simple: they begin with a quote, contain arbitrary +# characters other than a bare \ or ", and end with a quote. (Note that ``\n`` +# is not an escape sequence: unescaped newlines are permitted within string +# literals.) # # These states use the lookahead to accumulate the characters of the string. On # transition back to ``tokenize_any``, the lookahead is always set back to a # single character. If, at any point, these states encounter EOF, they raise a # ``TokenError``: no legal token in Actinide begins with a quote mark and ends # with EOF. +# +# Because tokenization is only concerned with dividing the input into tokens, +# this machine *does not* strip quotes or replace escape sequences. On success, +# it generates a token containing the whole the string literal, verbatim. # The lookahead is assumed to be the opening quote of a string, and discarded. # Read forwards one character to determine whether this is an empty string @@ -161,9 +176,11 @@ def tokenize_string(lookahead, port): next = port.read(1) if next == '': raise TokenError('Unclosed string literal') + if next == '\\': + return None, lookahead + next, tokenize_escaped_string_character if next == '"': - return None, '', tokenize_string_end - return None, next, tokenize_string_character + return None, lookahead + next, tokenize_string_end + return None, lookahead + next, tokenize_string_character # The lookahead contains the body of the string read so far. Reads forwards one # character to determine if the string continues, contains an escaped character, @@ -175,15 +192,14 @@ def tokenize_string_character(lookahead, port): if next == '': raise TokenError('Unclosed string literal') if next == '\\': - return None, lookahead, tokenize_escaped_string_character + return None, lookahead + next, tokenize_escaped_string_character if next == '"': - return None, lookahead, tokenize_string_end + return None, lookahead + next, tokenize_string_end return None, lookahead + next, tokenize_string_character # The lookahead contains the body of the string so far. Reads forwards one # character to determine which, if any, escaped character to process: if it's -# one we recognize, de-escape it and append it to the string, otherwise raise a -# TokenError. +# one we recognize, append it to the string, otherwise raise a TokenError. # # This never yields a token, and always dispatches back to # ``tokenize_string_character`` on a legal escape character. 
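Since the string states above now hand back the literal verbatim (quotes, backslashes, and all), whatever consumes these tokens has to strip the delimiters and resolve the two escape sequences itself. A sketch of such a helper, hypothetical and not part of this change, might look like:

    # Hypothetical post-tokenization helper: drop the surrounding quotes and
    # undo the two escapes the tokenizer accepts (\" and \\).
    def unescape_string_token(token):
        body, out, i = token[1:-1], [], 0
        while i < len(body):
            if body[i] == '\\':
                out.append(body[i + 1])  # the tokenizer guarantees '"' or '\\' here
                i += 2
            else:
                out.append(body[i])
                i += 1
        return ''.join(out)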
@@ -191,14 +207,14 @@ def tokenize_escaped_string_character(lookahead, port): next = port.read(1) if next == '': raise TokenError('Unclosed string literal') - if next == 'n': - return None, lookahead + '\n', tokenize_string_character + if next == '"': + return None, lookahead + next, tokenize_string_character if next == '\\': - return None, lookahead + '\\', tokenize_string_character - raise TokenError(f"Unknown string escape '\{next}'") + return None, lookahead + next, tokenize_string_character + raise TokenError(f"Invalid string escape '\\{next}'") # Package the lookahead (the full string body, de-escaped and without leading # and trailing quotes) up as a String token and return it, then transition back # to the ``tokenize_any`` state with a single read result in the lookahead. def tokenize_string_end(lookahead, port): - return String(lookahead), port.read(1), tokenize_any + return lookahead, port.read(1), tokenize_any diff --git a/actinide/types.py b/actinide/types.py deleted file mode 100644 index 2b618f4..0000000 --- a/actinide/types.py +++ /dev/null @@ -1,12 +0,0 @@ -__all__ = ['String', 'Symbol', 'Syntax'] - -# ## REPRESENTATIONS -# -# The following defines specify the Python representations of various Actinide -# types. - -String = str -class Symbol(str): - pass -class Syntax(str): - pass diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tests/__init__.py diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index a300eb2..76a07a9 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,17 +1,71 @@ -from hypothesis import given, settings, HealthCheck -from hypothesis.strategies import text, from_regex +from hypothesis import given, settings, HealthCheck, event +from hypothesis.strategies import just, text, characters, from_regex, one_of, tuples, sampled_from import io from actinide.tokenizer import * +from .tokens import spaced_token_sequences + class ReadablePort(io.StringIO): def __repr__(self): # Slightly friendlier debugging output return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})" -def not_(f): - return lambda *args, **kwargs: not f(*args, **kwargs) +# Many of the following tests proceed by cases, because the underlying behaviour +# is too complex to treat as a uniform set of properties. The cases are meant to +# be total, and in principle could be defined as a set of filters on the +# ``text()`` generator that , combined, exhaust the possible outcomes of that +# generator. +# +# Implementing the tests that way causes Hypothesis to generate a significant +# number of examples that it then throws away without verifying, because +# Hypothesis has no insight into filters to use when generating examples. +# Instead, this test suite specifies generators per-case. + +# Cases for tokenize_any: + +# We test this a bit differently from the subsequent tokenizer states. Because +# it's a pure routing state, we can generate lookahead, expected_state pairs and +# check them in one pass, rather than testing each possible outcome separately. +# In every case, the input is irrelevant: this state never reads. 
+
+def next_token_states():
+    return one_of(
+        tuples(just(''), just(tokenize_eof)),
+        tuples(just(';'), just(tokenize_comment)),
+        tuples(sampled_from('()'), just(tokenize_syntax)),
+        tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
+        tuples(just('"'), just(tokenize_atom)),
+        tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
+    )
+
+@given(next_token_states(), text())
+def test_tokenize_any(lookahead_next, input):
+    s, expected_state = lookahead_next
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_any(s, port)
+
+    assert token is None
+    assert lookahead == s
+    assert next == expected_state
+    assert port.tell() == 0
+# Since the previous test case is rigged for success, also verify that no input
+# causes tokenize_any to enter an unexpected state or to throw an exception.
+@given(text(), text())
+def test_tokenize_any_fuzz(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_any(s, port)
+
+    assert token is None
+    assert lookahead == s
+    assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
+    assert port.tell() == 0
+
+# Cases for tokenize_eof:
+
+# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
+# always returning to itself, and never generating a token.
 @given(text(), text())
 def test_tokenize_eof(s, input):
     port = ReadablePort(input)
@@ -22,12 +76,12 @@ def test_tokenize_eof(s, input):
     assert next == tokenize_eof
     assert port.tell() == 0
 
-def comment_continues(text):
-    if text == '':
-        return False
-    return text[0] != '\n'
+# Cases for tokenize_comment:
 
-@given(text(), text().filter(comment_continues))
+# * any lookahead, one or more characters beginning with a non-newline as input:
+# tokenize_comment continues the current comment, throwing away one character
+# of input, without generating a token.
+@given(text(), from_regex(r'^[^\n].*'))
 def test_tokenize_comment_continues(s, input):
     port = ReadablePort(input)
     token, lookahead, next = tokenize_comment(s, port)
@@ -37,7 +91,11 @@ def test_tokenize_comment_continues(s, input):
     assert lookahead == input[0]
     assert next == tokenize_comment
 
-@given(text(), text().filter(not_(comment_continues)))
+# * any lookahead, one or more characters beginning with a newline as input, and
+# * any lookahead, empty input:
+# tokenize_comment concludes the current comment and prepares for the next
+# token, without generating a token.
+@given(text(), just('') | from_regex(r'^\n.*'))
 def test_tokenize_comment_ends(s, input):
     port = ReadablePort(input)
     token, lookahead, next = tokenize_comment(s, port)
@@ -47,17 +105,26 @@ def test_tokenize_comment_ends(s, input):
     assert lookahead == (input[0] if input else '')
     assert next == tokenize_any
 
+# Cases for tokenize_syntax:
+
+# * any lookahead, any input: generate the lookahead as a Syntax token and
+# transition back to tokenize_any to prepare for the next token, with one
+# character of lookahead ready to go.
 @given(text(), text())
 def test_tokenize_syntax(s, input):
     port = ReadablePort(input)
     token, lookahead, next = tokenize_syntax(s, port)
 
-    assert token == Syntax(s)
-    assert isinstance(token, Syntax)
+    assert token == s
     assert port.tell() == (1 if input else 0)
     assert lookahead == (input[0] if input else '')
     assert next == tokenize_any
 
+# Cases for tokenize_whitespace:
+
+# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
+# transition back to tokenize_any to prepare for the next token, with one
+# character of lookahead ready to go, without generating a token.
 @given(text(), text())
 def test_tokenize_whitespace(s, input):
     port = ReadablePort(input)
@@ -68,67 +135,82 @@ def test_tokenize_whitespace(s, input):
     assert lookahead == (input[0] if input else '')
     assert next == tokenize_any
 
-def symbol_continues(text):
-    if text == '':
-        return False
-    return text[0] not in ' \n\t();"'
+# Cases for tokenize_nonstring_atom:
 
-@given(text(), text().filter(symbol_continues))
-def test_tokenize_symbol_continues(s, input):
+# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
+# comment delimiter, or a string literal: accumulate one character of input
+# onto the lookahead, then transition back to tokenize_nonstring_atom to process
+# the next character of input, without generating a token.
+@given(text(), from_regex(r'^[^ \n\t();"].*'))
+def test_tokenize_nonstring_atom_continues(s, input):
     port = ReadablePort(input)
-    token, lookahead, next = tokenize_symbol(s, port)
+    token, lookahead, next = tokenize_nonstring_atom(s, port)
 
     assert token is None
     assert port.tell() == 1
     assert lookahead == s + input[0]
-    assert next == tokenize_symbol
-
-@given(text(), text().filter(not_(symbol_continues)))
-def test_tokenize_symbol_ends(s, input):
+    assert next == tokenize_nonstring_atom
+
+# * any lookahead, a non-empty input beginning with whitespace, syntax, a
+# comment delimiter, or a string literal, and
+# * any lookahead, empty input:
+# generate the accumulated input as a single token, then transition back to tokenize_any with one character of lookahead ready to go.
+@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
+def test_tokenize_nonstring_atom_ends(s, input):
     port = ReadablePort(input)
-    token, lookahead, next = tokenize_symbol(s, port)
+    token, lookahead, next = tokenize_nonstring_atom(s, port)
 
-    assert token == Symbol(s)
-    assert isinstance(token, Symbol)
+    assert token == s
     assert port.tell() == (1 if input else 0)
     assert lookahead == (input[0] if input else '')
     assert next == tokenize_any
 
-def string_continues(text):
-    if text == '':
-        return False
-    return not text[0] == '"'
+# And now, the _worst_ part of the state machine. Cases for tokenize_string:
 
-@given(text(), text().filter(string_continues))
+# * any lookahead, a non-empty input not beginning with a string delimiter:
+# begin a non-empty string by transitioning to the tokenize_string_character
+# state with one character of lookahead, without generating a token.
+@given(text(), from_regex(r'^[^"].*')) def test_tokenize_string_continues(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string(s, port) assert token is None assert port.tell() == 1 - assert lookahead == input[0] + assert lookahead == s + input[0] assert next == tokenize_string_character -@given(text(), text().filter(not_(string_continues))) -def test_tokenize_string_ends(s, input): +# * any lookahad, a non-empty input beginning with a string delimiter: terminate +# an empty string by transitioning to the tokenize_string_end state with an +# *empty* lookahead, without generating a token. +@given(text(), from_regex(r'^["].*')) +def test_tokenize_string_empty(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_string(s, port) + + assert token is None + assert port.tell() == 1 + assert lookahead == s + input[0] + assert next == tokenize_string_end + +# * any lookahead, empty input: emit a tokenization error, as we've encountered +# EOF inside of a string. +@given(text(), just('')) +def test_tokenize_string_eof(s, input): try: port = ReadablePort(input) token, lookahead, next = tokenize_string(s, port) - assert token is None - assert port.tell() == 1 - assert lookahead == '' - assert next == tokenize_string_end + assert False # must raise except TokenError: - assert input == '' assert port.tell() == 0 -def is_escape(text): - if text == '': - return False - return text[0] == '\\' +# Cases for tokenize_string_character: -@given(text(), text().filter(string_continues).filter(not_(is_escape))) +# * any lookahead, any non-empty input not beginning with a string delimiter or +# escape character: append one character of input to the lookahead, then +# continue in the tokenize_string_character state without generating a token. +@given(text(), from_regex(r'^[^\\"].*')) def test_tokenize_string_character_continues(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string_character(s, port) @@ -138,56 +220,137 @@ def test_tokenize_string_character_continues(s, input): assert lookahead == s + input[0] assert next == tokenize_string_character -# Using from_regex() rather than text() because searching randomly for strings -# that start with a specific character is far, _far_ too slow. (It often fails -# to find any examples.) I _think_ this preserves the property that this group -# of three tests are exhaustive, but it's not as obvious as it would be if I -# could use text() here. -@given(text(), from_regex(r'\\.*').filter(string_continues).filter(is_escape)) +# * any lookahead, any non-empty input which begins with an escape character: +# leave the lookahead unchanged, but transition to the +# tokenize_escaped_string_character state to determine which escape character +# we're dealing with, without emitting a token. +@given(text(), from_regex(r'^[\\].*')) def test_tokenize_string_character_begins_escape(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string_character(s, port) assert token is None assert port.tell() == 1 - assert lookahead == s + assert lookahead == s + input[0] assert next == tokenize_escaped_string_character -@given(text(), text().filter(not_(string_continues))) +# * any lookahead, any non-empty input which begins with a string delimiter: +# we're at the end of a string. Transition to the tokenize_string_end state +# with the current lookahead, without generating a token. 
+@given(text(), from_regex(r'^["].*')) def test_tokenize_string_character_ends(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_string_character(s, port) + + assert token is None + assert port.tell() == 1 + assert lookahead == s + input[0] + assert next == tokenize_string_end + +# * any lookahead, empty input: emit a tokenization error, as we've encountered +# EOF inside of a string literal. +@given(text(), just('')) +def test_tokenize_string_character_eof(s, input): try: port = ReadablePort(input) token, lookahead, next = tokenize_string_character(s, port) - assert token is None - assert port.tell() == 1 - assert lookahead == s - assert next == tokenize_string_end + assert False # must raise except TokenError: assert input == '' assert port.tell() == 0 -@given(text(), text()) -def test_tokenize_escaped_string_character(s, input): +# Cases for tokenize_escaped_string: + +# * any lookahead, any non-empty input beginning with a legal string escaped +# character: de-escape the first character of the input, append the result to +# the lookahead, then transition back to the tokenize_string_character state. +@given(text(), from_regex(r'^["\\].*')) +def test_tokenize_escaped_string_character_valid(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_escaped_string_character(s, port) + + assert token is None + assert port.tell() == 1 + assert lookahead == s + input[0] + assert next == tokenize_string_character + +# * any lookahead, any non-empty input not beginning with a legal string escaped +# character: emit a tokenization error, we've found an invalid string escape. +@given(text(), from_regex(r'^[^"\\].*')) +def test_tokenize_escaped_string_character_invalid(s, input): try: port = ReadablePort(input) token, lookahead, next = tokenize_escaped_string_character(s, port) - assert token is None + assert False # must raise + except TokenError: assert port.tell() == 1 - assert lookahead == s + input[0] - assert next == tokenize_string_character + +# * any lookahead, empty input: emit a tokenization error, we've found an EOF +# inside of a string literal. +@given(text(), just('')) +def test_tokenize_escaped_string_character_eof(s, input): + try: + port = ReadablePort(input) + token, lookahead, next = tokenize_escaped_string_character(s, port) + + assert False # must raise except TokenError: - assert input == '' or input[0] not in '\\n' - assert port.tell() == (1 if input else 0) + assert port.tell() == 0 + +# Cases for tokenize_string_end: +# * any lookahead, any input: generate a String token from the lookahead, then +# transition back to the tokenize_any state with one character of lookahead +# ready to go. @given(text(), text()) def test_tokenize_string_end(s, input): port = ReadablePort(input) token, lookahead, next = tokenize_string_end(s, port) assert token == s - assert isinstance(token, String) assert port.tell() == (1 if input else 0) assert lookahead == (input[0] if input else '') assert next == tokenize_any + +# Cases for tokenize_atom: + +# * lookahead containing a string delimiter, any input: found a string atom, +# transition to the tokenize_string state without reading or generating a +# token. 
+@given(just('"'), text()) +def test_tokenize_atom_string(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_atom(s, port) + + assert token is None + assert port.tell() == 0 + assert lookahead == s + assert next == tokenize_string + +# * lookahead containing something other than a string delimiter, any input: +# found a nonstring atom, transition to the tokenize_nonstring_atom state +# without reading or generating a token. +@given(from_regex(r'^[^"]'), text()) +def test_tokenize_atom_nonstring(s, input): + port = ReadablePort(input) + token, lookahead, next = tokenize_atom(s, port) + + assert token is None + assert port.tell() == 0 + assert lookahead == s + assert next == tokenize_nonstring_atom + +# Cases for the tokenizer: + +# * any sequence of separator-token pairs: if the pairs are coalesced into a +# single giant input, does the tokenizer recover the tokens? +@given(spaced_token_sequences()) +def test_tokenizer(spaced_tokens): + input = ''.join(''.join(pair) for pair in spaced_tokens) + tokens = [token for (_, token) in spaced_tokens] + + port = ReadablePort(input) + + assert list(tokenize(port)) == tokens diff --git a/tests/tokens.py b/tests/tokens.py new file mode 100644 index 0000000..0027fb2 --- /dev/null +++ b/tests/tokens.py @@ -0,0 +1,90 @@ +from hypothesis.strategies import just, one_of, characters, text, lists, tuples +from hypothesis.strategies import composite, recursive + +# Generators for token families + +# Generates the `(` token. +def open_parens(): + return just('(') + +# Generates the ')' token. +def close_parens(): + return just(')') + +# Generates characters that are legal, unescaped, inside of a string. +def string_bare_characters(): + return characters(blacklist_characters='\\"') + +# Generates legal string escape sequences. +def string_escaped_characters(): + return one_of(just('"'), just('\\')).map(lambda c: '\\' + c) + +# Generates single-character string representations, including escapes. +def string_characters(): + return one_of(string_bare_characters(), string_escaped_characters()) + +# Generates arbitrary string bodies (strings, without leading or trailing +# quotes) +def string_body(): + return text(string_characters()) + +# Generates legal strings. +def strings(): + return tuples(just('"'), string_body(), just('"')).map(lambda t: ''.join(t)) + +# Generates characters which are legal within a symbol. +def symbol_characters(): + return characters(blacklist_characters=' \t\n();"') + +# Generates legal symbols. +def symbols(): + return text(symbol_characters(), min_size=1) + +# Generates single whitespace characters. +def whitespace_characters(): + return one_of(just('\n'), just(' '), just('\t')) + +# Generates a single token. +def tokens(): + return one_of(symbols(), strings(), open_parens(), close_parens()) + +# Generates at least one character of whitespace. +def whitespace(): + return text(whitespace_characters(), min_size=1) + +# Generates characters which can legally appear inside of a comment (anything +# but a newline). +def comment_characters(): + return characters(blacklist_characters='\n') + +# Generates a (possibly-empty) comment, terminated with a trailing newline. +def comments(): + return tuples(just(';'), text(comment_characters()), just('\n')).map(lambda t: ''.join(t)) + +# Generates sequences which can be inserted between arbitrary pairs of tokens +# without changing their meaning. 
+def intertokens():
+    return one_of(comments(), whitespace())
+
+# Generate a pair such that the second element is a token, and joining the
+# elements with an empty string produces a string that tokenizes to the second
+# element.
+def spaced_tokens():
+    def spaced(strategy):
+        return tuples(intertokens(), strategy)
+    def unspaced(strategy):
+        return tuples(one_of(just(''), intertokens()), strategy)
+    def spaced_symbols():
+        return spaced(symbols())
+    def spaced_strings():
+        return unspaced(strings())
+    def spaced_open_parens():
+        return unspaced(open_parens())
+    def spaced_close_parens():
+        return unspaced(close_parens())
+
+    return one_of(spaced_symbols(), spaced_strings(), spaced_open_parens(), spaced_close_parens())
+
+# Generates a list of pairs as per spaced_tokens().
+def spaced_token_sequences():
+    return lists(spaced_tokens())
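The round-trip property exercised by test_tokenizer can be spelled out by hand for one concrete case, which also illustrates what the generated (separator, token) pairs look like once coalesced. This is only an illustration, with io.StringIO standing in for ReadablePort:

    import io
    from actinide.tokenizer import tokenize

    pairs = [('', '('), ('; a comment\n', 'define'), (' ', 'x'), (' ', '"a\\"b"'), ('', ')')]
    source = ''.join(sep + tok for sep, tok in pairs)
    assert list(tokenize(io.StringIO(source))) == [tok for _, tok in pairs]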
