| Field | Value | Date |
|---|---|---|
| author | Owen Jacobson <owen@grimoire.ca> | 2017-11-11 01:51:06 -0500 |
| committer | Owen Jacobson <owen@grimoire.ca> | 2017-11-11 15:42:13 -0500 |
| commit | 16d94a6e50eb81de9d9d438e1cce0746928597f3 (patch) | |
| tree | e1cb628d34c49690128722a33cc1d19d7dcffb23 | |
| parent | e4fb8604aa2fc572a3aeeace1c32de7339d346b5 (diff) | |
Introduce input ports.
Ports are the lisp abstraction of files and streams. Actinide ports additionally guarantee a peek operation.
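
To illustrate the peek guarantee, here is a minimal sketch of the contract the new port API is meant to provide (the functions come from ``actinide/ports.py``, added in this commit; the sample string and assertions are the editor's illustration, not part of the commit):

```python
# Illustration only: the peek/read contract of the new input ports.
# peek() returns upcoming characters without consuming them, so a later
# read() still sees the same characters.
from actinide.ports import string_to_input_port, peek, read, read_fully

port = string_to_input_port('(foo)')
assert peek(port, 1) == '('    # look ahead without consuming
assert peek(port, 1) == '('    # peeking again sees the same character
assert read(port, 1) == '('    # the peeked character is still available to read
assert read_fully(port) == 'foo)'
```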
This makes ``tokenize`` (now ``read_token``) callable as a lisp function: it
takes a port and reads one token from it. This is a substantial refactoring.
Because most of the tokenizer's state is now captured in closures, individual
states can no longer be tested as readily in isolation; however, the top-level
tokenizer tests exercise the full state space.
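
As a quick sketch of the new calling convention (based on the code added in this diff; the sample input and the expected output in the comment are the editor's illustration): ``read_token`` returns one token per call and ``None`` once the port is exhausted.

```python
# Illustration only: driving read_token by hand. Each call returns the next
# token from the port, or None once the input is exhausted.
from actinide.ports import string_to_input_port
from actinide.tokenizer import read_token

port = string_to_input_port('(define x "hi") ; trailing comment')
tokens = []
token = read_token(port)
while token is not None:
    tokens.append(token)
    token = read_token(port)

# Expected: ['(', 'define', 'x', '"hi"', ')'] -- string tokens keep their
# quotes; whitespace and comments never produce tokens.
print(tokens)
```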
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | README.rst | 2 |
| -rw-r--r-- | actinide/ports.py | 54 |
| -rw-r--r-- | actinide/tokenizer.py | 283 |
| -rwxr-xr-x | bin/actinide-repl | 6 |
| -rw-r--r-- | tests/test_ports.py | 29 |
| -rw-r--r-- | tests/test_tokenizer.py | 360 |
| -rw-r--r-- | tests/tokens.py | 4 |
7 files changed, 256 insertions, 482 deletions
diff --git a/README.rst b/README.rst
@@ -67,7 +67,7 @@ Freestanding REPL
 *****************
 
 **Note: this section is presently incorrect - the ``actinide-repl`` command
-instead contains a test harness for the tokenizer.**
+doesn't exist.**
 
 The Actinide interpreter can be started interactively using the
 ``actinide-repl`` command. In this mode, Actinide forms can be entered
diff --git a/actinide/ports.py b/actinide/ports.py
new file mode 100644
index 0000000..7748cd7
--- /dev/null
+++ b/actinide/ports.py
@@ -0,0 +1,54 @@
+import io
+
+# ## PORTS
+#
+# A port is a handle which characters can either be read from (an "input port")
+# or written to (an "output port").
+#
+# Actinide uses a very limited subset of the full Scheme ports system, and does
+# not support the creation of most kinds of port at runtime.
+
+# A port. Under the hood, this wraps a Python file-like object in character
+# mode, and guarantees support for peek and other operations needed by the
+# Actinide runtime.
+class Port(object):
+    def __init__(self, file):
+        self.file = file
+        self.peek_buffer = ''
+
+    # Read up to ``n`` bytes from the port without consuming them.
+    def peek(self, n):
+        if not self.peek_buffer:
+            self.peek_buffer = self.file.read(n)
+        return self.peek_buffer
+
+    # Read up to ``n`` bytes from the port, consuming them.
+    def read(self, n):
+        if self.peek_buffer:
+            read_result, self.peek_buffer = self.peek_buffer[:n], self.peek_buffer[n:]
+            return read_result
+        return self.file.read(n)
+
+    # Read all remaining input, consuming it.
+    def read_fully(self):
+        return self.peek_buffer + self.file.read()
+
+# Read at least 1 and up to ``n`` characters from a port. This consumes them
+# from the port: they are no longer available to future peeks or reads. ``n``
+# must be strictly positive.
+def read(port, n):
+    return port.read(n)
+
+# Read all remaining input from a port, consuming it.
+def read_fully(port):
+    return port.read_fully()
+
+# Read at least 1 and up to ``n`` characters from a port, without consuming
+# them. They will be available on future peeks and reads. ``n`` must be strictly
+# positive.
+def peek(port, n):
+    return port.peek(n)
+
+# Create an input port from a string.
+def string_to_input_port(string):
+    return Port(io.StringIO(string))
diff --git a/actinide/tokenizer.py b/actinide/tokenizer.py
index 9767033..a69d35d 100644
--- a/actinide/tokenizer.py
+++ b/actinide/tokenizer.py
@@ -1,3 +1,5 @@
+from .ports import read, peek
+
 # ## TOKENIZATION
 #
 # The following code implements a state-machine-driven tokenizer which can
@@ -21,18 +23,14 @@
 # special literals. (Strings are, technically, a kind of atom, but the lexer
 # treats them specially due to their complexity.)
 #
-# Internally, the tokenizer is a state machine which maintains two pieces of
-# state: the "lookahead" (holding data to feed to the next state transition
-# function) and the next state transition function. The top-level ``tokenize``
-# function acts as a trampoline, repeatedly calling ``next`` until input is
-# exhausted, yielding tokens any time ``next`` includes a token in its return
-# value.
+# Internally, the tokenizer is a state machine where each state (``next``) is a
+# function taking a port as input. The top-level ``tokenize`` function acts as a
+# trampoline, repeatedly calling ``next`` until input is exhausted or until
+# ``next`` includes a token in its return value.
 #
-# The various ``next`` functions take the current lookahead and the port,
-# perform whatever logic is needed (including, potentially, reading from the
-# port) to determine the next state, and return a 3-tuple of ``token`` (may be
-# ``None``), ``lookahead`` (which replaces the previous lookahead), and ``next``
-# (the new next state transition function).
+# The various ``next`` functions take the the port, perform whatever logic is
+# needed to determine the next state, and return a 2-tuple of ``token`` (may be
+# ``None``) and ``next`` (the new next state transition function).
 #
 # This is heavily inspired by various tail-recursive approaches to tokenizing
 # lisp streams. However, the host language does not guarantee tail call
@@ -46,50 +44,47 @@ class TokenError(Exception):
     '''
     pass
 
-# Tokenize a port, producing a generator that yields successive tokens as it's
-# advanced.
+# Read one token from a port.
 #
 # This is the top-level driver for the state machine that divides the underlying
-# input into tokens. It does no input handling itself, other than reading the
-# first character of the port: this calls the next state transition function to
-# determine what to do and how to change the lookahead.
+# input into tokens. It does no input handling itself: it calls the next state
+# transition function to determine what to do and how to change the lookahead,
+# and relies on that function to perform any necessary input on the port.
 #
 # Initially, this is in the ``tokenize_any`` state, and exits once it reaches
-# the ``tokenize_eof`` state.
-def tokenize(port):
-    lookahead, next = port.read(1), tokenize_any
+# the ``tokenize_eof`` state or once it reads a complete token.
+#
+# This never reads past the end of the current token, relying on ``peek`` to
+# determine whether it should continue reading from the port.
+def read_token(port):
+    next = tokenize_any
     while next != tokenize_eof:
-        token, lookahead, next = next(lookahead, port)
+        token, next = next(port)
         if token is not None:
-            yield token
+            return token
 
-# If the lookahead is exactly one character, this will correctly determine the
-# next token type and transition to that state without consuming input. This is
-# generally the correct state to transition to any time the next token is
-# unknown - for example, at the end of another token.
+# Looks ahead one character in the port to determine what kind of token appears
+# next in the port. This is an appropriate state to transition to at any time
+# when the next token is not known, such as at the end of a token.
 #
 # This never produces a token directly. It can transition to the tokenizer state
-# for any token type, as well as to the trap state for EOF.
-def tokenize_any(lookahead, port):
+# for any token type, for any non-token type, or the trap state for EOF.
+def tokenize_any(port):
+    lookahead = peek_next(port)
     if lookahead == '':
-        return None, lookahead, tokenize_eof
+        return None, tokenize_eof
     if lookahead == ';':
-        return None, lookahead, tokenize_comment
+        return None, tokenize_comment
     if lookahead in '()':
-        return None, lookahead, tokenize_syntax
+        return None, tokenize_syntax
     if lookahead in ' \t\n':
-        return None, lookahead, tokenize_whitespace
-    return None, lookahead, tokenize_atom
+        return None, tokenize_whitespace
+    return None, tokenize_atom
 
-# Special trap state. This never produces a token, and always transitions to
-# itself. The lookahead in this state is generally ``''``, and since this never
-# performs any further reads, it will remain that value indefinitely.
-#
-# The top-level parser exits in this situation by examining ``lookahead``, but
-# it's possible to reach this state from string literal tokenization or after a
-# comment.
-def tokenize_eof(lookahead, port):
-    return None, lookahead, tokenize_eof
+# EOF trap state This never produces a token, and always transitions to itself
+# without reading any input. The tokenizer cannot exit this state.
+def tokenize_eof(port):
+    return None, tokenize_eof
 
 # Consumes one character at a time until it finds an end of line or runs out of
 # input. This throws away comments entirely, at tokenization time, without
@@ -98,123 +93,135 @@ def tokenize_eof(lookahead, port):
 # port, but never more than one character at a time.
 #
 # This never produces a token.
-def tokenize_comment(lookahead, port):
-    next = port.read(1)
+
+# Consumes one character and throws it away, transitioning back to tokenize_any
+# once it encounters either an end of line or the end of the input. This
+# consumes commments, and as it never generates a token, discards them.
+def tokenize_comment(port):
+    next = read_next(port)
     if next == '':
-        return None, next, tokenize_any
+        return None, tokenize_any
    if next == '\n':
-        return None, next, tokenize_any
-    return None, next, tokenize_comment
-
-# Generates the entire lookahead as a token. This is generally appropriate for
-# the ``(`` and ``)`` syntactic elements.
-#
-# The resulting lookahead will be the next character of input, and this always
-# dispatches back to ``tokenize_any`` so that the next token (if any) can be
-# determined.
-def tokenize_syntax(lookahead, port):
-    return lookahead, port.read(1), tokenize_any
-
-# Consumes and ignores one character of input. This never produces a token, and
-# throws away the lookahead entirely. The resulting lookahead is the next
-# character of input.
-def tokenize_whitespace(lookahead, port):
-    return None, port.read(1), tokenize_any
-
-# We've ruled out all non-atom tokens. If the lookahead is a string delimiter,
-# transitions to a state which tokenizes a single string literal; otherwise,
-# transitions to a state which consumes a single non-string atom. In both cases,
-# this leaves the lookahead alone, and generates no token.
-def tokenize_atom(lookahead, port):
+        return None, tokenize_any
+    return None, tokenize_comment
+
+# Consumes one character, returning it as a token, before transitioning back to
+# the ``tokenize_any`` state. This correctly tokenizes the ``(`` and ``)``
+# tokens if they are at the front of the port.
+def tokenize_syntax(port):
+    return read_next(port), tokenize_any
+
+# Consumes and ignores one character of input. This never produces a token. This
+# is appropriate for discarding whitespace in the port.
+def tokenize_whitespace(port):
+    read_next(port)
+    return None, tokenize_any
+
+# Looks ahead one character into the port to determine which kind of atom to
+# tokenize: if the input begins with a quote, tokenize a string literal;
+# otherwise, tokenize a non-string atom such as a symbol or numeric literal.
+# This never generates a token directly.
+def tokenize_atom(port):
+    lookahead = peek_next(port)
     if lookahead == '"':
-        return None, lookahead, tokenize_string
-    return None, lookahead, tokenize_nonstring_atom
-
-# Consumes characters until it finds a character which cannot be part of a
-# non-string atom, or until it finds the end of input, accumulating them into a
-# single token. This is a heavily-overloaded token category, as it contains not
-# only Actinide symbols but also all non-String literals.
-#
-# While the tokenizer remains in this state, the lookahead accumulates the
-# characters of the token. When this matches a completed token, it produces a
-# Symbol token, and resets the lookahead back to a single read result containing
-# the next character of input.
-def tokenize_nonstring_atom(lookahead, port):
-    next = port.read(1)
-    if next == '':
-        return lookahead, next, tokenize_any
-    if next in '"(); \t\n':
-        return lookahead, next, tokenize_any
-    return None, lookahead + next, tokenize_nonstring_atom
+        return None, tokenize_string
+    return None, tokenize_nonstring_atom('')
+
+# A state factory returning states that build non-string atoms. The resulting
+# state family consumes characters until it finds a character which cannot be
+# part of a non-string atom, or until it finds the end of input, accumulating
+# them into a single token. When either of those cases arise, the resulting
+# state generates the accumulated token and returns to the ``tokenize_any``
+# state to prepare for the next token.
+def tokenize_nonstring_atom(state):
+    def tokenize_nonstring_atom_next(port):
+        next = peek_next(port)
+        if next == '':
+            return state, tokenize_any
+        if next in '"(); \t\n':
+            return state, tokenize_any
+        return None, tokenize_nonstring_atom(state + read_next(port))
+    return tokenize_nonstring_atom_next
 
 # ### STRINGS
 #
-# The following family of states handles string literals in the input stream.
-# String literals are fairly simple: they begin with a quote, contain arbitrary
-# characters other than a bare \ or ", and end with a quote. (Note that ``\n``
-# is not an escape sequence: unescaped newlines are permitted within string
-# literals.)
+# The following family of states and state factories handles string literals in
+# the input stream. String literals are fairly simple: they begin with a quote,
+# contain arbitrary characters other than a bare \ or ", and end with a quote.
+# (Note that ``\n`` is not an escape sequence: unescaped newlines are permitted
+# within string literals.)
 #
-# These states use the lookahead to accumulate the characters of the string. On
-# transition back to ``tokenize_any``, the lookahead is always set back to a
-# single character. If, at any point, these states encounter EOF, they raise a
-# ``TokenError``: no legal token in Actinide begins with a quote mark and ends
-# with EOF.
+# These states accumulate the characters of the string. On transition back to
+# ``tokenize_any``, the accumulated characters are returned as a token. If, at
+# any point, these states encounter EOF or an invalid escape sequence, they
+# raise a ``TokenError``: no legal token in Actinide begins with a quote mark
+# and ends with EOF, and no legal token includes an invalid escape sequence.
 #
 # Because tokenization is only concerned with dividing the input into tokens,
 # this machine *does not* strip quotes or replace escape sequences. On success,
 # it generates a token containing the whole the string literal, verbatim.
 
-# The lookahead is assumed to be the opening quote of a string, and discarded.
-# Read forwards one character to determine whether this is an empty string
-# literal or not, then proceed either to ``tokenize_string_end`` for an empty
-# string, or to ``tokenize_string_character`` for a non-empty string.
+# Reads the first character of a string literal, and looks ahead one character
+# to determine how the string proceeds so that it can transition to an
+# appropriate state.
 #
-# This never yields a token. The lookahead is set to the characters of the
+# This never generates a token. The lookahead is set to the characters of the
 # string read so far.
-def tokenize_string(lookahead, port):
-    next = port.read(1)
+def tokenize_string(port):
+    quote = read_next(port)
+    next = peek_next(port)
     if next == '':
         raise TokenError('Unclosed string literal')
     if next == '\\':
-        return None, lookahead + next, tokenize_escaped_string_character
+        return None, tokenize_escaped_string_character(quote + read_next(port))
     if next == '"':
-        return None, lookahead + next, tokenize_string_end
-    return None, lookahead + next, tokenize_string_character
+        return None, tokenize_string_end(quote)
+    return None, tokenize_string_character(quote)
 
-# The lookahead contains the body of the string read so far. Reads forwards one
-# character to determine if the string continues, contains an escaped character,
-# or ends.
+# A state factory returning states which accumulate string characters. The
+# returned states look ahead one character to determine how to proceed, and read
+# one token under most circumstances.
 #
 # This never yields a token.
-def tokenize_string_character(lookahead, port):
-    next = port.read(1)
-    if next == '':
-        raise TokenError('Unclosed string literal')
-    if next == '\\':
-        return None, lookahead + next, tokenize_escaped_string_character
-    if next == '"':
-        return None, lookahead + next, tokenize_string_end
-    return None, lookahead + next, tokenize_string_character
+def tokenize_string_character(state):
+    def tokenize_string_character_next(port):
+        next = peek_next(port)
+        if next == '':
+            raise TokenError('Unclosed string literal')
+        if next == '\\':
+            return None, tokenize_escaped_string_character(state + read_next(port))
+        if next == '"':
+            return None, tokenize_string_end(state)
+        return None, tokenize_string_character(state + read_next(port))
+    return tokenize_string_character_next
 
-# The lookahead contains the body of the string so far. Reads forwards one
-# character to determine which, if any, escaped character to process: if it's
-# one we recognize, append it to the string, otherwise raise a TokenError.
+# A state factory returning states which only recognize valid string escaped
+# characters (``\\`` and ``"``). If they encounter a valid character, they
+# accumulate it onto the string being read and continue reading the string;
+# otherwise, they reject the string by raising a TokenError.
 #
-# This never yields a token, and always dispatches back to
-# ``tokenize_string_character`` on a legal escape character.
-def tokenize_escaped_string_character(lookahead, port):
-    next = port.read(1)
-    if next == '':
-        raise TokenError('Unclosed string literal')
-    if next == '"':
-        return None, lookahead + next, tokenize_string_character
-    if next == '\\':
-        return None, lookahead + next, tokenize_string_character
-    raise TokenError(f"Invalid string escape '\\{next}'")
-
-# Package the lookahead (the full string body, de-escaped and without leading
-# and trailing quotes) up as a String token and return it, then transition back
-# to the ``tokenize_any`` state with a single read result in the lookahead.
-def tokenize_string_end(lookahead, port):
-    return lookahead, port.read(1), tokenize_any
+# This never yields a token.
+def tokenize_escaped_string_character(state):
+    def tokenize_escaped_string_character_next(port):
+        next = read_next(port)
+        print(f'Esc: state={repr(state)} next={repr(next)} peek={repr(peek_next(port))}')
+        if next == '':
+            raise TokenError('Unclosed string literal')
+        if next in '\\"':
+            return None, tokenize_string_character(state + next)
+        raise TokenError(f"Invalid string escape '\\{next}'")
+    return tokenize_escaped_string_character_next
+
+# A state factory which terminates a string literal. These states read off the
+# closing quote mark, and generates the accumulated string as a token before
+# transitioning back to the ``tokenize_any`` state.
+def tokenize_string_end(state):
+    def tokenize_string_end_next(port):
+        return state + read_next(port), tokenize_any
+    return tokenize_string_end_next
+
+def read_next(port):
+    return read(port, 1)
+
+def peek_next(port):
+    return peek(port, 1)
diff --git a/bin/actinide-repl b/bin/actinide-repl
deleted file mode 100755
index 7909d36..0000000
--- a/bin/actinide-repl
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import actinide.tokenizer as at
-
-print(repr(list(at.tokenize(sys.stdin))))
diff --git a/tests/test_ports.py b/tests/test_ports.py
new file mode 100644
index 0000000..c2d1e06
--- /dev/null
+++ b/tests/test_ports.py
@@ -0,0 +1,29 @@
+from hypothesis import given
+from hypothesis.strategies import integers, text
+
+from actinide.ports import *
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_read(input, n):
+    port = string_to_input_port(input)
+    output = read(port, n)
+
+    assert input.startswith(output)
+    assert (len(output) == 0 and len(input) == 0) != (0 < len(output) <= n)
+    assert output + read_fully(port) == input
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_peek(input, n):
+    port = string_to_input_port(input)
+    output = peek(port, n)
+
+    assert input.startswith(output)
+    assert (len(output) == 0 and len(input) == 0) != (0 < len(output) <= n)
+    assert read_fully(port) == input
+
+@given(text(), integers(min_value=1, max_value=2**32 - 1))
+def test_read_fully(input, n):
+    port = string_to_input_port(input)
+    output = read_fully(port)
+
+    assert output == input
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 76a07a9..5c0ddea 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -3,354 +3,40 @@ from hypothesis.strategies import just, text, characters, from_regex, one_of, tu
 import io
 
 from actinide.tokenizer import *
+from actinide.ports import *
 
-from .tokens import spaced_token_sequences
+from .tokens import spaced_token_sequences, tokens, nontokens
 
-class ReadablePort(io.StringIO):
-    def __repr__(self):
-        # Slightly friendlier debugging output
-        return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
-
-# Many of the following tests proceed by cases, because the underlying behaviour
-# is too complex to treat as a uniform set of properties. The cases are meant to
-# be total, and in principle could be defined as a set of filters on the
-# ``text()`` generator that , combined, exhaust the possible outcomes of that
-# generator.
-#
-# Implementing the tests that way causes Hypothesis to generate a significant
-# number of examples that it then throws away without verifying, because
-# Hypothesis has no insight into filters to use when generating examples.
-# Instead, this test suite specifies generators per-case.
-
-# Cases for tokenize_any:
-
-# We test this a bit differently from the subsequent tokenizer states. Because
-# it's a pure routing state, we can generate lookahead, expected_state pairs and
-# check them in one pass, rather than testing each possible outcome separately.
-# In every case, the input is irrelevant: this state never reads.
-
-def next_token_states():
-    return one_of(
-        tuples(just(''), just(tokenize_eof)),
-        tuples(just(';'), just(tokenize_comment)),
-        tuples(sampled_from('()'), just(tokenize_syntax)),
-        tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
-        tuples(just('"'), just(tokenize_atom)),
-        tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
-    )
-
-@given(next_token_states(), text())
-def test_tokenize_any(lookahead_next, input):
-    s, expected_state = lookahead_next
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_any(s, input)
-
-    assert token is None
-    assert lookahead == s
-    assert next == expected_state
-    assert port.tell() == 0
-
-# Since the previous test case is rigged for success, also verify that no input
-# causes tokenize_any to enter an unexpected state or to throw an exception.
-@given(text(), text())
-def test_tokenize_any_fuzz(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_any(s, input)
-
-    assert token is None
-    assert lookahead == s
-    assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
-    assert port.tell() == 0
-
-# Cases for tokenize_eof:
-
-# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
-# always returning to itself, and never generating a token.
-@given(text(), text())
-def test_tokenize_eof(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_eof(s, port)
-
-    assert token is None
-    assert lookahead == s
-    assert next == tokenize_eof
-    assert port.tell() == 0
-
-# Cases for tokenize_comment:
-
-# * any lookahead, one or more characters beginning with a non-newline as input:
-# tokenize_comment continues the current comment, throwing away one character
-# of input, without generating a token.
-@given(text(), from_regex(r'^[^\n].*'))
-def test_tokenize_comment_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_comment(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == input[0]
-    assert next == tokenize_comment
-
-# * any lookahead, one or more characters beginning with a newline as input, and
-# * any lookahead, empty input:
-# tokenize_comment concludes the current comment and prepares for the next
-# token, without generating a token.
-@given(text(), just('') | from_regex(r'^\n.*'))
-def test_tokenize_comment_ends(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_comment(s, port)
-
-    assert token is None
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for tokenize_syntax:
-
-# * any lookahead, any input: generate the lookahead as a Syntax token and
-# transition back to tokenize_any to prepare for the next token, with one
-# character of lookahead ready to go.
-@given(text(), text())
-def test_tokenize_syntax(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_syntax(s, port)
-
-    assert token == s
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for test_tokenize_whitespace:
-
-# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
-# transition back to tokenize_any to prepare for the next token, with one
-# character of lookahead ready to go, without generating a token.
-@given(text(), text())
-def test_tokenize_whitespace(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_whitespace(s, port)
-
-    assert token is None
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for tokenize_nonstring_atom:
-
-# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
-# comment delimiter, or a string literal: accumulate one character of input
-# onto the lookahead, then transition back to tokenize_symbol to process the
-# next character of input, without generating a token.
-@given(text(), from_regex(r'^[^ \n\t();"].*'))
-def test_tokenize_nonstring_atom_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_nonstring_atom(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_nonstring_atom
-
-# * any lookahead, a non-empty input beginning with whitespace, syntax, a
-# comment delimiter, or a string literal, and
-# * any lookahead, empty input:
-# generate the accumulated input as a Symbol token, then transition back to tokenize_any with one character of lookahead ready to go.
-@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
-def test_tokenize_tokenize_nonstring_atom_ends(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_nonstring_atom(s, port)
-
-    assert token == s
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# And now, the _worst_ part of the state machine. Cases for tokenize_string:
-
-# * any lookahead, a non-empty input not beginning with a string delimiter:
-# begin a non-empty string by transitioning to the tokenize_string_character
-# state with one character of lookahead, without generating a token.
-@given(text(), from_regex(r'^[^"].*'))
-def test_tokenize_string_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_character
-
-# * any lookahad, a non-empty input beginning with a string delimiter: terminate
-# an empty string by transitioning to the tokenize_string_end state with an
-# *empty* lookahead, without generating a token.
-@given(text(), from_regex(r'^["].*'))
-def test_tokenize_string_empty(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_end
-
-# * any lookahead, empty input: emit a tokenization error, as we've encountered
-# EOF inside of a string.
-@given(text(), just(''))
-def test_tokenize_string_eof(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_string(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert port.tell() == 0
-
-# Cases for tokenize_string_character:
-
-# * any lookahead, any non-empty input not beginning with a string delimiter or
-# escape character: append one character of input to the lookahead, then
-# continue in the tokenize_string_character state without generating a token.
-@given(text(), from_regex(r'^[^\\"].*'))
-def test_tokenize_string_character_continues(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_character
-
-# * any lookahead, any non-empty input which begins with an escape character:
-# leave the lookahead unchanged, but transition to the
-# tokenize_escaped_string_character state to determine which escape character
-# we're dealing with, without emitting a token.
-@given(text(), from_regex(r'^[\\].*'))
-def test_tokenize_string_character_begins_escape(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_escaped_string_character
-
-# * any lookahead, any non-empty input which begins with a string delimiter:
-# we're at the end of a string. Transition to the tokenize_string_end state
-# with the current lookahead, without generating a token.
-@given(text(), from_regex(r'^["].*'))
-def test_tokenize_string_character_ends(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_end
-
-# * any lookahead, empty input: emit a tokenization error, as we've encountered
-# EOF inside of a string literal.
-@given(text(), just(''))
-def test_tokenize_string_character_eof(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_string_character(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert input == ''
-        assert port.tell() == 0
-
-# Cases for tokenize_escaped_string:
-
-# * any lookahead, any non-empty input beginning with a legal string escaped
-# character: de-escape the first character of the input, append the result to
-# the lookahead, then transition back to the tokenize_string_character state.
-@given(text(), from_regex(r'^["\\].*'))
-def test_tokenize_escaped_string_character_valid(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_escaped_string_character(s, port)
-
-    assert token is None
-    assert port.tell() == 1
-    assert lookahead == s + input[0]
-    assert next == tokenize_string_character
-
-# * any lookahead, any non-empty input not beginning with a legal string escaped
-# character: emit a tokenization error, we've found an invalid string escape.
-@given(text(), from_regex(r'^[^"\\].*'))
-def test_tokenize_escaped_string_character_invalid(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_escaped_string_character(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert port.tell() == 1
-
-# * any lookahead, empty input: emit a tokenization error, we've found an EOF
-# inside of a string literal.
-@given(text(), just(''))
-def test_tokenize_escaped_string_character_eof(s, input):
-    try:
-        port = ReadablePort(input)
-        token, lookahead, next = tokenize_escaped_string_character(s, port)
-
-        assert False # must raise
-    except TokenError:
-        assert port.tell() == 0
-
-# Cases for tokenize_string_end:
-
-# * any lookahead, any input: generate a String token from the lookahead, then
-# transition back to the tokenize_any state with one character of lookahead
-# ready to go.
-@given(text(), text())
-def test_tokenize_string_end(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_string_end(s, port)
-
-    assert token == s
-    assert port.tell() == (1 if input else 0)
-    assert lookahead == (input[0] if input else '')
-    assert next == tokenize_any
-
-# Cases for tokenize_atom:
-
-# * lookahead containing a string delimiter, any input: found a string atom,
-# transition to the tokenize_string state without reading or generating a
-# token.
-@given(just('"'), text())
-def test_tokenize_atom_string(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_atom(s, port)
+# Cases for the tokenizer:
 
-    assert token is None
-    assert port.tell() == 0
-    assert lookahead == s
-    assert next == tokenize_string
+# * any single token: reads back that token.
+@given(tokens())
+def test_tokenizer_single_token(input):
+    port = string_to_input_port(input)
 
-# * lookahead containing something other than a string delimiter, any input:
-# found a nonstring atom, transition to the tokenize_nonstring_atom state
-# without reading or generating a token.
-@given(from_regex(r'^[^"]'), text())
-def test_tokenize_atom_nonstring(s, input):
-    port = ReadablePort(input)
-    token, lookahead, next = tokenize_atom(s, port)
+    assert read_token(port) == input
 
-    assert token is None
-    assert port.tell() == 0
-    assert lookahead == s
-    assert next == tokenize_nonstring_atom
+# * any input guaranteed not to contain a token: reads back None, consuming the
+# whole input in the process.
+@given(nontokens())
+def test_tokenizer_no_token(input):
+    port = string_to_input_port(input)
 
-# Cases for the tokenizer:
+    assert read_token(port) == None
 
 # * any sequence of separator-token pairs: if the pairs are coalesced into a
 # single giant input, does the tokenizer recover the tokens?
 @given(spaced_token_sequences())
-def test_tokenizer(spaced_tokens):
+def test_tokenizer_spaced_sequence(spaced_tokens):
     input = ''.join(''.join(pair) for pair in spaced_tokens)
     tokens = [token for (_, token) in spaced_tokens]
-    port = ReadablePort(input)
+    port = string_to_input_port(input)
+    def iterate_read_token(port):
+        token = read_token(port)
+        while token is not None:
+            yield token
+            token = read_token(port)
+
+    assert list(iterate_read_token(port)) == tokens
 
-    assert list(tokenize(port)) == tokens
diff --git a/tests/tokens.py b/tests/tokens.py
index 0027fb2..3eb58b8 100644
--- a/tests/tokens.py
+++ b/tests/tokens.py
@@ -48,6 +48,10 @@ def whitespace_characters():
 def tokens():
     return one_of(symbols(), strings(), open_parens(), close_parens())
 
+# Generates a string which may not be empty, but which does not contain a token.
+def nontokens():
+    return one_of(whitespace(), comments(), just(''))
+
 # Generates at least one character of whitespace.
 def whitespace():
     return text(whitespace_characters(), min_size=1)
