-rw-r--r--  actinide/tokenizer.py    114
-rw-r--r--  actinide/types.py         12
-rw-r--r--  tests/__init__.py          0
-rw-r--r--  tests/test_tokenizer.py  291
-rw-r--r--  tests/tokens.py           90
5 files changed, 382 insertions, 125 deletions
diff --git a/actinide/tokenizer.py b/actinide/tokenizer.py
index 8fb9d0a..9767033 100644
--- a/actinide/tokenizer.py
+++ b/actinide/tokenizer.py
@@ -1,5 +1,3 @@
-from .types import *
-
# ## TOKENIZATION
#
# The following code implements a state-machine-driven tokenizer which can
@@ -8,19 +6,20 @@ from .types import *
#
# * Comments: ``;`` followed by all bytes to EOF or to the end of the line.
#
-# * Strings: ``"`` through to the next unescaped ``"`` are read, de-escaped, and
-# returned. The sequences ``\"`` and ``\\`` are treated specially: the former
-# de-escapes to ``"``, and the latter to ``\``. An unclosed string literal or
-# an unknown escape sequence is a tokenization error.
-#
# * Open and close parens: ``(`` and ``)`` are returned as freestanding tokens.
#
# * Whitespace: Space, horizontal tab, and newline characters are discarded
# during tokenization.
#
-# * Symbols: Any sequence of characters not included in one of the above classes
+# * Strings: ``"`` through to the next unescaped ``"`` are read and returned.
+#   A ``\`` within the string begins an escape sequence, and may only be
+#   followed by ``"`` or ``\``. An unclosed string literal or an unknown escape
+#   sequence is a tokenization error.
+#
+# * Atoms: Any sequence of characters not included in one of the above classes
# is read and returned as a single token. This includes words, numbers, and
-# special literals.
+#   special literals. (Strings are, technically, a kind of atom, but the
+#   tokenizer treats them specially due to their complexity.)
#
# Internally, the tokenizer is a state machine which maintains two pieces of
# state: the "lookahead" (holding data to feed to the next state transition
@@ -34,6 +33,11 @@ from .types import *
# port) to determine the next state, and return a 3-tuple of ``token`` (may be
# ``None``), ``lookahead`` (which replaces the previous lookahead), and ``next``
# (the new next state transition function).
+#
+# This is heavily inspired by various tail-recursive approaches to tokenizing
+# Lisp streams. However, the host language does not guarantee tail-call
+# optimization, so we use an explicit trampoline function to drive the state
+# machine instead of having each state call the next directly.
class TokenError(Exception):
'''
@@ -47,20 +51,20 @@ class TokenError(Exception):
#
# This is the top-level driver for the state machine that divides the underlying
# input into tokens. It does no input handling itself, other than reading the
-# first character of the port: so long as the lookahead is non-empty, this calls
-# the next state transition function to determine what to do and how to change
-# the lookahead.
+# first character of the port: this calls the next state transition function to
+# determine what to do and how to change the lookahead.
#
-# Initially, this is in the ``tokenize_any`` state.
+# Initially, this is in the ``tokenize_any`` state, and exits once it reaches
+# the ``tokenize_eof`` state.
def tokenize(port):
lookahead, next = port.read(1), tokenize_any
- while len(lookahead) > 0:
+ while next != tokenize_eof:
token, lookahead, next = next(lookahead, port)
if token is not None:
yield token
-# If the lookahead is exactly one read result, this will correctly determine the
-# next token type and return that state without consuming input. This is
+# If the lookahead is exactly one character, this will correctly determine the
+# next token type and transition to that state without consuming input. This is
# generally the correct state to transition to any time the next token is
# unknown - for example, at the end of another token.
#
@@ -75,9 +79,7 @@ def tokenize_any(lookahead, port):
return None, lookahead, tokenize_syntax
if lookahead in ' \t\n':
return None, lookahead, tokenize_whitespace
- if lookahead == '"':
- return None, lookahead, tokenize_string
- return None, lookahead, tokenize_symbol
+ return None, lookahead, tokenize_atom
# Special trap state. This never produces a token, and always transitions to
# itself. The lookahead in this state is generally ``''``, and since this never
@@ -89,8 +91,8 @@ def tokenize_any(lookahead, port):
def tokenize_eof(lookahead, port):
return None, lookahead, tokenize_eof
-# Consumes one read result at a time until it finds an end of line or runs out
-# of input. This throws away comments entirely, at parse time, without
+# Consumes one character at a time until it finds an end of line or runs out of
+# input. This throws away comments entirely, at tokenization time, without
# considering whether the comment content can be separated into tokens. As this
# scans the comment, the lookahead will be set to successive characters from the
# port, but never more than one character at a time.
@@ -104,51 +106,64 @@ def tokenize_comment(lookahead, port):
return None, next, tokenize_any
return None, next, tokenize_comment
-# Consumes the lookahead and packages it up as a Syntax token. This is generally
-# appropriate for the ``(`` and ``)`` syntactic elements.
+# Generates the entire lookahead as a token. This is generally appropriate for
+# the ``(`` and ``)`` syntactic elements.
#
# The resulting lookahead will be the next character of input, and this always
# dispatches back to ``tokenize_any`` so that the next token (if any) can be
# determined.
def tokenize_syntax(lookahead, port):
- return Syntax(lookahead), port.read(1), tokenize_any
+ return lookahead, port.read(1), tokenize_any
-# Consumes and ignores whitespace in the input. This never produces a token, and
+# Consumes and ignores one character of input. This never produces a token, and
# throws away the lookahead entirely. The resulting lookahead is the next
# character of input.
def tokenize_whitespace(lookahead, port):
return None, port.read(1), tokenize_any
-# Consumes characters until it finds a character which cannot be part of a token
-# or until it finds the end of input, accumulating them into a single Symbol
-# token. This is a heavily-overloaded token category, as it contains not only
-# Actinide symbols but also all non-String literals.
+# We've ruled out all non-atom tokens. If the lookahead is a string delimiter,
+# transitions to a state which tokenizes a single string literal; otherwise,
+# transitions to a state which consumes a single non-string atom. In both cases,
+# this leaves the lookahead alone, and generates no token.
+def tokenize_atom(lookahead, port):
+ if lookahead == '"':
+ return None, lookahead, tokenize_string
+ return None, lookahead, tokenize_nonstring_atom
+
+# Consumes characters until it finds a character which cannot be part of a
+# non-string atom, or until it finds the end of input, accumulating them into a
+# single token. This is a heavily-overloaded token category, as it contains not
+# only Actinide symbols but also all non-String literals.
#
# While the tokenizer remains in this state, the lookahead accumulates the
-# characters of the token. When this matches a completed token, it produces a
-# Symbol token, and resets the lookahead back to a single read result containing
-# the next character of input.
+# characters of the token. When this matches a completed token, it emits the
+# accumulated lookahead as a token, and resets the lookahead back to a single
+# read result containing the next character of input.
-def tokenize_symbol(lookahead, port):
+def tokenize_nonstring_atom(lookahead, port):
next = port.read(1)
if next == '':
- return Symbol(lookahead), next, tokenize_any
+ return lookahead, next, tokenize_any
if next in '"(); \t\n':
- return Symbol(lookahead), next, tokenize_any
- return None, lookahead + next, tokenize_symbol
+ return lookahead, next, tokenize_any
+ return None, lookahead + next, tokenize_nonstring_atom
# ### STRINGS
#
-# The following states handle string literals in the input stream. String
-# literals are fairly simple: they begin with a quote, contain arbitrary
-# characters other than a bare \ or ", and end with a quote. The sequences
-# ``\\`` and ``\"`` are de-escaped by removing the leading backslash and
-# included in the resulting string.
+# The following family of states handles string literals in the input stream.
+# String literals are fairly simple: they begin with a quote, contain arbitrary
+# characters other than a bare \ or ", and end with a quote. (Note that ``\n``
+# is not an escape sequence: unescaped newlines are permitted within string
+# literals.)
#
# These states use the lookahead to accumulate the characters of the string. On
# transition back to ``tokenize_any``, the lookahead is always set back to a
# single character. If, at any point, these states encounter EOF, they raise a
# ``TokenError``: no legal token in Actinide begins with a quote mark and ends
# with EOF.
+#
+# Because tokenization is only concerned with dividing the input into tokens,
+# this machine *does not* strip quotes or replace escape sequences. On success,
+# it generates a token containing the whole string literal, verbatim.
-# The lookahead is assumed to be the opening quote of a string, and discarded.
+# The lookahead is assumed to be the opening quote of a string, and retained as
+# the start of the accumulated literal.
# Read forwards one character to determine whether this is an empty string
@@ -161,9 +176,11 @@ def tokenize_string(lookahead, port):
next = port.read(1)
if next == '':
raise TokenError('Unclosed string literal')
+ if next == '\\':
+ return None, lookahead + next, tokenize_escaped_string_character
if next == '"':
- return None, '', tokenize_string_end
- return None, next, tokenize_string_character
+ return None, lookahead + next, tokenize_string_end
+ return None, lookahead + next, tokenize_string_character
# The lookahead contains the body of the string read so far. Reads forwards one
# character to determine if the string continues, contains an escaped character,
@@ -175,15 +192,14 @@ def tokenize_string_character(lookahead, port):
if next == '':
raise TokenError('Unclosed string literal')
if next == '\\':
- return None, lookahead, tokenize_escaped_string_character
+ return None, lookahead + next, tokenize_escaped_string_character
if next == '"':
- return None, lookahead, tokenize_string_end
+ return None, lookahead + next, tokenize_string_end
return None, lookahead + next, tokenize_string_character
# The lookahead contains the body of the string so far. Reads forwards one
# character to determine which, if any, escaped character to process: if it's
-# one we recognize, de-escape it and append it to the string, otherwise raise a
-# TokenError.
+# one we recognize, append it to the string; otherwise, raise a TokenError.
#
# This never yields a token, and always dispatches back to
# ``tokenize_string_character`` on a legal escape character.
@@ -191,14 +207,14 @@ def tokenize_escaped_string_character(lookahead, port):
next = port.read(1)
if next == '':
raise TokenError('Unclosed string literal')
- if next == 'n':
- return None, lookahead + '\n', tokenize_string_character
+ if next == '"':
+ return None, lookahead + next, tokenize_string_character
if next == '\\':
- return None, lookahead + '\\', tokenize_string_character
- raise TokenError(f"Unknown string escape '\{next}'")
+ return None, lookahead + next, tokenize_string_character
+ raise TokenError(f"Invalid string escape '\\{next}'")
-# Package the lookahead (the full string body, de-escaped and without leading
-# and trailing quotes) up as a String token and return it, then transition back
-# to the ``tokenize_any`` state with a single read result in the lookahead.
+# Emits the lookahead (the full string literal, verbatim, with its quotes and
+# escape sequences intact) as a token, then transitions back to the
+# ``tokenize_any`` state with a single read result in the lookahead.
def tokenize_string_end(lookahead, port):
- return String(lookahead), port.read(1), tokenize_any
+ return lookahead, port.read(1), tokenize_any
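As a quick orientation for the change above, here is a minimal usage sketch (not part of the patch). It assumes the package is importable as ``actinide.tokenizer``, as the paths above suggest, and that anything with a ``read(1)`` method works as a port, so ``io.StringIO`` is enough:

import io
from actinide.tokenizer import tokenize

# Any object with a read(1) method can serve as a port.
port = io.StringIO('(display "hi\\"there") ; trailing comment')

# After this change every token is a plain str. String literals come back
# verbatim, quotes and escape sequences included; de-escaping is left to a
# later stage.
print(list(tokenize(port)))
# Expected, per the comments above: ['(', 'display', '"hi\\"there"', ')']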
diff --git a/actinide/types.py b/actinide/types.py
deleted file mode 100644
index 2b618f4..0000000
--- a/actinide/types.py
+++ /dev/null
@@ -1,12 +0,0 @@
-__all__ = ['String', 'Symbol', 'Syntax']
-
-# ## REPRESENTATIONS
-#
-# The following defines specify the Python representations of various Actinide
-# types.
-
-String = str
-class Symbol(str):
- pass
-class Syntax(str):
- pass
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/__init__.py
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index a300eb2..76a07a9 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -1,17 +1,71 @@
-from hypothesis import given, settings, HealthCheck
-from hypothesis.strategies import text, from_regex
+from hypothesis import given, settings, HealthCheck, event
+from hypothesis.strategies import just, text, characters, from_regex, one_of, tuples, sampled_from
import io
from actinide.tokenizer import *
+from .tokens import spaced_token_sequences
+
class ReadablePort(io.StringIO):
def __repr__(self):
# Slightly friendlier debugging output
return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
-def not_(f):
- return lambda *args, **kwargs: not f(*args, **kwargs)
+# Many of the following tests proceed by cases, because the underlying behaviour
+# is too complex to treat as a uniform set of properties. The cases are meant to
+# be total, and in principle could be defined as a set of filters on the
+# ``text()`` generator that, combined, exhaust the possible outcomes of that
+# generator.
+#
+# Implementing the tests that way causes Hypothesis to generate a significant
+# number of examples that it then throws away unverified, because Hypothesis
+# has no insight into the filters when generating candidate examples.
+# Instead, this test suite specifies generators per-case.
+
+# Cases for tokenize_any:
+
+# We test this a bit differently from the subsequent tokenizer states. Because
+# it's a pure routing state, we can generate lookahead, expected_state pairs and
+# check them in one pass, rather than testing each possible outcome separately.
+# In every case, the input is irrelevant: this state never reads.
+
+def next_token_states():
+ return one_of(
+ tuples(just(''), just(tokenize_eof)),
+ tuples(just(';'), just(tokenize_comment)),
+ tuples(sampled_from('()'), just(tokenize_syntax)),
+ tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
+ tuples(just('"'), just(tokenize_atom)),
+ tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
+ )
+
+@given(next_token_states(), text())
+def test_tokenize_any(lookahead_next, input):
+ s, expected_state = lookahead_next
+ port = ReadablePort(input)
+    token, lookahead, next = tokenize_any(s, port)
+
+ assert token is None
+ assert lookahead == s
+ assert next == expected_state
+ assert port.tell() == 0
+# Since the previous test case is rigged for success, also verify that no input
+# causes tokenize_any to enter an unexpected state or to throw an exception.
+@given(text(), text())
+def test_tokenize_any_fuzz(s, input):
+ port = ReadablePort(input)
+    token, lookahead, next = tokenize_any(s, port)
+
+ assert token is None
+ assert lookahead == s
+ assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
+ assert port.tell() == 0
+
+# Cases for tokenize_eof:
+
+# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
+# always returning to itself, and never generating a token.
@given(text(), text())
def test_tokenize_eof(s, input):
port = ReadablePort(input)
@@ -22,12 +76,12 @@ def test_tokenize_eof(s, input):
assert next == tokenize_eof
assert port.tell() == 0
-def comment_continues(text):
- if text == '':
- return False
- return text[0] != '\n'
+# Cases for tokenize_comment:
-@given(text(), text().filter(comment_continues))
+# * any lookahead, one or more characters beginning with a non-newline as input:
+# tokenize_comment continues the current comment, throwing away one character
+# of input, without generating a token.
+@given(text(), from_regex(r'^[^\n].*'))
def test_tokenize_comment_continues(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_comment(s, port)
@@ -37,7 +91,11 @@ def test_tokenize_comment_continues(s, input):
assert lookahead == input[0]
assert next == tokenize_comment
-@given(text(), text().filter(not_(comment_continues)))
+# * any lookahead, one or more characters beginning with a newline as input, and
+# * any lookahead, empty input:
+# tokenize_comment concludes the current comment and prepares for the next
+# token, without generating a token.
+@given(text(), just('') | from_regex(r'^\n.*'))
def test_tokenize_comment_ends(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_comment(s, port)
@@ -47,17 +105,26 @@ def test_tokenize_comment_ends(s, input):
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
+# Cases for tokenize_syntax:
+
+# * any lookahead, any input: generate the lookahead as a token and
+# transition back to tokenize_any to prepare for the next token, with one
+# character of lookahead ready to go.
@given(text(), text())
def test_tokenize_syntax(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_syntax(s, port)
- assert token == Syntax(s)
- assert isinstance(token, Syntax)
+ assert token == s
assert port.tell() == (1 if input else 0)
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
+# Cases for tokenize_whitespace:
+
+# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
+# transition back to tokenize_any to prepare for the next token, with one
+# character of lookahead ready to go, without generating a token.
@given(text(), text())
def test_tokenize_whitespace(s, input):
port = ReadablePort(input)
@@ -68,67 +135,82 @@ def test_tokenize_whitespace(s, input):
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
-def symbol_continues(text):
- if text == '':
- return False
- return text[0] not in ' \n\t();"'
+# Cases for tokenize_nonstring_atom:
-@given(text(), text().filter(symbol_continues))
-def test_tokenize_symbol_continues(s, input):
+# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
+#   comment delimiter, or a string delimiter: accumulate one character of input
+#   onto the lookahead, then transition back to tokenize_nonstring_atom to
+#   process the next character of input, without generating a token.
+@given(text(), from_regex(r'^[^ \n\t();"].*'))
+def test_tokenize_nonstring_atom_continues(s, input):
port = ReadablePort(input)
- token, lookahead, next = tokenize_symbol(s, port)
+ token, lookahead, next = tokenize_nonstring_atom(s, port)
assert token is None
assert port.tell() == 1
assert lookahead == s + input[0]
- assert next == tokenize_symbol
-
-@given(text(), text().filter(not_(symbol_continues)))
-def test_tokenize_symbol_ends(s, input):
+ assert next == tokenize_nonstring_atom
+
+# * any lookahead, a non-empty input beginning with whitespace, syntax, a
+#   comment delimiter, or a string delimiter, and
+# * any lookahead, empty input:
+#   generate the accumulated lookahead as a token, then transition back to
+#   tokenize_any with one character of lookahead ready to go.
+@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
+def test_tokenize_nonstring_atom_ends(s, input):
port = ReadablePort(input)
- token, lookahead, next = tokenize_symbol(s, port)
+ token, lookahead, next = tokenize_nonstring_atom(s, port)
- assert token == Symbol(s)
- assert isinstance(token, Symbol)
+ assert token == s
assert port.tell() == (1 if input else 0)
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
-def string_continues(text):
- if text == '':
- return False
- return not text[0] == '"'
+# And now, the _worst_ part of the state machine. Cases for tokenize_string:
-@given(text(), text().filter(string_continues))
+# * any lookahead, a non-empty input not beginning with a string delimiter:
+#   begin a non-empty string by transitioning to the tokenize_string_character
+#   state, appending one character of input to the lookahead, without
+#   generating a token.
+@given(text(), from_regex(r'^[^"].*'))
def test_tokenize_string_continues(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string(s, port)
assert token is None
assert port.tell() == 1
- assert lookahead == input[0]
+ assert lookahead == s + input[0]
assert next == tokenize_string_character
-@given(text(), text().filter(not_(string_continues)))
-def test_tokenize_string_ends(s, input):
+# * any lookahead, a non-empty input beginning with a string delimiter:
+#   terminate an empty string by transitioning to the tokenize_string_end
+#   state, appending the closing quote to the lookahead, without generating a
+#   token.
+@given(text(), from_regex(r'^["].*'))
+def test_tokenize_string_empty(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_end
+
+# * any lookahead, empty input: emit a tokenization error, as we've encountered
+# EOF inside of a string.
+@given(text(), just(''))
+def test_tokenize_string_eof(s, input):
try:
port = ReadablePort(input)
token, lookahead, next = tokenize_string(s, port)
- assert token is None
- assert port.tell() == 1
- assert lookahead == ''
- assert next == tokenize_string_end
+ assert False # must raise
except TokenError:
- assert input == ''
assert port.tell() == 0
-def is_escape(text):
- if text == '':
- return False
- return text[0] == '\\'
+# Cases for tokenize_string_character:
-@given(text(), text().filter(string_continues).filter(not_(is_escape)))
+# * any lookahead, any non-empty input not beginning with a string delimiter or
+# escape character: append one character of input to the lookahead, then
+# continue in the tokenize_string_character state without generating a token.
+@given(text(), from_regex(r'^[^\\"].*'))
def test_tokenize_string_character_continues(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string_character(s, port)
@@ -138,56 +220,137 @@ def test_tokenize_string_character_continues(s, input):
assert lookahead == s + input[0]
assert next == tokenize_string_character
-# Using from_regex() rather than text() because searching randomly for strings
-# that start with a specific character is far, _far_ too slow. (It often fails
-# to find any examples.) I _think_ this preserves the property that this group
-# of three tests are exhaustive, but it's not as obvious as it would be if I
-# could use text() here.
-@given(text(), from_regex(r'\\.*').filter(string_continues).filter(is_escape))
+# * any lookahead, any non-empty input which begins with an escape character:
+#   append the escape character to the lookahead, then transition to the
+#   tokenize_escaped_string_character state to determine which escape sequence
+#   we're dealing with, without emitting a token.
+@given(text(), from_regex(r'^[\\].*'))
def test_tokenize_string_character_begins_escape(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string_character(s, port)
assert token is None
assert port.tell() == 1
- assert lookahead == s
+ assert lookahead == s + input[0]
assert next == tokenize_escaped_string_character
-@given(text(), text().filter(not_(string_continues)))
+# * any lookahead, any non-empty input which begins with a string delimiter:
+#   we're at the end of a string. Transition to the tokenize_string_end state,
+#   appending the closing quote to the lookahead, without generating a token.
+@given(text(), from_regex(r'^["].*'))
def test_tokenize_string_character_ends(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_end
+
+# * any lookahead, empty input: emit a tokenization error, as we've encountered
+# EOF inside of a string literal.
+@given(text(), just(''))
+def test_tokenize_string_character_eof(s, input):
try:
port = ReadablePort(input)
token, lookahead, next = tokenize_string_character(s, port)
- assert token is None
- assert port.tell() == 1
- assert lookahead == s
- assert next == tokenize_string_end
+ assert False # must raise
except TokenError:
assert input == ''
assert port.tell() == 0
-@given(text(), text())
-def test_tokenize_escaped_string_character(s, input):
+# Cases for tokenize_escaped_string_character:
+
+# * any lookahead, any non-empty input beginning with a legal string escape
+#   character: append the first character of the input to the lookahead
+#   verbatim, then transition back to the tokenize_string_character state.
+@given(text(), from_regex(r'^["\\].*'))
+def test_tokenize_escaped_string_character_valid(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_escaped_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_character
+
+# * any lookahead, any non-empty input not beginning with a legal string escape
+#   character: emit a tokenization error, as we've found an invalid string
+#   escape.
+@given(text(), from_regex(r'^[^"\\].*'))
+def test_tokenize_escaped_string_character_invalid(s, input):
try:
port = ReadablePort(input)
token, lookahead, next = tokenize_escaped_string_character(s, port)
- assert token is None
+ assert False # must raise
+ except TokenError:
assert port.tell() == 1
- assert lookahead == s + input[0]
- assert next == tokenize_string_character
+
+# * any lookahead, empty input: emit a tokenization error, as we've encountered
+#   EOF inside of a string literal.
+@given(text(), just(''))
+def test_tokenize_escaped_string_character_eof(s, input):
+ try:
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_escaped_string_character(s, port)
+
+ assert False # must raise
except TokenError:
- assert input == '' or input[0] not in '\\n'
- assert port.tell() == (1 if input else 0)
+ assert port.tell() == 0
+
+# Cases for tokenize_string_end:
+
+# * any lookahead, any input: generate the lookahead as a token, then
+#   transition back to the tokenize_any state with one character of lookahead
+#   ready to go.
@given(text(), text())
def test_tokenize_string_end(s, input):
port = ReadablePort(input)
token, lookahead, next = tokenize_string_end(s, port)
assert token == s
- assert isinstance(token, String)
assert port.tell() == (1 if input else 0)
assert lookahead == (input[0] if input else '')
assert next == tokenize_any
+
+# Cases for tokenize_atom:
+
+# * lookahead containing a string delimiter, any input: found a string atom,
+# transition to the tokenize_string state without reading or generating a
+# token.
+@given(just('"'), text())
+def test_tokenize_atom_string(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_atom(s, port)
+
+ assert token is None
+ assert port.tell() == 0
+ assert lookahead == s
+ assert next == tokenize_string
+
+# * lookahead containing something other than a string delimiter, any input:
+# found a nonstring atom, transition to the tokenize_nonstring_atom state
+# without reading or generating a token.
+@given(from_regex(r'^[^"]'), text())
+def test_tokenize_atom_nonstring(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_atom(s, port)
+
+ assert token is None
+ assert port.tell() == 0
+ assert lookahead == s
+ assert next == tokenize_nonstring_atom
+
+# Cases for the tokenizer:
+
+# * any sequence of separator-token pairs: if the pairs are coalesced into a
+# single giant input, does the tokenizer recover the tokens?
+@given(spaced_token_sequences())
+def test_tokenizer(spaced_tokens):
+ input = ''.join(''.join(pair) for pair in spaced_tokens)
+ tokens = [token for (_, token) in spaced_tokens]
+
+ port = ReadablePort(input)
+
+ assert list(tokenize(port)) == tokens
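To make the round-trip property above concrete, here is a hand-written instance of the kind of example ``spaced_token_sequences()`` draws (illustrative values only, not part of the patch):

import io
from actinide.tokenizer import tokenize

spaced_tokens = [
    ('', '('),            # parens need no separator
    (' ', 'define'),      # symbols always get a separator in front
    (' ', 'greeting'),
    ('; doc\n', '"hi"'),  # a comment also works as a separator
    ('', ')'),
]
input = ''.join(sep + token for sep, token in spaced_tokens)   # '( define greeting; doc\n"hi")'
tokens = [token for _, token in spaced_tokens]

assert list(tokenize(io.StringIO(input))) == tokens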
diff --git a/tests/tokens.py b/tests/tokens.py
new file mode 100644
index 0000000..0027fb2
--- /dev/null
+++ b/tests/tokens.py
@@ -0,0 +1,90 @@
+from hypothesis.strategies import just, one_of, characters, text, lists, tuples
+from hypothesis.strategies import composite, recursive
+
+# Generators for token families
+
+# Generates the `(` token.
+def open_parens():
+ return just('(')
+
+# Generates the ')' token.
+def close_parens():
+ return just(')')
+
+# Generates characters that are legal, unescaped, inside of a string.
+def string_bare_characters():
+ return characters(blacklist_characters='\\"')
+
+# Generates legal string escape sequences.
+def string_escaped_characters():
+ return one_of(just('"'), just('\\')).map(lambda c: '\\' + c)
+
+# Generates single-character string representations, including escapes.
+def string_characters():
+ return one_of(string_bare_characters(), string_escaped_characters())
+
+# Generates arbitrary string bodies (strings, without leading or trailing
+# quotes)
+def string_body():
+ return text(string_characters())
+
+# Generates legal strings.
+def strings():
+ return tuples(just('"'), string_body(), just('"')).map(lambda t: ''.join(t))
+
+# Generates characters which are legal within a symbol.
+def symbol_characters():
+ return characters(blacklist_characters=' \t\n();"')
+
+# Generates legal symbols.
+def symbols():
+ return text(symbol_characters(), min_size=1)
+
+# Generates single whitespace characters.
+def whitespace_characters():
+ return one_of(just('\n'), just(' '), just('\t'))
+
+# Generates a single token.
+def tokens():
+ return one_of(symbols(), strings(), open_parens(), close_parens())
+
+# Generates at least one character of whitespace.
+def whitespace():
+ return text(whitespace_characters(), min_size=1)
+
+# Generates characters which can legally appear inside of a comment (anything
+# but a newline).
+def comment_characters():
+ return characters(blacklist_characters='\n')
+
+# Generates a (possibly-empty) comment, terminated with a trailing newline.
+def comments():
+ return tuples(just(';'), text(comment_characters()), just('\n')).map(lambda t: ''.join(t))
+
+# Generates sequences which can be inserted between arbitrary pairs of tokens
+# without changing their meaning.
+def intertokens():
+ return one_of(comments(), whitespace())
+
+# Generates a pair such that the second element is a token, and concatenating
+# the elements produces a string that tokenizes to exactly the second element.
+def spaced_tokens():
+ def spaced(strategy):
+ return tuples(intertokens(), strategy)
+ def unspaced(strategy):
+ return tuples(one_of(just(''), intertokens()), strategy)
+ def spaced_symbols():
+ return spaced(symbols())
+ def spaced_strings():
+ return unspaced(strings())
+ def spaced_open_parens():
+ return unspaced(open_parens())
+ def spaced_close_parens():
+ return unspaced(close_parens())
+
+ return one_of(spaced_symbols(), spaced_strings(), spaced_open_parens(), spaced_close_parens())
+
+# Generates a list of pairs as per spaced_tokens().
+def spaced_token_sequences():
+ return lists(spaced_tokens())
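For a quick sanity check of these generators from a REPL, ``Strategy.example()`` works (illustrative only; Hypothesis discourages ``example()`` inside tests, and the values shown are merely possible draws):

from tests.tokens import strings, symbols, comments, spaced_token_sequences

print(strings().example())                # e.g. "a\"b" -- quotes and escapes kept verbatim
print(symbols().example())                # e.g. foo or 1234
print(comments().example())               # e.g. ; some text, ending in a newline
print(spaced_token_sequences().example()) # e.g. [(' ', 'foo'), ('', '(')]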