author     Owen Jacobson <owen@grimoire.ca>   2017-11-08 04:03:25 -0500
committer  Owen Jacobson <owen@grimoire.ca>   2017-11-08 04:03:25 -0500
commit     e157a7a83d5429bca9d564d931ab041fa96cd277 (patch)
tree       7487556a8acd2746a4dc2084c42ca706f8e063e6
parent     0fcc2dc618f2eb00d8cf82ce328c98e0ea9f2626 (diff)
Add tests for the individual tokenizer states.
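Each tokenizer state is a plain function: it takes the current lookahead
string and a readable port, reads at most one character, and returns a
(token, lookahead, next_state) triple. The tests below exercise each state
in isolation. A rough sketch of driving one state by hand (ReadablePort is
defined in the new test module; the input here is made up for illustration):

    port = ReadablePort('oo)')
    token, lookahead, state = None, 'f', tokenize_symbol
    while token is None:
        token, lookahead, state = state(lookahead, port)
    # token == Symbol('foo'), lookahead == ')', state == tokenize_any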
-rw-r--r--   .gitignore                 3
-rw-r--r--   actinide/tokenizer.py     10
-rw-r--r--   setup.py                   9
-rw-r--r--   tests/test_tokenizer.py   193
4 files changed, 211 insertions, 4 deletions
diff --git a/.gitignore b/.gitignore
index 14d2ac5..73a31e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
/*.egg-info
__pycache__/
+/.hypothesis/
+/.eggs/
+/.cache/
diff --git a/actinide/tokenizer.py b/actinide/tokenizer.py
index f950b91..8fb9d0a 100644
--- a/actinide/tokenizer.py
+++ b/actinide/tokenizer.py
@@ -98,9 +98,11 @@ def tokenize_eof(lookahead, port):
# This never produces a token.
def tokenize_comment(lookahead, port):
next = port.read(1)
- if next != '\n':
- return None, next, tokenize_comment
- return None, next, tokenize_any
+ if next == '':
+ return None, next, tokenize_any
+ if next == '\n':
+ return None, next, tokenize_any
+ return None, next, tokenize_comment
# Consumes the lookahead and packages it up as a Syntax token. This is generally
# appropriate for the ``(`` and ``)`` syntactic elements.
@@ -129,7 +131,7 @@ def tokenize_whitespace(lookahead, port):
def tokenize_symbol(lookahead, port):
next = port.read(1)
if next == '':
- return Symbol(lookahead), next, tokenize_eof
+ return Symbol(lookahead), next, tokenize_any
if next in '"(); \t\n':
return Symbol(lookahead), next, tokenize_any
return None, lookahead + next, tokenize_symbol
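The tokenize_comment change above matters at end of input: previously a
comment that ran to EOF kept returning tokenize_comment, so the tokenizer
never left the comment state; now EOF ends the comment just as a newline
does. A minimal illustration, assuming the (token, lookahead, next_state)
protocol and the ReadablePort helper from the tests below:

    port = ReadablePort('')          # the comment reaches end of input
    token, lookahead, state = tokenize_comment(';', port)
    # old behaviour: state == tokenize_comment, stuck in the comment state
    # new behaviour: state == tokenize_any, so tokenizing can proceed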
diff --git a/setup.py b/setup.py
index 21fffac..8782f3f 100644
--- a/setup.py
+++ b/setup.py
@@ -5,4 +5,13 @@ setup(
version='0.1',
packages=find_packages(),
scripts=['bin/actinide-repl'],
+
+ setup_requires=[
+ 'pytest-runner',
+ ],
+
+ tests_require=[
+ 'pytest',
+ 'hypothesis',
+ ],
)
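With pytest-runner in setup_requires and pytest plus hypothesis in
tests_require, the suite can be run through setuptools, typically as:

    python setup.py pytest

(or as python setup.py test if an [aliases] test = pytest entry is added to
setup.cfg, which this commit does not include), or directly with pytest once
the test dependencies are installed.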
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
new file mode 100644
index 0000000..a300eb2
--- /dev/null
+++ b/tests/test_tokenizer.py
@@ -0,0 +1,193 @@
+from hypothesis import given, settings, HealthCheck
+from hypothesis.strategies import text, from_regex
+import io
+
+from actinide.tokenizer import *
+
+class ReadablePort(io.StringIO):
+ def __repr__(self):
+ # Slightly friendlier debugging output
+ return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
+
+def not_(f):
+ return lambda *args, **kwargs: not f(*args, **kwargs)
+
+@given(text(), text())
+def test_tokenize_eof(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_eof(s, port)
+
+ assert token is None
+ assert lookahead == s
+ assert next == tokenize_eof
+ assert port.tell() == 0
+
+def comment_continues(text):
+ if text == '':
+ return False
+ return text[0] != '\n'
+
+@given(text(), text().filter(comment_continues))
+def test_tokenize_comment_continues(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_comment(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == input[0]
+ assert next == tokenize_comment
+
+@given(text(), text().filter(not_(comment_continues)))
+def test_tokenize_comment_ends(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_comment(s, port)
+
+ assert token is None
+ assert port.tell() == (1 if input else 0)
+ assert lookahead == (input[0] if input else '')
+ assert next == tokenize_any
+
+@given(text(), text())
+def test_tokenize_syntax(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_syntax(s, port)
+
+ assert token == Syntax(s)
+ assert isinstance(token, Syntax)
+ assert port.tell() == (1 if input else 0)
+ assert lookahead == (input[0] if input else '')
+ assert next == tokenize_any
+
+@given(text(), text())
+def test_tokenize_whitespace(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_whitespace(s, port)
+
+ assert token is None
+ assert port.tell() == (1 if input else 0)
+ assert lookahead == (input[0] if input else '')
+ assert next == tokenize_any
+
+def symbol_continues(text):
+ if text == '':
+ return False
+ return text[0] not in ' \n\t();"'
+
+@given(text(), text().filter(symbol_continues))
+def test_tokenize_symbol_continues(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_symbol(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_symbol
+
+@given(text(), text().filter(not_(symbol_continues)))
+def test_tokenize_symbol_ends(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_symbol(s, port)
+
+ assert token == Symbol(s)
+ assert isinstance(token, Symbol)
+ assert port.tell() == (1 if input else 0)
+ assert lookahead == (input[0] if input else '')
+ assert next == tokenize_any
+
+def string_continues(text):
+ if text == '':
+ return False
+ return not text[0] == '"'
+
+@given(text(), text().filter(string_continues))
+def test_tokenize_string_continues(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == input[0]
+ assert next == tokenize_string_character
+
+@given(text(), text().filter(not_(string_continues)))
+def test_tokenize_string_ends(s, input):
+ try:
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == ''
+ assert next == tokenize_string_end
+ except TokenError:
+ assert input == ''
+ assert port.tell() == 0
+
+def is_escape(text):
+ if text == '':
+ return False
+ return text[0] == '\\'
+
+@given(text(), text().filter(string_continues).filter(not_(is_escape)))
+def test_tokenize_string_character_continues(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_character
+
+# Using from_regex() rather than text() because searching randomly for strings
+# that start with a specific character is far, _far_ too slow. (It often fails
+# to find any examples.) I _think_ this preserves the property that this group
+# of three tests are exhaustive, but it's not as obvious as it would be if I
+# could use text() here.
+@given(text(), from_regex(r'\\.*').filter(string_continues).filter(is_escape))
+def test_tokenize_string_character_begins_escape(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s
+ assert next == tokenize_escaped_string_character
+
+@given(text(), text().filter(not_(string_continues)))
+def test_tokenize_string_character_ends(s, input):
+ try:
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s
+ assert next == tokenize_string_end
+ except TokenError:
+ assert input == ''
+ assert port.tell() == 0
+
+@given(text(), text())
+def test_tokenize_escaped_string_character(s, input):
+ try:
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_escaped_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_character
+ except TokenError:
+ assert input == '' or input[0] not in '\\n'
+ assert port.tell() == (1 if input else 0)
+
+@given(text(), text())
+def test_tokenize_string_end(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string_end(s, port)
+
+ assert token == s
+ assert isinstance(token, String)
+ assert port.tell() == (1 if input else 0)
+ assert lookahead == (input[0] if input else '')
+ assert next == tokenize_any