author     Owen Jacobson <owen@grimoire.ca>   2017-11-08 04:03:25 -0500
committer  Owen Jacobson <owen@grimoire.ca>   2017-11-08 04:03:25 -0500
commit     e157a7a83d5429bca9d564d931ab041fa96cd277 (patch)
tree       7487556a8acd2746a4dc2084c42ca706f8e063e6
parent     0fcc2dc618f2eb00d8cf82ce328c98e0ea9f2626 (diff)
Add tests for the individual tokenizer states.
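Each tokenizer state is a plain function: it takes the current lookahead
string and a readable port, reads at most one character, and returns a
(token, lookahead, next_state) triple. The tests below exercise each state
in isolation. A rough sketch of driving one state by hand (ReadablePort is
defined in the new test module; the input here is made up for illustration):

    port = ReadablePort('oo)')
    token, lookahead, state = None, 'f', tokenize_symbol
    while token is None:
        token, lookahead, state = state(lookahead, port)
    # token == Symbol('foo'), lookahead == ')', state == tokenize_any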
-rw-r--r--   .gitignore                 3
-rw-r--r--   actinide/tokenizer.py     10
-rw-r--r--   setup.py                   9
-rw-r--r--   tests/test_tokenizer.py   193
4 files changed, 211 insertions, 4 deletions
diff --git a/.gitignore b/.gitignore
index 14d2ac5..73a31e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
/*.egg-info
__pycache__/
+/.hypothesis/
+/.eggs/
+/.cache/
diff --git a/actinide/tokenizer.py b/actinide/tokenizer.py
index f950b91..8fb9d0a 100644
--- a/actinide/tokenizer.py
+++ b/actinide/tokenizer.py
@@ -98,9 +98,11 @@ def tokenize_eof(lookahead, port):
# This never produces a token.
def tokenize_comment(lookahead, port):
next = port.read(1)
- if next != '\n':
- return None, next, tokenize_comment
- return None, next, tokenize_any
+ if next == '':
+ return None, next, tokenize_any
+ if next == '\n':
+ return None, next, tokenize_any
+ return None, next, tokenize_comment
# Consumes the lookahead and packages it up as a Syntax token. This is generally
# appropriate for the ``(`` and ``)`` syntactic elements.
@@ -129,7 +131,7 @@ def tokenize_whitespace(lookahead, port):
def tokenize_symbol(lookahead, port):
next = port.read(1)
if next == '':
- return Symbol(lookahead), next, tokenize_eof
+ return Symbol(lookahead), next, tokenize_any
if next in '"(); \t\n':
return Symbol(lookahead), next, tokenize_any
return None, lookahead + next, tokenize_symbol
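The tokenize_comment change above matters at end of input: previously a
comment that ran to EOF kept returning tokenize_comment, so the tokenizer
never left the comment state; now EOF ends the comment just as a newline
does. A minimal illustration, assuming the (token, lookahead, next_state)
protocol and the ReadablePort helper from the tests below:

    port = ReadablePort('')          # the comment reaches end of input
    token, lookahead, state = tokenize_comment(';', port)
    # old behaviour: state == tokenize_comment, stuck in the comment state
    # new behaviour: state == tokenize_any, so tokenizing can proceed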
diff --git a/setup.py b/setup.py
index 21fffac..8782f3f 100644
--- a/setup.py
+++ b/setup.py
@@ -5,4 +5,13 @@ setup(
version='0.1',
packages=find_packages(),
scripts=['bin/actinide-repl'],
+
+ setup_requires=[
+ 'pytest-runner',
+ ],
+
+ tests_require=[
+ 'pytest',
+ 'hypothesis',
+ ],
)
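With pytest-runner in setup_requires and pytest plus hypothesis in
tests_require, the suite can be run through setuptools, typically as:

    python setup.py pytest

(or as python setup.py test if an [aliases] test = pytest entry is added to
setup.cfg, which this commit does not include), or directly with pytest once
the test dependencies are installed.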
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
new file mode 100644
index 0000000..a300eb2
--- /dev/null
+++ b/tests/test_tokenizer.py
@@ -0,0 +1,193 @@
+from hypothesis import given, settings, HealthCheck
+from hypothesis.strategies import text, from_regex
+import io
+
+from actinide.tokenizer import *
+
+class ReadablePort(io.StringIO):
+ def __repr__(self):
+ # Slightly friendlier debugging output
+ return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
+
+def not_(f):
+ return lambda *args, **kwargs: not f(*args, **kwargs)
+
+@given(text(), text())
+def test_tokenize_eof(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_eof(s, port)
+
+ assert token is None
+ assert lookahead == s
+ assert next == tokenize_eof
+ assert port.tell() == 0
+
+def comment_continues(text):
+ if text == '':
+ return False
+ return text[0] != '\n'
+
+@given(text(), text().filter(comment_continues))
+def test_tokenize_comment_continues(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_comment(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == input[0]
+ assert next == tokenize_comment
+
+@given(text(), text().filter(not_(comment_continues)))
+def test_tokenize_comment_ends(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_comment(s, port)
+
+ assert token is None
+ assert port.tell() == (1 if input else 0)
+ assert lookahead == (input[0] if input else '')
+ assert next == tokenize_any
+
+@given(text(), text())
+def test_tokenize_syntax(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_syntax(s, port)
+
+ assert token == Syntax(s)
+ assert isinstance(token, Syntax)
+ assert port.tell() == (1 if input else 0)
+ assert lookahead == (input[0] if input else '')
+ assert next == tokenize_any
+
+@given(text(), text())
+def test_tokenize_whitespace(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_whitespace(s, port)
+
+ assert token is None
+ assert port.tell() == (1 if input else 0)
+ assert lookahead == (input[0] if input else '')
+ assert next == tokenize_any
+
+def symbol_continues(text):
+ if text == '':
+ return False
+ return text[0] not in ' \n\t();"'
+
+@given(text(), text().filter(symbol_continues))
+def test_tokenize_symbol_continues(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_symbol(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_symbol
+
+@given(text(), text().filter(not_(symbol_continues)))
+def test_tokenize_symbol_ends(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_symbol(s, port)
+
+ assert token == Symbol(s)
+ assert isinstance(token, Symbol)
+ assert port.tell() == (1 if input else 0)
+ assert lookahead == (input[0] if input else '')
+ assert next == tokenize_any
+
+def string_continues(text):
+ if text == '':
+ return False
+ return not text[0] == '"'
+
+@given(text(), text().filter(string_continues))
+def test_tokenize_string_continues(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == input[0]
+ assert next == tokenize_string_character
+
+@given(text(), text().filter(not_(string_continues)))
+def test_tokenize_string_ends(s, input):
+ try:
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == ''
+ assert next == tokenize_string_end
+ except TokenError:
+ assert input == ''
+ assert port.tell() == 0
+
+def is_escape(text):
+ if text == '':
+ return False
+ return text[0] == '\\'
+
+@given(text(), text().filter(string_continues).filter(not_(is_escape)))
+def test_tokenize_string_character_continues(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_character
+
+# Using from_regex() rather than text() because searching randomly for strings
+# that start with a specific character is far, _far_ too slow. (It often fails
+# to find any examples.) I _think_ this preserves the property that this group
+# of three tests are exhaustive, but it's not as obvious as it would be if I
+# could use text() here.
+@given(text(), from_regex(r'\\.*').filter(string_continues).filter(is_escape))
+def test_tokenize_string_character_begins_escape(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s
+ assert next == tokenize_escaped_string_character
+
+@given(text(), text().filter(not_(string_continues)))
+def test_tokenize_string_character_ends(s, input):
+ try:
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s
+ assert next == tokenize_string_end
+ except TokenError:
+ assert input == ''
+ assert port.tell() == 0
+
+@given(text(), text())
+def test_tokenize_escaped_string_character(s, input):
+ try:
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_escaped_string_character(s, port)
+
+ assert token is None
+ assert port.tell() == 1
+ assert lookahead == s + input[0]
+ assert next == tokenize_string_character
+ except TokenError:
+ assert input == '' or input[0] not in '\\n'
+ assert port.tell() == (1 if input else 0)
+
+@given(text(), text())
+def test_tokenize_string_end(s, input):
+ port = ReadablePort(input)
+ token, lookahead, next = tokenize_string_end(s, port)
+
+ assert token == s
+ assert isinstance(token, String)
+ assert port.tell() == (1 if input else 0)
+ assert lookahead == (input[0] if input else '')
+ assert next == tokenize_any