author Owen Jacobson <owen@grimoire.ca> 2017-11-08 04:03:25 -0500
committer Owen Jacobson <owen@grimoire.ca> 2017-11-08 04:03:25 -0500
commit e157a7a83d5429bca9d564d931ab041fa96cd277 (patch)
tree 7487556a8acd2746a4dc2084c42ca706f8e063e6 /tests
parent 0fcc2dc618f2eb00d8cf82ce328c98e0ea9f2626 (diff)
Add tests for the individual tokenizer states.
Diffstat (limited to 'tests')
-rw-r--r-- tests/test_tokenizer.py | 193
1 file changed, 193 insertions, 0 deletions
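The tests below all pin down the same contract for the tokenizer's state functions: each tokenize_* state takes the lookahead text accumulated so far plus a readable port, consumes at most one character, and returns a (token, lookahead, next_state) triple. As a rough illustration of how such states compose, here is a sketch inferred from the assertions in the diff; the names tokenize_any and tokenize_eof come from actinide.tokenizer, but the driver loop itself and its termination rule are assumptions, not code from the project:

    # Hypothetical driver over the tested state functions (illustrative only).
    from actinide.tokenizer import tokenize_any, tokenize_eof

    def tokens(port):
        lookahead, state = '', tokenize_any
        while True:
            token, lookahead, state = state(lookahead, port)
            if token is not None:
                yield token
            # tokenize_eof re-enters itself without reading (see
            # test_tokenize_eof), so treat reaching it as end of input.
            if state == tokenize_eof:
                break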
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
new file mode 100644
index 0000000..a300eb2
--- /dev/null
+++ b/tests/test_tokenizer.py
@@ -0,0 +1,193 @@
+from hypothesis import given
+from hypothesis.strategies import text, from_regex
+import io
+
+from actinide.tokenizer import *
+
+class ReadablePort(io.StringIO):
+    def __repr__(self):
+        # Slightly friendlier debugging output
+        return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
+
+def not_(f):
+    return lambda *args, **kwargs: not f(*args, **kwargs)
+
+@given(text(), text())
+def test_tokenize_eof(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_eof(s, port)
+
+    assert token is None
+    assert lookahead == s
+    assert next == tokenize_eof
+    assert port.tell() == 0
+
+def comment_continues(text):
+    if text == '':
+        return False
+    return text[0] != '\n'
+
+@given(text(), text().filter(comment_continues))
+def test_tokenize_comment_continues(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_comment(s, port)
+
+    assert token is None
+    assert port.tell() == 1
+    assert lookahead == input[0]
+    assert next == tokenize_comment
+
+@given(text(), text().filter(not_(comment_continues)))
+def test_tokenize_comment_ends(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_comment(s, port)
+
+    assert token is None
+    assert port.tell() == (1 if input else 0)
+    assert lookahead == (input[0] if input else '')
+    assert next == tokenize_any
+
+@given(text(), text())
+def test_tokenize_syntax(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_syntax(s, port)
+
+    assert token == Syntax(s)
+    assert isinstance(token, Syntax)
+    assert port.tell() == (1 if input else 0)
+    assert lookahead == (input[0] if input else '')
+    assert next == tokenize_any
+
+@given(text(), text())
+def test_tokenize_whitespace(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_whitespace(s, port)
+
+    assert token is None
+    assert port.tell() == (1 if input else 0)
+    assert lookahead == (input[0] if input else '')
+    assert next == tokenize_any
+
+def symbol_continues(text):
+    if text == '':
+        return False
+    return text[0] not in ' \n\t();"'
+
+@given(text(), text().filter(symbol_continues))
+def test_tokenize_symbol_continues(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_symbol(s, port)
+
+    assert token is None
+    assert port.tell() == 1
+    assert lookahead == s + input[0]
+    assert next == tokenize_symbol
+
+@given(text(), text().filter(not_(symbol_continues)))
+def test_tokenize_symbol_ends(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_symbol(s, port)
+
+    assert token == Symbol(s)
+    assert isinstance(token, Symbol)
+    assert port.tell() == (1 if input else 0)
+    assert lookahead == (input[0] if input else '')
+    assert next == tokenize_any
+
+def string_continues(text):
+    if text == '':
+        return False
+    return text[0] != '"'
+
+@given(text(), text().filter(string_continues))
+def test_tokenize_string_continues(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_string(s, port)
+
+    assert token is None
+    assert port.tell() == 1
+    assert lookahead == input[0]
+    assert next == tokenize_string_character
+
+@given(text(), text().filter(not_(string_continues)))
+def test_tokenize_string_ends(s, input):
+    port = ReadablePort(input)
+    try:
+        token, lookahead, next = tokenize_string(s, port)
+
+        assert token is None
+        assert port.tell() == 1
+        assert lookahead == ''
+        assert next == tokenize_string_end
+    except TokenError:
+        assert input == ''
+        assert port.tell() == 0
+
+def is_escape(text):
+    if text == '':
+        return False
+    return text[0] == '\\'
+
+@given(text(), text().filter(string_continues).filter(not_(is_escape)))
+def test_tokenize_string_character_continues(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_string_character(s, port)
+
+    assert token is None
+    assert port.tell() == 1
+    assert lookahead == s + input[0]
+    assert next == tokenize_string_character
+
+# Using from_regex() rather than text() because searching randomly for strings
+# that start with a specific character is far, _far_ too slow. (It often fails
+# to find any examples.) I _think_ this preserves the property that this group
+# of three tests is exhaustive, but it's not as obvious as it would be if I
+# could use text() here.
+@given(text(), from_regex(r'\\.*').filter(string_continues).filter(is_escape))
+def test_tokenize_string_character_begins_escape(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_string_character(s, port)
+
+    assert token is None
+    assert port.tell() == 1
+    assert lookahead == s
+    assert next == tokenize_escaped_string_character
+
+@given(text(), text().filter(not_(string_continues)))
+def test_tokenize_string_character_ends(s, input):
+    port = ReadablePort(input)
+    try:
+        token, lookahead, next = tokenize_string_character(s, port)
+
+        assert token is None
+        assert port.tell() == 1
+        assert lookahead == s
+        assert next == tokenize_string_end
+    except TokenError:
+        assert input == ''
+        assert port.tell() == 0
+
+@given(text(), text())
+def test_tokenize_escaped_string_character(s, input):
+    port = ReadablePort(input)
+    try:
+        token, lookahead, next = tokenize_escaped_string_character(s, port)
+
+        assert token is None
+        assert port.tell() == 1
+        assert lookahead == s + input[0]
+        assert next == tokenize_string_character
+    except TokenError:
+        assert input == '' or input[0] not in '\\n'
+        assert port.tell() == (1 if input else 0)
+
+@given(text(), text())
+def test_tokenize_string_end(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_string_end(s, port)
+
+    assert token == s
+    assert isinstance(token, String)
+    assert port.tell() == (1 if input else 0)
+    assert lookahead == (input[0] if input else '')
+    assert next == tokenize_any
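As a concrete reading of the symbol properties above, driving tokenize_symbol by hand over a short input would proceed like this. This is a worked example of the behaviour the assertions encode, re-using the ReadablePort helper from the tests; it assumes the same contract and is not captured output from the module:

    port = ReadablePort('bc)')
    token, lookahead, state = tokenize_symbol('a', port)        # consumes 'b'
    assert token is None and lookahead == 'ab' and state == tokenize_symbol
    token, lookahead, state = tokenize_symbol(lookahead, port)  # consumes 'c'
    assert token is None and lookahead == 'abc' and state == tokenize_symbol
    token, lookahead, state = tokenize_symbol(lookahead, port)  # consumes ')', a delimiter
    assert token == Symbol('abc') and lookahead == ')' and state == tokenize_any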