|           |                                                  |                           |
|-----------|--------------------------------------------------|---------------------------|
| author    | Owen Jacobson <owen@grimoire.ca>                 | 2017-11-08 04:03:25 -0500 |
| committer | Owen Jacobson <owen@grimoire.ca>                 | 2017-11-08 04:03:25 -0500 |
| commit    | e157a7a83d5429bca9d564d931ab041fa96cd277 (patch) |                           |
| tree      | 7487556a8acd2746a4dc2084c42ca706f8e063e6         |                           |
| parent    | 0fcc2dc618f2eb00d8cf82ce328c98e0ea9f2626 (diff)  |                           |
Add tests for the individual tokenizer states.
| -rw-r--r-- | .gitignore              |   3 |
| -rw-r--r-- | actinide/tokenizer.py   |  10 |
| -rw-r--r-- | setup.py                |   9 |
| -rw-r--r-- | tests/test_tokenizer.py | 193 |
4 files changed, 211 insertions, 4 deletions
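
Every state function exercised by the new tests follows the same contract, visible in the tokenizer.py hunks below: it takes the current lookahead string and a readable port, reads at most one character, and returns a `(token, lookahead, next_state)` triple. The driver that threads these states together is not part of this diff; the sketch below only illustrates how that triple would be consumed, with a hypothetical `run_tokenizer` helper and a guessed lookahead-priming step.

```python
# Hypothetical driver loop (not part of this commit) showing how the
# (token, lookahead, next_state) triple asserted on in the tests might be
# consumed. Assumes tokenize_any shares the (lookahead, port) signature of
# the states under test and that tokenize_eof is the terminal state; how the
# real actinide driver primes the lookahead is not shown in this diff.
import io

from actinide.tokenizer import tokenize_any, tokenize_eof

def run_tokenizer(source):
    port = io.StringIO(source)
    state, lookahead = tokenize_any, port.read(1)  # prime the lookahead (a guess)
    tokens = []
    while state is not tokenize_eof:
        token, lookahead, state = state(lookahead, port)
        if token is not None:
            tokens.append(token)
    return tokens
```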
```diff
diff --git a/.gitignore b/.gitignore
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 /*.egg-info
 __pycache__/
+/.hypothesis/
+/.eggs/
+/.cache/
diff --git a/actinide/tokenizer.py b/actinide/tokenizer.py
index f950b91..8fb9d0a 100644
--- a/actinide/tokenizer.py
+++ b/actinide/tokenizer.py
@@ -98,9 +98,11 @@ def tokenize_eof(lookahead, port):
 # This never produces a token.
 def tokenize_comment(lookahead, port):
     next = port.read(1)
-    if next != '\n':
-        return None, next, tokenize_comment
-    return None, next, tokenize_any
+    if next == '':
+        return None, next, tokenize_any
+    if next == '\n':
+        return None, next, tokenize_any
+    return None, next, tokenize_comment
 
 # Consumes the lookahead and packages it up as a Syntax token. This is generally
 # appropriate for the ``(`` and ``)`` syntactic elements.
@@ -129,7 +131,7 @@ def tokenize_whitespace(lookahead, port):
 def tokenize_symbol(lookahead, port):
     next = port.read(1)
     if next == '':
-        return Symbol(lookahead), next, tokenize_eof
+        return Symbol(lookahead), next, tokenize_any
     if next in '"(); \t\n':
         return Symbol(lookahead), next, tokenize_any
     return None, lookahead + next, tokenize_symbol
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -5,4 +5,13 @@ setup(
     version='0.1',
     packages=find_packages(),
     scripts=['bin/actinide-repl'],
+
+    setup_requires=[
+        'pytest-runner',
+    ],
+
+    tests_require=[
+        'pytest',
+        'hypothesis',
+    ],
 )
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
new file mode 100644
index 0000000..a300eb2
--- /dev/null
+++ b/tests/test_tokenizer.py
@@ -0,0 +1,193 @@
+from hypothesis import given, settings, HealthCheck
+from hypothesis.strategies import text, from_regex
+import io
+
+from actinide.tokenizer import *
+
+class ReadablePort(io.StringIO):
+    def __repr__(self):
+        # Slightly friendlier debugging output
+        return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"
+
+def not_(f):
+    return lambda *args, **kwargs: not f(*args, **kwargs)
+
+@given(text(), text())
+def test_tokenize_eof(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_eof(s, port)
+
+    assert token is None
+    assert lookahead == s
+    assert next == tokenize_eof
+    assert port.tell() == 0
+
+def comment_continues(text):
+    if text == '':
+        return False
+    return text[0] != '\n'
+
+@given(text(), text().filter(comment_continues))
+def test_tokenize_comment_continues(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_comment(s, port)
+
+    assert token is None
+    assert port.tell() == 1
+    assert lookahead == input[0]
+    assert next == tokenize_comment
+
+@given(text(), text().filter(not_(comment_continues)))
+def test_tokenize_comment_ends(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_comment(s, port)
+
+    assert token is None
+    assert port.tell() == (1 if input else 0)
+    assert lookahead == (input[0] if input else '')
+    assert next == tokenize_any
+
+@given(text(), text())
+def test_tokenize_syntax(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_syntax(s, port)
+
+    assert token == Syntax(s)
+    assert isinstance(token, Syntax)
+    assert port.tell() == (1 if input else 0)
+    assert lookahead == (input[0] if input else '')
+    assert next == tokenize_any
+
+@given(text(), text())
+def test_tokenize_whitespace(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_whitespace(s, port)
+
+    assert token is None
+    assert port.tell() == (1 if input else 0)
+    assert lookahead == (input[0] if input else '')
+    assert next == tokenize_any
+
+def symbol_continues(text):
+    if text == '':
+        return False
+    return text[0] not in ' \n\t();"'
+
+@given(text(), text().filter(symbol_continues))
+def test_tokenize_symbol_continues(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_symbol(s, port)
+
+    assert token is None
+    assert port.tell() == 1
+    assert lookahead == s + input[0]
+    assert next == tokenize_symbol
+
+@given(text(), text().filter(not_(symbol_continues)))
+def test_tokenize_symbol_ends(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_symbol(s, port)
+
+    assert token == Symbol(s)
+    assert isinstance(token, Symbol)
+    assert port.tell() == (1 if input else 0)
+    assert lookahead == (input[0] if input else '')
+    assert next == tokenize_any
+
+def string_continues(text):
+    if text == '':
+        return False
+    return not text[0] == '"'
+
+@given(text(), text().filter(string_continues))
+def test_tokenize_string_continues(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_string(s, port)
+
+    assert token is None
+    assert port.tell() == 1
+    assert lookahead == input[0]
+    assert next == tokenize_string_character
+
+@given(text(), text().filter(not_(string_continues)))
+def test_tokenize_string_ends(s, input):
+    try:
+        port = ReadablePort(input)
+        token, lookahead, next = tokenize_string(s, port)
+
+        assert token is None
+        assert port.tell() == 1
+        assert lookahead == ''
+        assert next == tokenize_string_end
+    except TokenError:
+        assert input == ''
+        assert port.tell() == 0
+
+def is_escape(text):
+    if text == '':
+        return False
+    return text[0] == '\\'
+
+@given(text(), text().filter(string_continues).filter(not_(is_escape)))
+def test_tokenize_string_character_continues(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_string_character(s, port)
+
+    assert token is None
+    assert port.tell() == 1
+    assert lookahead == s + input[0]
+    assert next == tokenize_string_character
+
+# Using from_regex() rather than text() because searching randomly for strings
+# that start with a specific character is far, _far_ too slow. (It often fails
+# to find any examples.) I _think_ this preserves the property that this group
+# of three tests are exhaustive, but it's not as obvious as it would be if I
+# could use text() here.
+@given(text(), from_regex(r'\\.*').filter(string_continues).filter(is_escape))
+def test_tokenize_string_character_begins_escape(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_string_character(s, port)
+
+    assert token is None
+    assert port.tell() == 1
+    assert lookahead == s
+    assert next == tokenize_escaped_string_character
+
+@given(text(), text().filter(not_(string_continues)))
+def test_tokenize_string_character_ends(s, input):
+    try:
+        port = ReadablePort(input)
+        token, lookahead, next = tokenize_string_character(s, port)
+
+        assert token is None
+        assert port.tell() == 1
+        assert lookahead == s
+        assert next == tokenize_string_end
+    except TokenError:
+        assert input == ''
+        assert port.tell() == 0
+
+@given(text(), text())
+def test_tokenize_escaped_string_character(s, input):
+    try:
+        port = ReadablePort(input)
+        token, lookahead, next = tokenize_escaped_string_character(s, port)
+
+        assert token is None
+        assert port.tell() == 1
+        assert lookahead == s + input[0]
+        assert next == tokenize_string_character
+    except TokenError:
+        assert input == '' or input[0] not in '\\n'
+        assert port.tell() == (1 if input else 0)
+
+@given(text(), text())
+def test_tokenize_string_end(s, input):
+    port = ReadablePort(input)
+    token, lookahead, next = tokenize_string_end(s, port)
+
+    assert token == s
+    assert isinstance(token, String)
+    assert port.tell() == (1 if input else 0)
+    assert lookahead == (input[0] if input else '')
+    assert next == tokenize_any
```
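
For a concrete sense of what these properties check, here is one hand-run step of the `tokenize_comment` behaviour this commit introduces, with fixed inputs standing in for the Hypothesis-generated ones (a sketch mirroring `test_tokenize_comment_continues`, not an additional test in the commit):

```python
# One manual step through tokenize_comment, as changed in this commit:
# a non-newline, non-EOF character keeps the machine in the comment state.
import io

from actinide.tokenizer import tokenize_comment

port = io.StringIO("x more comment text\n")
token, lookahead, next_state = tokenize_comment('; partial comment', port)

assert token is None                    # comments never yield a token
assert port.tell() == 1                 # exactly one character consumed
assert lookahead == 'x'                 # the consumed character becomes the new lookahead
assert next_state is tokenize_comment   # 'x' is neither '' nor '\n', so keep scanning
```

The setup.py hunk wires the suite into setuptools via `pytest-runner`, with `pytest` and `hypothesis` as test-only dependencies, so the tests are presumably run through pytest (for example `python setup.py pytest`, assuming the usual pytest-runner configuration).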
