path: root/tests/test_tokenizer.py
from hypothesis import given
from hypothesis.strategies import just, text, characters, from_regex, one_of, tuples, sampled_from
import io

from actinide.tokenizer import *

from .tokens import spaced_token_sequences

class ReadablePort(io.StringIO):
    def __repr__(self):
        # Slightly friendlier debugging output
        return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"

# Many of the following tests proceed by cases, because the underlying behaviour
# is too complex to treat as a uniform set of properties. The cases are meant to
# be total, and in principle could be defined as a set of filters on the
# ``text()`` generator that, combined, exhaust the possible outcomes of that
# generator.
#
# Implementing the tests that way causes Hypothesis to generate a significant
# number of examples that it then throws away without verifying, because
# Hypothesis has no insight into filter predicates when generating examples.
# Instead, this test suite specifies generators per-case.
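
# For illustration only (a hedged sketch, not used by any test below): the
# filter-based formulation this suite avoids would look roughly like the
# helper here, shown for the nonstring-atom inputs. Hypothesis would generate
# arbitrary text and then discard every example whose first character fails
# the predicate, where the per-case ``from_regex`` strategies below produce
# only valid examples. The helper name is made up for this illustration.
def _filtered_nonstring_atom_inputs():
    # Whitespace, syntax, the comment delimiter, and the string delimiter all
    # terminate a nonstring atom, so they're excluded from the first position;
    # these are the same character classes the per-case regexes encode.
    return text(min_size=1).filter(lambda s: s[0] not in ' \t\n();"')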

# Cases for tokenize_any:

# We test this a bit differently from the subsequent tokenizer states. Because
# it's a pure routing state, we can generate lookahead, expected_state pairs and
# check them in one pass, rather than testing each possible outcome separately.
# In every case, the input is irrelevant: this state never reads.

def next_token_states():
    return one_of(
        tuples(just(''), just(tokenize_eof)),
        tuples(just(';'), just(tokenize_comment)),
        tuples(sampled_from('()'), just(tokenize_syntax)),
        tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
        tuples(just('"'), just(tokenize_atom)),
        tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
    )

@given(next_token_states(), text())
def test_tokenize_any(lookahead_next, input):
    s, expected_state = lookahead_next
    port = ReadablePort(input)
    token, lookahead, next = tokenize_any(s, port)

    assert token is None
    assert lookahead == s
    assert next == expected_state
    assert port.tell() == 0

# Since the previous test case is rigged for success, also verify that no input
# causes tokenize_any to enter an unexpected state or to throw an exception.
@given(text(), text())
def test_tokenize_any_fuzz(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_any(s, port)

    assert token is None
    assert lookahead == s
    assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
    assert port.tell() == 0

# Cases for tokenize_eof:

# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
#   always returning to itself, and never generating a token.
@given(text(), text())
def test_tokenize_eof(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_eof(s, port)

    assert token is None
    assert lookahead == s
    assert next == tokenize_eof
    assert port.tell() == 0

# Cases for tokenize_comment:

# * any lookahead, one or more characters beginning with a non-newline as input:
#   tokenize_comment continues the current comment, throwing away one character
#   of input, without generating a token.
@given(text(), from_regex(r'^[^\n].*'))
def test_tokenize_comment_continues(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_comment(s, port)

    assert token is None
    assert port.tell() == 1
    assert lookahead == input[0]
    assert next == tokenize_comment

# * any lookahead, one or more characters beginning with a newline as input, and
# * any lookahead, empty input:
#   tokenize_comment concludes the current comment and prepares for the next
#   token, without generating a token.
@given(text(), just('') | from_regex(r'^\n.*'))
def test_tokenize_comment_ends(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_comment(s, port)

    assert token is None
    assert port.tell() == (1 if input else 0)
    assert lookahead == (input[0] if input else '')
    assert next == tokenize_any

# Cases for tokenize_syntax:

# * any lookahead, any input: generate the lookahead as a Syntax token and
#   transition back to tokenize_any to prepare for the next token, with one
#   character of lookahead ready to go.
@given(text(), text())
def test_tokenize_syntax(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_syntax(s, port)

    assert token == s
    assert port.tell() == (1 if input else 0)
    assert lookahead == (input[0] if input else '')
    assert next == tokenize_any

# Cases for tokenize_whitespace:

# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
#   transition back to tokenize_any to prepare for the next token, with one
#   character of lookahead ready to go, without generating a token.
@given(text(), text())
def test_tokenize_whitespace(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_whitespace(s, port)

    assert token is None
    assert port.tell() == (1 if input else 0)
    assert lookahead == (input[0] if input else '')
    assert next == tokenize_any

# Cases for tokenize_nonstring_atom:

# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
#   comment delimiter, or a string literal: accumulate one character of input
#   onto the lookahead, then continue in the tokenize_nonstring_atom state to
#   process the next character of input, without generating a token.
@given(text(), from_regex(r'^[^ \n\t();"].*'))
def test_tokenize_nonstring_atom_continues(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_nonstring_atom(s, port)

    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_nonstring_atom

# * any lookahead, a non-empty input beginning with whitespace, syntax, a
#   comment delimiter, or a string literal, and
# * any lookahead, empty input:
#   generate the accumulated input as a Symbol token, then transition back to
#   tokenize_any with one character of lookahead ready to go.
@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
def test_tokenize_nonstring_atom_ends(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_nonstring_atom(s, port)

    assert token == s
    assert port.tell() == (1 if input else 0)
    assert lookahead == (input[0] if input else '')
    assert next == tokenize_any

# And now, the _worst_ part of the state machine. Cases for tokenize_string:

# * any lookahead, a non-empty input not beginning with a string delimiter:
#   begin a non-empty string by appending one character of input to the
#   lookahead and transitioning to the tokenize_string_character state, without
#   generating a token.
@given(text(), from_regex(r'^[^"].*'))
def test_tokenize_string_continues(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string(s, port)

    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_string_character

# * any lookahead, a non-empty input beginning with a string delimiter:
#   terminate an empty string by appending the delimiter to the lookahead and
#   transitioning to the tokenize_string_end state, without generating a token.
@given(text(), from_regex(r'^["].*'))
def test_tokenize_string_empty(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string(s, port)

    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_string_end

# * any lookahead, empty input: emit a tokenization error, as we've encountered
#   EOF inside of a string.
@given(text(), just(''))
def test_tokenize_string_eof(s, input):
    try:
        port = ReadablePort(input)
        token, lookahead, next = tokenize_string(s, port)

        assert False # must raise
    except TokenError:
        assert port.tell() == 0

# Cases for tokenize_string_character:

# * any lookahead, any non-empty input not beginning with a string delimiter or
#   escape character: append one character of input to the lookahead, then
#   continue in the tokenize_string_character state without generating a token.
@given(text(), from_regex(r'^[^\\"].*'))
def test_tokenize_string_character_continues(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string_character(s, port)

    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_string_character

# * any lookahead, any non-empty input which begins with an escape character:
#   leave the lookahead unchanged, but transition to the
#   tokenize_escaped_string_character state to determine which escape character
#   we're dealing with, without emitting a token.
@given(text(), from_regex(r'^[\\].*'))
def test_tokenize_string_character_begins_escape(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string_character(s, port)

    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_escaped_string_character

# * any lookahead, any non-empty input which begins with a string delimiter:
#   we're at the end of a string. Append the delimiter to the lookahead and
#   transition to the tokenize_string_end state, without generating a token.
@given(text(), from_regex(r'^["].*'))
def test_tokenize_string_character_ends(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string_character(s, port)

    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_string_end

# * any lookahead, empty input: emit a tokenization error, as we've encountered
#   EOF inside of a string literal.
@given(text(), just(''))
def test_tokenize_string_character_eof(s, input):
    try:
        port = ReadablePort(input)
        token, lookahead, next = tokenize_string_character(s, port)

        assert False # must raise
    except TokenError:
        assert input == ''
        assert port.tell() == 0

# Cases for tokenize_escaped_string_character:

# * any lookahead, any non-empty input beginning with a legal string escaped
#   character: de-escape the first character of the input, append the result to
#   the lookahead, then transition back to the tokenize_string_character state.
@given(text(), from_regex(r'^["\\].*'))
def test_tokenize_escaped_string_character_valid(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_escaped_string_character(s, port)

    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_string_character

# * any lookahead, any non-empty input not beginning with a legal string escaped
#   character: emit a tokenization error, as we've found an invalid string escape.
@given(text(), from_regex(r'^[^"\\].*'))
def test_tokenize_escaped_string_character_invalid(s, input):
    try:
        port = ReadablePort(input)
        token, lookahead, next = tokenize_escaped_string_character(s, port)

        assert False # must raise
    except TokenError:
        assert port.tell() == 1

# * any lookahead, empty input: emit a tokenization error, as we've found an
#   EOF inside of a string literal.
@given(text(), just(''))
def test_tokenize_escaped_string_character_eof(s, input):
    try:
        port = ReadablePort(input)
        token, lookahead, next = tokenize_escaped_string_character(s, port)

        assert False # must raise
    except TokenError:
        assert port.tell() == 0

# Cases for tokenize_string_end:

# * any lookahead, any input: generate a String token from the lookahead, then
#   transition back to the tokenize_any state with one character of lookahead
#   ready to go.
@given(text(), text())
def test_tokenize_string_end(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string_end(s, port)

    assert token == s
    assert port.tell() == (1 if input else 0)
    assert lookahead == (input[0] if input else '')
    assert next == tokenize_any
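
# A concrete companion to the string-state cases above: walking the string
# sub-machine by hand for the two-character literal `"a"`. This is only an
# orientation sketch; each step re-states, for one specific input, behaviour
# the property tests above already assert, so it adds no new coverage.
def test_tokenize_string_concrete_trace():
    # tokenize_atom has already routed the opening quote here as the lookahead.
    port = ReadablePort('a"')

    token, lookahead, next = tokenize_string('"', port)
    assert (token, lookahead, next) == (None, '"a', tokenize_string_character)

    token, lookahead, next = tokenize_string_character(lookahead, port)
    assert (token, lookahead, next) == (None, '"a"', tokenize_string_end)

    token, lookahead, next = tokenize_string_end(lookahead, port)
    assert token == '"a"'
    assert lookahead == ''
    assert next == tokenize_any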

# Cases for tokenize_atom:

# * lookahead containing a string delimiter, any input: found a string atom,
#   transition to the tokenize_string state without reading or generating a
#   token.
@given(just('"'), text())
def test_tokenize_atom_string(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_atom(s, port)

    assert token is None
    assert port.tell() == 0
    assert lookahead == s
    assert next == tokenize_string

# * lookahead containing something other than a string delimiter, any input:
#   found a nonstring atom, transition to the tokenize_nonstring_atom state
#   without reading or generating a token.
@given(from_regex(r'^[^"]'), text())
def test_tokenize_atom_nonstring(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_atom(s, port)

    assert token is None
    assert port.tell() == 0
    assert lookahead == s
    assert next == tokenize_nonstring_atom

# Cases for the tokenizer:

# * any sequence of separator-token pairs: if the pairs are coalesced into a
#   single giant input, does the tokenizer recover the tokens?
@given(spaced_token_sequences())
def test_tokenizer(spaced_tokens):
    input = ''.join(''.join(pair) for pair in spaced_tokens)
    tokens = [token for (_, token) in spaced_tokens]

    port = ReadablePort(input)

    assert list(tokenize(port)) == tokens
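
# Purely illustrative, and not the actinide implementation: a driver for the
# (token, lookahead, next_state) protocol exercised above might look roughly
# like the generator below. It assumes tokenize_any is the start state and
# tokenize_eof the trap state that signals exhaustion -- both inferred from
# the tests in this file rather than read out of actinide.tokenizer.
def _sketch_tokenize(port):
    lookahead, state = port.read(1), tokenize_any
    while state is not tokenize_eof:
        token, lookahead, state = state(lookahead, port)
        if token is not None:
            yield token

# Under those assumptions, list(_sketch_tokenize(ReadablePort('(+ 1 2)')))
# would be expected to yield ['(', '+', '1', '2', ')'].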