from hypothesis import given

from actinide.tokenizer import *
from actinide.ports import *

from .tokens import spaced_token_sequences, tokens, nontokens
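
# The helper strategies imported from .tokens drive the cases below:
# tokens() generates a string forming a single valid token, nontokens()
# generates a string guaranteed to contain no token at all, and
# spaced_token_sequences() generates a list of (separator, token) pairs.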

# Cases for the tokenizer:

# * any single token: reads back that token.
@given(tokens())
def test_tokenizer_single_token(source):
    port = string_to_input_port(source)

    assert read_token(port) == source

# * any input guaranteed not to contain a token: reads back None, consuming the
#   whole input in the process.
@given(nontokens())
def test_tokenizer_no_token(source):
    port = string_to_input_port(source)

    assert read_token(port) is None

# * any sequence of separator-token pairs: coalesced into a single input,
#   reads back the original tokens in order.
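#   For example, a generated sequence like [(' ', 'foo'), ('\n', 'bar')]
#   (hypothetical token values) coalesces to the input ' foo\nbar', from
#   which the tokenizer should read back ['foo', 'bar'].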
@given(spaced_token_sequences())
def test_tokenizer_spaced_sequence(spaced_tokens):
    source = ''.join(''.join(pair) for pair in spaced_tokens)
    expected = [token for (_, token) in spaced_tokens]

    port = string_to_input_port(source)
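    # Drain the port: read_token returns one token per call and None once
    # the input is exhausted.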
    def iterate_read_token(port):
        token = read_token(port)
        while token is not None:
            yield token
            token = read_token(port)

    assert list(iterate_read_token(port)) == expected
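
# The explicit drain loop above can also be written with iter()'s
# two-argument sentinel form. A minimal equivalent sketch (an alternative
# formulation, not a helper the project itself defines), assuming None is
# read_token's only end-of-input signal:
from functools import partial  # import placed here to keep the sketch self-contained

def read_all_tokens(port):
    # iter(callable, sentinel) keeps calling read_token(port) and yields
    # each token until the sentinel value None comes back.
    return iter(partial(read_token, port), None)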