from hypothesis import given, settings, HealthCheck, event
from hypothesis.strategies import just, text, characters, from_regex, one_of, tuples, sampled_from

import io

from actinide.tokenizer import *

from .tokens import spaced_token_sequences


class ReadablePort(io.StringIO):
    def __repr__(self):
        # Slightly friendlier debugging output
        return f"ReadablePort(str={repr(self.getvalue())}, pos={self.tell()})"


# Many of the following tests proceed by cases, because the underlying behaviour
# is too complex to treat as a uniform set of properties. The cases are meant to
# be total, and in principle could be defined as a set of filters on the
# ``text()`` generator that, combined, exhaust the possible outcomes of that
# generator.
#
# Implementing the tests that way causes Hypothesis to generate a significant
# number of examples that it then throws away without verifying, because
# Hypothesis has no insight into filters to use when generating examples.
# Instead, this test suite specifies generators per-case.

# Cases for tokenize_any:
# We test this a bit differently from the subsequent tokenizer states. Because
# it's a pure routing state, we can generate (lookahead, expected_state) pairs
# and check them in one pass, rather than testing each possible outcome
# separately. In every case, the input is irrelevant: this state never reads.
def next_token_states():
    return one_of(
        tuples(just(''), just(tokenize_eof)),
        tuples(just(';'), just(tokenize_comment)),
        tuples(sampled_from('()'), just(tokenize_syntax)),
        tuples(sampled_from(' \t\n'), just(tokenize_whitespace)),
        tuples(just('"'), just(tokenize_atom)),
        tuples(characters(blacklist_characters=' \t\n();"'), just(tokenize_atom)),
    )


@given(next_token_states(), text())
def test_tokenize_any(lookahead_next, input):
    s, expected_state = lookahead_next
    port = ReadablePort(input)
    token, lookahead, next = tokenize_any(s, port)
    assert token is None
    assert lookahead == s
    assert next == expected_state
    assert port.tell() == 0


# Since the previous test case is rigged for success, also verify that no input
# causes tokenize_any to enter an unexpected state or to throw an exception.
@given(text(), text())
def test_tokenize_any_fuzz(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_any(s, port)
    assert token is None
    assert lookahead == s
    assert next in (tokenize_eof, tokenize_comment, tokenize_syntax, tokenize_whitespace, tokenize_atom)
    assert port.tell() == 0


# Cases for tokenize_eof:
# * any lookahead, any input: tokenize_eof is a trap state performing no reads,
#   always returning to itself, and never generating a token.
@given(text(), text())
def test_tokenize_eof(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_eof(s, port)
    assert token is None
    assert lookahead == s
    assert next == tokenize_eof
    assert port.tell() == 0

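
# All of the state functions exercised in this file share one protocol: each
# takes the current lookahead string and a readable port, and returns a
# (token, lookahead, next_state) triple. For orientation, here is a minimal
# sketch of a driver over that protocol. It is purely illustrative and is an
# assumption of this comment, not the library's code: the real entry point is
# tokenize() from actinide.tokenizer, whose internals may differ.
def _drive_states_sketch(port):
    # Prime one character of lookahead, then let the routing state dispatch.
    lookahead = port.read(1)
    state = tokenize_any
    while state is not tokenize_eof:
        token, lookahead, state = state(lookahead, port)
        if token is not None:
            yield token
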
# Cases for tokenize_comment:
# * any lookahead, one or more characters beginning with a non-newline as input:
#   tokenize_comment continues the current comment, throwing away one character
#   of input, without generating a token.
@given(text(), from_regex(r'^[^\n].*'))
def test_tokenize_comment_continues(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_comment(s, port)
    assert token is None
    assert port.tell() == 1
    assert lookahead == input[0]
    assert next == tokenize_comment


# * any lookahead, one or more characters beginning with a newline as input, and
# * any lookahead, empty input:
#   tokenize_comment concludes the current comment and prepares for the next
#   token, without generating a token.
@given(text(), just('') | from_regex(r'^\n.*'))
def test_tokenize_comment_ends(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_comment(s, port)
    assert token is None
    assert port.tell() == (1 if input else 0)
    assert lookahead == (input[0] if input else '')
    assert next == tokenize_any


# Cases for tokenize_syntax:
# * any lookahead, any input: generate the lookahead as a Syntax token and
#   transition back to tokenize_any to prepare for the next token, with one
#   character of lookahead ready to go.
@given(text(), text())
def test_tokenize_syntax(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_syntax(s, port)
    assert token == s
    assert port.tell() == (1 if input else 0)
    assert lookahead == (input[0] if input else '')
    assert next == tokenize_any


# Cases for tokenize_whitespace:
# * any lookahead, any input: throw away the presumed-whitespace lookahead, then
#   transition back to tokenize_any to prepare for the next token, with one
#   character of lookahead ready to go, without generating a token.
@given(text(), text())
def test_tokenize_whitespace(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_whitespace(s, port)
    assert token is None
    assert port.tell() == (1 if input else 0)
    assert lookahead == (input[0] if input else '')
    assert next == tokenize_any


# Cases for tokenize_nonstring_atom:
# * any lookahead, any non-empty input not beginning with whitespace, syntax, a
#   comment delimiter, or a string literal: accumulate one character of input
#   onto the lookahead, then transition back to tokenize_nonstring_atom to
#   process the next character of input, without generating a token.
@given(text(), from_regex(r'^[^ \n\t();"].*'))
def test_tokenize_nonstring_atom_continues(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_nonstring_atom(s, port)
    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_nonstring_atom


# * any lookahead, a non-empty input beginning with whitespace, syntax, a
#   comment delimiter, or a string literal, and
# * any lookahead, empty input:
#   generate the accumulated lookahead as a Symbol token, then transition back
#   to tokenize_any with one character of lookahead ready to go.
@given(text(), just('') | from_regex(r'^[ \n\t();"].*'))
def test_tokenize_nonstring_atom_ends(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_nonstring_atom(s, port)
    assert token == s
    assert port.tell() == (1 if input else 0)
    assert lookahead == (input[0] if input else '')
    assert next == tokenize_any

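
# A concrete instance of the two properties above, chaining them by hand:
# tokenizing the atom "ab" followed by a close paren. The input 'b)' and the
# starting lookahead 'a' are arbitrary illustrative values; the expected
# results follow directly from the tokenize_nonstring_atom properties just
# stated, assuming the state only ever consumes the port one character at a
# time (as the port.tell() assertions above indicate).
def test_tokenize_nonstring_atom_worked_example():
    port = ReadablePort('b)')
    # 'b' is not a delimiter: it is accumulated onto the lookahead.
    token, lookahead, state = tokenize_nonstring_atom('a', port)
    assert token is None
    assert lookahead == 'ab'
    assert state == tokenize_nonstring_atom
    # ')' is syntax: the accumulated lookahead is emitted as a Symbol token.
    token, lookahead, state = tokenize_nonstring_atom(lookahead, port)
    assert token == 'ab'
    assert lookahead == ')'
    assert state == tokenize_any
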
# And now, the _worst_ part of the state machine. Cases for tokenize_string:
# * any lookahead, a non-empty input not beginning with a string delimiter:
#   begin a non-empty string by appending one character of input to the
#   lookahead and transitioning to the tokenize_string_character state, without
#   generating a token.
@given(text(), from_regex(r'^[^"].*'))
def test_tokenize_string_continues(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string(s, port)
    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_string_character


# * any lookahead, a non-empty input beginning with a string delimiter:
#   terminate an empty string by appending the delimiter to the lookahead and
#   transitioning to the tokenize_string_end state, without generating a token.
@given(text(), from_regex(r'^["].*'))
def test_tokenize_string_empty(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string(s, port)
    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_string_end


# * any lookahead, empty input: emit a tokenization error, as we've encountered
#   EOF inside of a string.
@given(text(), just(''))
def test_tokenize_string_eof(s, input):
    port = ReadablePort(input)
    try:
        token, lookahead, next = tokenize_string(s, port)
        assert False  # must raise
    except TokenError:
        assert port.tell() == 0


# Cases for tokenize_string_character:
# * any lookahead, any non-empty input not beginning with a string delimiter or
#   escape character: append one character of input to the lookahead, then
#   continue in the tokenize_string_character state without generating a token.
@given(text(), from_regex(r'^[^\\"].*'))
def test_tokenize_string_character_continues(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string_character(s, port)
    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_string_character


# * any lookahead, any non-empty input which begins with an escape character:
#   append the escape character to the lookahead and transition to the
#   tokenize_escaped_string_character state to determine which escape we're
#   dealing with, without emitting a token.
@given(text(), from_regex(r'^[\\].*'))
def test_tokenize_string_character_begins_escape(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string_character(s, port)
    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_escaped_string_character


# * any lookahead, any non-empty input which begins with a string delimiter:
#   we're at the end of a string. Transition to the tokenize_string_end state
#   with the delimiter appended to the lookahead, without generating a token.
@given(text(), from_regex(r'^["].*'))
def test_tokenize_string_character_ends(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string_character(s, port)
    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_string_end


# * any lookahead, empty input: emit a tokenization error, as we've encountered
#   EOF inside of a string literal.
@given(text(), just(''))
def test_tokenize_string_character_eof(s, input):
    port = ReadablePort(input)
    try:
        token, lookahead, next = tokenize_string_character(s, port)
        assert False  # must raise
    except TokenError:
        assert input == ''
        assert port.tell() == 0

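
# The EOF cases above (and the error cases below) use a bare try/except with
# `assert False  # must raise`. If this suite runs under pytest -- an
# assumption, since nothing in this file imports it -- an equivalent and
# slightly tighter formulation would use pytest.raises; sketch only:
#
#     import pytest
#
#     @given(text(), just(''))
#     def test_tokenize_string_character_eof(s, input):
#         port = ReadablePort(input)
#         with pytest.raises(TokenError):
#             tokenize_string_character(s, port)
#         assert port.tell() == 0
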
# Cases for tokenize_escaped_string_character:
# * any lookahead, any non-empty input beginning with a legal string escape
#   character: de-escape the first character of the input, append the result to
#   the lookahead, then transition back to the tokenize_string_character state.
@given(text(), from_regex(r'^["\\].*'))
def test_tokenize_escaped_string_character_valid(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_escaped_string_character(s, port)
    assert token is None
    assert port.tell() == 1
    assert lookahead == s + input[0]
    assert next == tokenize_string_character


# * any lookahead, any non-empty input not beginning with a legal string escape
#   character: emit a tokenization error, as we've found an invalid string
#   escape.
@given(text(), from_regex(r'^[^"\\].*'))
def test_tokenize_escaped_string_character_invalid(s, input):
    port = ReadablePort(input)
    try:
        token, lookahead, next = tokenize_escaped_string_character(s, port)
        assert False  # must raise
    except TokenError:
        assert port.tell() == 1


# * any lookahead, empty input: emit a tokenization error, as we've found an EOF
#   inside of a string literal.
@given(text(), just(''))
def test_tokenize_escaped_string_character_eof(s, input):
    port = ReadablePort(input)
    try:
        token, lookahead, next = tokenize_escaped_string_character(s, port)
        assert False  # must raise
    except TokenError:
        assert port.tell() == 0


# Cases for tokenize_string_end:
# * any lookahead, any input: generate a String token from the lookahead, then
#   transition back to the tokenize_any state with one character of lookahead
#   ready to go.
@given(text(), text())
def test_tokenize_string_end(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_string_end(s, port)
    assert token == s
    assert port.tell() == (1 if input else 0)
    assert lookahead == (input[0] if input else '')
    assert next == tokenize_any


# Cases for tokenize_atom:
# * lookahead containing a string delimiter, any input: found a string atom,
#   transition to the tokenize_string state without reading or generating a
#   token.
@given(just('"'), text())
def test_tokenize_atom_string(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_atom(s, port)
    assert token is None
    assert port.tell() == 0
    assert lookahead == s
    assert next == tokenize_string


# * lookahead containing something other than a string delimiter, any input:
#   found a nonstring atom, transition to the tokenize_nonstring_atom state
#   without reading or generating a token.
@given(from_regex(r'^[^"]'), text())
def test_tokenize_atom_nonstring(s, input):
    port = ReadablePort(input)
    token, lookahead, next = tokenize_atom(s, port)
    assert token is None
    assert port.tell() == 0
    assert lookahead == s
    assert next == tokenize_nonstring_atom


# Cases for the tokenizer:
# * any sequence of separator-token pairs: if the pairs are coalesced into a
#   single giant input, does the tokenizer recover the tokens?
@given(spaced_token_sequences())
def test_tokenizer(spaced_tokens):
    input = ''.join(''.join(pair) for pair in spaced_tokens)
    tokens = [token for (_, token) in spaced_tokens]
    port = ReadablePort(input)
    assert list(tokenize(port)) == tokens

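
# A concrete end-to-end companion to the property above, using a small
# hand-written input. The expected token list is derived from the per-state
# properties earlier in this file (syntax characters and symbols are emitted
# verbatim; whitespace and comments produce no tokens), assuming tokenize()
# chains the states in the obvious way. It is an illustrative sketch, not an
# authoritative fixture for the actinide tokenizer.
def test_tokenizer_worked_example():
    port = ReadablePort('(foo bar) ; trailing comment\n')
    assert list(tokenize(port)) == ['(', 'foo', 'bar', ')']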