| author | Owen Jacobson <owen@grimoire.ca> | 2017-11-08 02:16:50 -0500 |
|---|---|---|
| committer | Owen Jacobson <owen@grimoire.ca> | 2017-11-08 02:22:04 -0500 |
| commit | 0fcc2dc618f2eb00d8cf82ce328c98e0ea9f2626 | |
| tree | adabcf3d10a9204cc2adcea16404fe8b13afc16d | |
| parent | cb4f719cc09ab723acd72df4b727b1740650d87c | |
Wrote the tokenizer.
| -rw-r--r-- | .gitignore | 2 |
|---|---|---|
| -rw-r--r-- | README.rst | 17 |
| -rw-r--r-- | actinide/__init__.py | 0 |
| -rw-r--r-- | actinide/tokenizer.py | 204 |
| -rw-r--r-- | actinide/types.py | 12 |
| -rwxr-xr-x | bin/actinide-repl | 6 |
| -rw-r--r-- | setup.py | 8 |
7 files changed, 245 insertions, 4 deletions
```diff
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..14d2ac5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/*.egg-info
+__pycache__/
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -42,6 +42,12 @@ fill up RAM, and drain your battery.
 
 .. _lispy: http://norvig.com/lispy.html
 
 ************
+Requirements
+************
+
+Actinide requires Python 3.6 or later.
+
+************
 Installation
 ************
@@ -57,10 +63,13 @@ Or, if you prefer, add ``actinide`` to your application's ``Pipfile`` or
 Freestanding REPL
 *****************
 
-The Actinide interpreter can be started interactively using the `actinide-repl`
-command. In this mode, Actinide forms can be entered interactively. The REPL
-will immediately evaluate each top-level form, then print the result of that
-evaluation.
+**Note: this section is presently incorrect - the ``actinide-repl`` command
+instead contains a test harness for the tokenizer.**
+
+The Actinide interpreter can be started interactively using the
+``actinide-repl`` command. In this mode, Actinide forms can be entered
+interactively. The REPL will immediately evaluate each top-level form, then
+print the result of that evaluation.
 
 To exit the REPL, type an end-of-file (Ctrl-D on most OSes, Ctrl-Z on
 Windows).
diff --git a/actinide/__init__.py b/actinide/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/actinide/__init__.py
diff --git a/actinide/tokenizer.py b/actinide/tokenizer.py
new file mode 100644
index 0000000..f950b91
--- /dev/null
+++ b/actinide/tokenizer.py
@@ -0,0 +1,204 @@
+from .types import *
+
+# ## TOKENIZATION
+#
+# The following code implements a state-machine-driven tokenizer which can
+# separate an input port into a sequence of lisp tokens. The following token
+# classes are defined:
+#
+# * Comments: ``;`` followed by all bytes to EOF or to the end of the line.
+#
+# * Strings: ``"`` through to the next unescaped ``"`` are read, de-escaped, and
+#   returned. The sequences ``\"``, ``\\``, and ``\n`` are treated specially:
+#   they de-escape to ``"``, ``\``, and a newline. An unclosed string literal or
+#   an unknown escape sequence is a tokenization error.
+#
+# * Open and close parens: ``(`` and ``)`` are returned as freestanding tokens.
+#
+# * Whitespace: Space, horizontal tab, and newline characters are discarded
+#   during tokenization.
+#
+# * Symbols: Any sequence of characters not included in one of the above classes
+#   is read and returned as a single token. This includes words, numbers, and
+#   special literals.
+#
+# Internally, the tokenizer is a state machine which maintains two pieces of
+# state: the "lookahead" (holding data to feed to the next state transition
+# function) and the next state transition function. The top-level ``tokenize``
+# function acts as a trampoline, repeatedly calling ``next`` until input is
+# exhausted, yielding tokens any time ``next`` includes a token in its return
+# value.
+#
+# The various ``next`` functions take the current lookahead and the port,
+# perform whatever logic is needed (including, potentially, reading from the
+# port) to determine the next state, and return a 3-tuple of ``token`` (may be
+# ``None``), ``lookahead`` (which replaces the previous lookahead), and ``next``
+# (the new next state transition function).
+
+class TokenError(Exception):
+    '''
+    Raised during ``tokenize`` if the input stream cannot be divided into legal
+    tokens.
+    '''
+    pass
+
```
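Before walking through the state functions, it helps to see what the finished machine produces. A minimal sketch (not part of the commit) of driving the tokenizer, assuming the package is importable and using ``io.StringIO`` as the port; anything with a ``read(n)`` method works:

```python
import io

from actinide.tokenizer import tokenize

# Tokenize an in-memory port. Syntax, Symbol, and String are all str
# subclasses, so the resulting tokens print as plain strings.
tokens = list(tokenize(io.StringIO('(define (inc n) (+ n 1)) ; bump')))
print(tokens)
# ['(', 'define', '(', 'inc', 'n', ')', '(', '+', 'n', '1', ')', ')']
```

The trampoline and the simple states come first: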
```diff
+# Tokenize a port, producing a generator that yields successive tokens as it's
+# advanced.
+#
+# This is the top-level driver for the state machine that divides the underlying
+# input into tokens. It does no input handling itself, other than reading the
+# first character of the port: so long as the lookahead is non-empty, this calls
+# the next state transition function to determine what to do and how to change
+# the lookahead.
+#
+# Initially, this is in the ``tokenize_any`` state.
+def tokenize(port):
+    lookahead, next = port.read(1), tokenize_any
+    while len(lookahead) > 0:
+        token, lookahead, next = next(lookahead, port)
+        if token is not None:
+            yield token
+
+# If the lookahead is exactly one read result, this will correctly determine the
+# next token type and return that state without consuming input. This is
+# generally the correct state to transition to any time the next token is
+# unknown - for example, at the end of another token.
+#
+# This never produces a token directly. It can transition to the tokenizer state
+# for any token type, as well as to the trap state for EOF.
+def tokenize_any(lookahead, port):
+    if lookahead == '':
+        return None, lookahead, tokenize_eof
+    if lookahead == ';':
+        return None, lookahead, tokenize_comment
+    if lookahead in '()':
+        return None, lookahead, tokenize_syntax
+    if lookahead in ' \t\n':
+        return None, lookahead, tokenize_whitespace
+    if lookahead == '"':
+        return None, lookahead, tokenize_string
+    return None, lookahead, tokenize_symbol
+
+# Special trap state. This never produces a token, and always transitions to
+# itself. The lookahead in this state is generally ``''``, and since this never
+# performs any further reads, it will remain that value indefinitely.
+#
+# The top-level parser exits in this situation by examining ``lookahead``, but
+# it's possible to reach this state from string literal tokenization or after a
+# comment.
+def tokenize_eof(lookahead, port):
+    return None, lookahead, tokenize_eof
+
+# Consumes one read result at a time until it finds an end of line or runs out
+# of input. This throws away comments entirely, at parse time, without
+# considering whether the comment content can be separated into tokens. As this
+# scans the comment, the lookahead will be set to successive characters from the
+# port, but never more than one character at a time.
+#
+# This never produces a token.
+def tokenize_comment(lookahead, port):
+    next = port.read(1)
+    if next != '\n':
+        return None, next, tokenize_comment
+    return None, next, tokenize_any
+
+# Consumes the lookahead and packages it up as a Syntax token. This is generally
+# appropriate for the ``(`` and ``)`` syntactic elements.
+#
+# The resulting lookahead will be the next character of input, and this always
+# dispatches back to ``tokenize_any`` so that the next token (if any) can be
+# determined.
+def tokenize_syntax(lookahead, port):
+    return Syntax(lookahead), port.read(1), tokenize_any
+
+# Consumes and ignores whitespace in the input. This never produces a token, and
+# throws away the lookahead entirely. The resulting lookahead is the next
+# character of input.
+def tokenize_whitespace(lookahead, port):
+    return None, port.read(1), tokenize_any
+
```
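Because the states are plain functions, the 3-tuple protocol can also be stepped by hand. A small sketch (again not part of the commit, and assuming the same imports are available):

```python
import io

from actinide.tokenizer import tokenize_any, tokenize_syntax

port = io.StringIO('()')
lookahead = port.read(1)  # '('

# tokenize_any consumes no input; it only selects the next state.
token, lookahead, state = tokenize_any(lookahead, port)
assert token is None and lookahead == '(' and state is tokenize_syntax

# tokenize_syntax emits the paren as a token and refills the lookahead.
token, lookahead, state = state(lookahead, port)
assert token == '(' and lookahead == ')' and state is tokenize_any
```

The symbol and string states complete the machine: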
```diff
+# Consumes characters until it finds a character which cannot be part of a token
+# or until it finds the end of input, accumulating them into a single Symbol
+# token. This is a heavily-overloaded token category, as it contains not only
+# Actinide symbols but also all non-String literals.
+#
+# While the tokenizer remains in this state, the lookahead accumulates the
+# characters of the token. When this matches a completed token, it produces a
+# Symbol token, and resets the lookahead back to a single read result containing
+# the next character of input.
+def tokenize_symbol(lookahead, port):
+    next = port.read(1)
+    if next == '':
+        return Symbol(lookahead), next, tokenize_eof
+    if next in '"(); \t\n':
+        return Symbol(lookahead), next, tokenize_any
+    return None, lookahead + next, tokenize_symbol
+
+# ### STRINGS
+#
+# The following states handle string literals in the input stream. String
+# literals are fairly simple: they begin with a quote, contain arbitrary
+# characters other than a bare \ or ", and end with a quote. The sequences
+# ``\\`` and ``\"`` are de-escaped by removing the leading backslash, ``\n``
+# becomes a newline, and the results are included in the resulting string.
+#
+# These states use the lookahead to accumulate the characters of the string. On
+# transition back to ``tokenize_any``, the lookahead is always set back to a
+# single character. If, at any point, these states encounter EOF, they raise a
+# ``TokenError``: no legal token in Actinide begins with a quote mark and ends
+# with EOF.
+
+# The lookahead is assumed to be the opening quote of a string, and discarded.
+# Read forwards one character to determine whether this is an empty string
+# literal or not, then proceed either to ``tokenize_string_end`` for an empty
+# string, or to ``tokenize_string_character`` for a non-empty string.
+#
+# This never yields a token. The lookahead is set to the characters of the
+# string read so far.
+def tokenize_string(lookahead, port):
+    next = port.read(1)
+    if next == '':
+        raise TokenError('Unclosed string literal')
+    if next == '"':
+        return None, '', tokenize_string_end
+    return None, next, tokenize_string_character
+
+# The lookahead contains the body of the string read so far. Reads forwards one
+# character to determine if the string continues, contains an escaped character,
+# or ends.
+#
+# This never yields a token.
+def tokenize_string_character(lookahead, port):
+    next = port.read(1)
+    if next == '':
+        raise TokenError('Unclosed string literal')
+    if next == '\\':
+        return None, lookahead, tokenize_escaped_string_character
+    if next == '"':
+        return None, lookahead, tokenize_string_end
+    return None, lookahead + next, tokenize_string_character
+
+# The lookahead contains the body of the string so far. Reads forwards one
+# character to determine which, if any, escaped character to process: if it's
+# one we recognize, de-escape it and append it to the string, otherwise raise a
+# TokenError.
+#
+# This never yields a token, and always dispatches back to
+# ``tokenize_string_character`` on a legal escape character.
+def tokenize_escaped_string_character(lookahead, port):
+    next = port.read(1)
+    if next == '':
+        raise TokenError('Unclosed string literal')
+    if next == 'n':
+        return None, lookahead + '\n', tokenize_string_character
+    if next == '\\':
+        return None, lookahead + '\\', tokenize_string_character
+    if next == '"':
+        return None, lookahead + '"', tokenize_string_character
+    raise TokenError(f"Unknown string escape '\\{next}'")
+
+# Package the lookahead (the full string body, de-escaped and without leading
+# and trailing quotes) up as a String token and return it, then transition back
+# to the ``tokenize_any`` state with a single read result in the lookahead.
+def tokenize_string_end(lookahead, port):
+    return String(lookahead), port.read(1), tokenize_any
```
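Together, these states accept exactly the escapes documented above and reject everything else. A quick illustration (not part of the commit; assumes the package is importable):

```python
import io

from actinide.tokenizer import TokenError, tokenize

# \" and \\ drop the backslash; \n becomes a literal newline.
print(list(tokenize(io.StringIO(r'"say \"hi\"\n"'))))
# ['say "hi"\n']

# Hitting EOF inside a string literal is a tokenization error.
try:
    list(tokenize(io.StringIO('"unterminated')))
except TokenError as error:
    print(error)  # Unclosed string literal
```

The remaining files define the token representations, the stand-in REPL script, and the packaging: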
```diff
diff --git a/actinide/types.py b/actinide/types.py
new file mode 100644
index 0000000..2b618f4
--- /dev/null
+++ b/actinide/types.py
@@ -0,0 +1,12 @@
+__all__ = ['String', 'Symbol', 'Syntax']
+
+# ## REPRESENTATIONS
+#
+# The following definitions specify the Python representations of various
+# Actinide types.
+
+String = str
+class Symbol(str):
+    pass
+class Syntax(str):
+    pass
diff --git a/bin/actinide-repl b/bin/actinide-repl
new file mode 100755
index 0000000..7909d36
--- /dev/null
+++ b/bin/actinide-repl
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+
+import sys
+import actinide.tokenizer as at
+
+print(repr(list(at.tokenize(sys.stdin))))
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..21fffac
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,8 @@
+from setuptools import setup, find_packages
+
+setup(
+    name='actinide',
+    version='0.1',
+    packages=find_packages(),
+    scripts=['bin/actinide-repl'],
+)
```
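The representations in ``types.py`` are deliberately thin: tokens compare equal to their text, but later passes can still dispatch on their type. For example:

```python
from actinide.types import String, Symbol, Syntax

sym, syn = Symbol('('), Syntax('(')
assert sym == syn                   # str equality ignores the subclass
assert type(sym) is not type(syn)   # but the classes remain distinguishable
assert String('hi') == 'hi'         # String is an alias for str itself
```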
