Add expression-evaluator: DAGs & state machines tutorial project

Educational calculator teaching FSMs (explicit transition-table tokenizer) and DAGs (recursive-descent parser with AST evaluation). Includes a CLI with a REPL, Graphviz visualization, and 61 tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-08 18:09:42 +00:00
parent 3a8705ece8
commit 01d5532823
11 changed files with 1557 additions and 0 deletions
--- a/python/expression-evaluator/tokenizer.py
+++ b/python/expression-evaluator/tokenizer.py
@@ -0,0 +1,306 @@
+"""
+Part 1: State Machine Tokenizer
+================================
+A tokenizer (lexer) converts raw text into a stream of tokens.
+This implementation uses an EXPLICIT finite state machine (FSM):
+
+  - States are named values (an enum), not implicit control flow
+  - A transition table maps (current_state, input_class) -> (next_state, action)
+  - The main loop reads one character at a time and consults the table
+
+This is the same pattern used in:
+  - Network protocol parsers (HTTP, TCP state machines)
+  - Regular expression engines
+  - Compiler front-ends (lexers for C, Python, etc.)
+  - Game AI (enemy behavior states)
+
+Key FSM concepts demonstrated:
+  - States: the "memory" of what we're currently building
+  - Transitions: rules for moving between states based on input
+  - Actions: side effects (emit a token, accumulate a character)
+  - Mealy machine: outputs depend on both state AND input
+"""
+
+from dataclasses import dataclass
+from enum import Enum
+
+
+# ---------- Token types ----------
+
+class TokenType(Enum):
+    NUMBER      = "NUMBER"
+    PLUS        = "PLUS"
+    MINUS       = "MINUS"
+    MULTIPLY    = "MULTIPLY"
+    DIVIDE      = "DIVIDE"
+    POWER       = "POWER"
+    LPAREN      = "LPAREN"
+    RPAREN      = "RPAREN"
+    UNARY_MINUS = "UNARY_MINUS"
+    EOF         = "EOF"
+
+
+@dataclass
+class Token:
+    type: TokenType
+    value: str          # raw text: "42", "+", "(", etc.
+    position: int       # character offset in original expression
+
+    def __repr__(self):
+        return f"Token({self.type.name}, {self.value!r}, pos={self.position})"
+
+
+OPERATOR_MAP = {
+    '+': TokenType.PLUS,
+    '-': TokenType.MINUS,
+    '*': TokenType.MULTIPLY,
+    '/': TokenType.DIVIDE,
+    '^': TokenType.POWER,
+}
+
+
+# ---------- FSM state definitions ----------
+
+class State(Enum):
+    """
+    The tokenizer's finite set of states.
+
+    START   -- idle / between tokens, deciding what comes next
+    INTEGER -- accumulating digits of an integer (e.g. "12" so far)
+    DECIMAL -- accumulating digits after a decimal point (e.g. "12.3" so far)
+    """
+    START   = "START"
+    INTEGER = "INTEGER"
+    DECIMAL = "DECIMAL"
+
+
+class CharClass(Enum):
+    """
+    Character classification -- groups raw characters into categories
+    so the transition table stays small and readable.
+    """
+    DIGIT    = "DIGIT"
+    DOT      = "DOT"
+    OPERATOR = "OPERATOR"
+    LPAREN   = "LPAREN"
+    RPAREN   = "RPAREN"
+    SPACE    = "SPACE"
+    EOF      = "EOF"
+    UNKNOWN  = "UNKNOWN"
+
+
+class Action(Enum):
+    """
+    What the FSM does on a transition. In a Mealy machine, the output
+    (action) depends on both the current state AND the input.
+    """
+    ACCUMULATE              = "ACCUMULATE"
+    EMIT_NUMBER             = "EMIT_NUMBER"
+    EMIT_OPERATOR           = "EMIT_OPERATOR"
+    EMIT_LPAREN             = "EMIT_LPAREN"
+    EMIT_RPAREN             = "EMIT_RPAREN"
+    EMIT_NUMBER_THEN_OP     = "EMIT_NUMBER_THEN_OP"
+    EMIT_NUMBER_THEN_LPAREN = "EMIT_NUMBER_THEN_LPAREN"
+    EMIT_NUMBER_THEN_RPAREN = "EMIT_NUMBER_THEN_RPAREN"
+    EMIT_NUMBER_THEN_DONE   = "EMIT_NUMBER_THEN_DONE"
+    SKIP                    = "SKIP"
+    DONE                    = "DONE"
+    ERROR                   = "ERROR"
+
+
+@dataclass(frozen=True)
+class Transition:
+    next_state: State
+    action: Action
+
+
+# ---------- Transition table ----------
+# This is the heart of the state machine. Each (state, char_class) key maps
+# to exactly one Transition: a next state and an action to perform.
+# Making this a data structure (not nested if/else) means we can:
+#   1. Inspect it programmatically (e.g. to generate a diagram)
+#   2. Verify completeness (every combination is covered)
+#   3. Understand the FSM at a glance
+#
+# The table holds 3 states x 7 character classes = 21 entries.
+# CharClass.UNKNOWN has no entries on purpose: tokenize() rejects unknown
+# characters before it consults the table. Any other missing key makes
+# tokenize() raise a "no transition" TokenError.
+
+TRANSITIONS = {
+    # --- START: between tokens, dispatch based on character class ---
+    (State.START, CharClass.DIGIT):    Transition(State.INTEGER, Action.ACCUMULATE),
+    # A leading dot starts a number directly in DECIMAL (e.g. ".5").
+    (State.START, CharClass.DOT):      Transition(State.DECIMAL, Action.ACCUMULATE),
+    (State.START, CharClass.OPERATOR): Transition(State.START,   Action.EMIT_OPERATOR),
+    (State.START, CharClass.LPAREN):   Transition(State.START,   Action.EMIT_LPAREN),
+    (State.START, CharClass.RPAREN):   Transition(State.START,   Action.EMIT_RPAREN),
+    (State.START, CharClass.SPACE):    Transition(State.START,   Action.SKIP),
+    (State.START, CharClass.EOF):      Transition(State.START,   Action.DONE),
+
+    # --- INTEGER: accumulating digits like "123" ---
+    (State.INTEGER, CharClass.DIGIT):    Transition(State.INTEGER, Action.ACCUMULATE),
+    # The first dot moves the number from INTEGER to DECIMAL.
+    (State.INTEGER, CharClass.DOT):      Transition(State.DECIMAL, Action.ACCUMULATE),
+    (State.INTEGER, CharClass.OPERATOR): Transition(State.START,   Action.EMIT_NUMBER_THEN_OP),
+    (State.INTEGER, CharClass.LPAREN):   Transition(State.START,   Action.EMIT_NUMBER_THEN_LPAREN),
+    (State.INTEGER, CharClass.RPAREN):   Transition(State.START,   Action.EMIT_NUMBER_THEN_RPAREN),
+    # Whitespace ends the number; the space itself produces no token.
+    (State.INTEGER, CharClass.SPACE):    Transition(State.START,   Action.EMIT_NUMBER),
+    (State.INTEGER, CharClass.EOF):      Transition(State.START,   Action.EMIT_NUMBER_THEN_DONE),
+
+    # --- DECIMAL: accumulating digits after "." like "123.45" ---
+    (State.DECIMAL, CharClass.DIGIT):    Transition(State.DECIMAL, Action.ACCUMULATE),
+    # A second dot inside one number is the only ERROR transition.
+    (State.DECIMAL, CharClass.DOT):      Transition(State.START,   Action.ERROR),
+    (State.DECIMAL, CharClass.OPERATOR): Transition(State.START,   Action.EMIT_NUMBER_THEN_OP),
+    (State.DECIMAL, CharClass.LPAREN):   Transition(State.START,   Action.EMIT_NUMBER_THEN_LPAREN),
+    (State.DECIMAL, CharClass.RPAREN):   Transition(State.START,   Action.EMIT_NUMBER_THEN_RPAREN),
+    (State.DECIMAL, CharClass.SPACE):    Transition(State.START,   Action.EMIT_NUMBER),
+    (State.DECIMAL, CharClass.EOF):      Transition(State.START,   Action.EMIT_NUMBER_THEN_DONE),
+}
+
+
+# ---------- Errors ----------
+
+class TokenError(Exception):
+    def __init__(self, message, position):
+        self.position = position
+        super().__init__(f"Token error at position {position}: {message}")
+
+
+# ---------- Character classification ----------
+
+def classify(ch):
+    """Map a single character to its CharClass."""
+    if ch.isdigit():
+        return CharClass.DIGIT
+    if ch == '.':
+        return CharClass.DOT
+    if ch in OPERATOR_MAP:
+        return CharClass.OPERATOR
+    if ch == '(':
+        return CharClass.LPAREN
+    if ch == ')':
+        return CharClass.RPAREN
+    if ch.isspace():
+        return CharClass.SPACE
+    return CharClass.UNKNOWN
+
+
+# ---------- Main tokenize function ----------
+
+def tokenize(expression):
+    """
+    Process an expression string through the state machine, producing tokens.
+
+    The main loop:
+      1. Classify the current character
+      2. Look up (state, char_class) in the transition table
+      3. Execute the action (accumulate, emit, skip, etc.)
+      4. Move to the next state
+      5. Advance to the next character
+
+    After all tokens are emitted, a post-processing step resolves
+    unary minus: if a MINUS token appears at the start, after an operator,
+    or after LPAREN, it is re-classified as UNARY_MINUS.
+    """
+    state = State.START
+    buffer = []             # characters accumulated for the current token
+    buffer_start = 0        # position where the current buffer started
+    tokens = []
+    pos = 0
+
+    # Append a sentinel so EOF is handled uniformly in the loop
+    chars = expression + '\0'
+
+    while pos <= len(expression):
+        ch = chars[pos]
+        char_class = CharClass.EOF if pos == len(expression) else classify(ch)
+
+        if char_class == CharClass.UNKNOWN:
+            raise TokenError(f"unexpected character {ch!r}", pos)
+
+        # Look up the transition
+        key = (state, char_class)
+        transition = TRANSITIONS.get(key)
+        if transition is None:
+            raise TokenError(f"no transition for state={state.name}, input={char_class.name}", pos)
+
+        action = transition.action
+        next_state = transition.next_state
+
+        # --- Execute the action ---
+
+        if action == Action.ACCUMULATE:
+            if not buffer:
+                buffer_start = pos
+            buffer.append(ch)
+
+        elif action == Action.EMIT_NUMBER:
+            tokens.append(Token(TokenType.NUMBER, ''.join(buffer), buffer_start))
+            buffer.clear()
+
+        elif action == Action.EMIT_OPERATOR:
+            tokens.append(Token(OPERATOR_MAP[ch], ch, pos))
+
+        elif action == Action.EMIT_LPAREN:
+            tokens.append(Token(TokenType.LPAREN, ch, pos))
+
+        elif action == Action.EMIT_RPAREN:
+            tokens.append(Token(TokenType.RPAREN, ch, pos))
+
+        elif action == Action.EMIT_NUMBER_THEN_OP:
+            tokens.append(Token(TokenType.NUMBER, ''.join(buffer), buffer_start))
+            buffer.clear()
+            tokens.append(Token(OPERATOR_MAP[ch], ch, pos))
+
+        elif action == Action.EMIT_NUMBER_THEN_LPAREN:
+            tokens.append(Token(TokenType.NUMBER, ''.join(buffer), buffer_start))
+            buffer.clear()
+            tokens.append(Token(TokenType.LPAREN, ch, pos))
+
+        elif action == Action.EMIT_NUMBER_THEN_RPAREN:
+            tokens.append(Token(TokenType.NUMBER, ''.join(buffer), buffer_start))
+            buffer.clear()
+            tokens.append(Token(TokenType.RPAREN, ch, pos))
+
+        elif action == Action.EMIT_NUMBER_THEN_DONE:
+            tokens.append(Token(TokenType.NUMBER, ''.join(buffer), buffer_start))
+            buffer.clear()
+
+        elif action == Action.SKIP:
+            pass
+
+        elif action == Action.DONE:
+            pass
+
+        elif action == Action.ERROR:
+            raise TokenError(f"unexpected {ch!r} in state {state.name}", pos)
+
+        state = next_state
+        pos += 1
+
+    # --- Post-processing: resolve unary minus ---
+    # A MINUS is unary if it appears:
+    #   - at the very start of the token stream
+    #   - immediately after an operator (+, -, *, /, ^) or LPAREN
+    # This context-sensitivity cannot be captured by the FSM alone --
+    # it requires looking at previously emitted tokens.
+    _resolve_unary_minus(tokens)
+
+    tokens.append(Token(TokenType.EOF, '', len(expression)))
+    return tokens
+
+
+def _resolve_unary_minus(tokens):
+    """
+    Convert binary MINUS tokens to UNARY_MINUS where appropriate.
+
+    Why this isn't in the FSM: the FSM processes characters one at a time
+    and only tracks what kind of token it's currently building (its state).
+    But whether '-' is unary or binary depends on the PREVIOUS TOKEN --
+    information the FSM doesn't track. This is a common real-world pattern:
+    the lexer handles most work, then a lightweight post-pass adds context.
+    """
+    unary_predecessor = {
+        TokenType.PLUS, TokenType.MINUS, TokenType.MULTIPLY,
+        TokenType.DIVIDE, TokenType.POWER, TokenType.LPAREN,
+        TokenType.UNARY_MINUS,
+    }
+    for i, token in enumerate(tokens):
+        if token.type != TokenType.MINUS:
+            continue
+        if i == 0 or tokens[i - 1].type in unary_predecessor:
+            tokens[i] = Token(TokenType.UNARY_MINUS, token.value, token.position)