Add expression-evaluator: DAGs & state machines tutorial project
Educational calculator teaching FSMs (explicit transition table tokenizer) and DAGs (recursive descent parser with AST evaluation). Includes CLI with REPL, graphviz visualization, and 61 tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
306
python/expression-evaluator/tokenizer.py
Normal file
306
python/expression-evaluator/tokenizer.py
Normal file
@@ -0,0 +1,306 @@
|
||||
"""
|
||||
Part 1: State Machine Tokenizer
|
||||
================================
|
||||
A tokenizer (lexer) converts raw text into a stream of tokens.
|
||||
This implementation uses an EXPLICIT finite state machine (FSM):
|
||||
|
||||
- States are named values (an enum), not implicit control flow
|
||||
- A transition table maps (current_state, input_class) -> (next_state, action)
|
||||
- The main loop reads one character at a time and consults the table
|
||||
|
||||
This is the same pattern used in:
|
||||
- Network protocol parsers (HTTP, TCP state machines)
|
||||
- Regular expression engines
|
||||
- Compiler front-ends (lexers for C, Python, etc.)
|
||||
- Game AI (enemy behavior states)
|
||||
|
||||
Key FSM concepts demonstrated:
|
||||
- States: the "memory" of what we're currently building
|
||||
- Transitions: rules for moving between states based on input
|
||||
- Actions: side effects (emit a token, accumulate a character)
|
||||
- Mealy machine: outputs depend on both state AND input
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
|
||||
# ---------- Token types ----------
|
||||
|
||||
class TokenType(Enum):
    """Every kind of token that can appear in the tokenizer's output."""

    # Numeric literal
    NUMBER = "NUMBER"

    # Binary operators, one per single-character symbol
    PLUS = "PLUS"
    MINUS = "MINUS"
    MULTIPLY = "MULTIPLY"
    DIVIDE = "DIVIDE"
    POWER = "POWER"

    # Grouping
    LPAREN = "LPAREN"
    RPAREN = "RPAREN"

    # Produced only by the unary-minus post-pass, which re-labels MINUS tokens
    UNARY_MINUS = "UNARY_MINUS"

    # Terminator appended after the last real token
    EOF = "EOF"
|
||||
|
||||
|
||||
@dataclass
class Token:
    """One lexical unit: its kind, its source text, and where it started."""

    type: TokenType  # which kind of token this is
    value: str       # raw text: "42", "+", "(", etc.
    position: int    # character offset in original expression

    def __repr__(self):
        """Return a compact form such as ``Token(NUMBER, '42', pos=0)``."""
        kind = self.type.name
        return "Token({}, {!r}, pos={})".format(kind, self.value, self.position)
|
||||
|
||||
|
||||
# Single-character operator symbols and the token type each one produces.
# Membership in this mapping is also what classify() uses to recognise an
# operator character.
OPERATOR_MAP = {
    "+": TokenType.PLUS,
    "-": TokenType.MINUS,
    "*": TokenType.MULTIPLY,
    "/": TokenType.DIVIDE,
    "^": TokenType.POWER,
}
|
||||
|
||||
|
||||
# ---------- FSM state definitions ----------
|
||||
|
||||
class State(Enum):
    """
    The states the tokenizer FSM can be in.

    The state records which kind of token, if any, is currently being
    built from the characters read so far.
    """

    # Between tokens: nothing is buffered, the next character decides
    START = "START"

    # Inside the digits of a number, no decimal point seen yet (e.g. "12")
    INTEGER = "INTEGER"

    # A decimal point has been consumed (e.g. "12.", "12.3" or ".5" so far)
    DECIMAL = "DECIMAL"
|
||||
|
||||
|
||||
class CharClass(Enum):
    """
    Input alphabet of the FSM.

    Raw characters are bucketed into these categories so the transition
    table needs one row per category instead of one row per character.
    """

    DIGIT = "DIGIT"
    DOT = "DOT"
    OPERATOR = "OPERATOR"  # any key of OPERATOR_MAP
    LPAREN = "LPAREN"
    RPAREN = "RPAREN"
    SPACE = "SPACE"
    EOF = "EOF"            # end of input; not tied to a real character
    UNKNOWN = "UNKNOWN"    # anything that fits no other category
|
||||
|
||||
|
||||
class Action(Enum):
    """
    Side effect attached to a transition.

    The FSM is a Mealy machine: the action is chosen by the pair
    (current state, input class), not by the state alone.
    """

    # Add the current character to the pending number
    ACCUMULATE = "ACCUMULATE"

    # Emit a token for the pending number and/or the current character
    EMIT_NUMBER = "EMIT_NUMBER"
    EMIT_OPERATOR = "EMIT_OPERATOR"
    EMIT_LPAREN = "EMIT_LPAREN"
    EMIT_RPAREN = "EMIT_RPAREN"

    # Compound actions: close the pending number, then handle the
    # character that terminated it
    EMIT_NUMBER_THEN_OP = "EMIT_NUMBER_THEN_OP"
    EMIT_NUMBER_THEN_LPAREN = "EMIT_NUMBER_THEN_LPAREN"
    EMIT_NUMBER_THEN_RPAREN = "EMIT_NUMBER_THEN_RPAREN"
    EMIT_NUMBER_THEN_DONE = "EMIT_NUMBER_THEN_DONE"

    # No token is produced
    SKIP = "SKIP"
    DONE = "DONE"

    # The input is invalid in the current state
    ERROR = "ERROR"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Transition:
    """One cell of the transition table: where to go and what to do."""

    next_state: State  # state the FSM is in after this step
    action: Action     # side effect to perform while taking the step
|
||||
|
||||
|
||||
# ---------- Transition table ----------
|
||||
# This is the heart of the state machine. Every (state, char_class) pair
|
||||
# maps to exactly one transition: a next state and an action to perform.
|
||||
# Making this a data structure (not nested if/else) means we can:
|
||||
# 1. Inspect it programmatically (e.g. to generate a diagram)
|
||||
# 2. Verify completeness (every combination is covered)
|
||||
# 3. Understand the FSM at a glance
|
||||
|
||||
# Each row reads: (current state, input class, next state, action).
# The comprehension turns the rows into the lookup dict keyed by
# (state, char_class); row order is preserved as insertion order.
TRANSITIONS = {
    (state, char_class): Transition(next_state, action)
    for state, char_class, next_state, action in (
        # --- START: between tokens, dispatch based on character class ---
        (State.START, CharClass.DIGIT, State.INTEGER, Action.ACCUMULATE),
        (State.START, CharClass.DOT, State.DECIMAL, Action.ACCUMULATE),
        (State.START, CharClass.OPERATOR, State.START, Action.EMIT_OPERATOR),
        (State.START, CharClass.LPAREN, State.START, Action.EMIT_LPAREN),
        (State.START, CharClass.RPAREN, State.START, Action.EMIT_RPAREN),
        (State.START, CharClass.SPACE, State.START, Action.SKIP),
        (State.START, CharClass.EOF, State.START, Action.DONE),

        # --- INTEGER: accumulating digits like "123" ---
        (State.INTEGER, CharClass.DIGIT, State.INTEGER, Action.ACCUMULATE),
        (State.INTEGER, CharClass.DOT, State.DECIMAL, Action.ACCUMULATE),
        (State.INTEGER, CharClass.OPERATOR, State.START, Action.EMIT_NUMBER_THEN_OP),
        (State.INTEGER, CharClass.LPAREN, State.START, Action.EMIT_NUMBER_THEN_LPAREN),
        (State.INTEGER, CharClass.RPAREN, State.START, Action.EMIT_NUMBER_THEN_RPAREN),
        (State.INTEGER, CharClass.SPACE, State.START, Action.EMIT_NUMBER),
        (State.INTEGER, CharClass.EOF, State.START, Action.EMIT_NUMBER_THEN_DONE),

        # --- DECIMAL: accumulating digits after "." like "123.45" ---
        (State.DECIMAL, CharClass.DIGIT, State.DECIMAL, Action.ACCUMULATE),
        # A second "." inside one number is invalid
        (State.DECIMAL, CharClass.DOT, State.START, Action.ERROR),
        (State.DECIMAL, CharClass.OPERATOR, State.START, Action.EMIT_NUMBER_THEN_OP),
        (State.DECIMAL, CharClass.LPAREN, State.START, Action.EMIT_NUMBER_THEN_LPAREN),
        (State.DECIMAL, CharClass.RPAREN, State.START, Action.EMIT_NUMBER_THEN_RPAREN),
        (State.DECIMAL, CharClass.SPACE, State.START, Action.EMIT_NUMBER),
        (State.DECIMAL, CharClass.EOF, State.START, Action.EMIT_NUMBER_THEN_DONE),
    )
}
|
||||
|
||||
|
||||
# ---------- Errors ----------
|
||||
|
||||
class TokenError(Exception):
    """
    Raised when an expression cannot be split into tokens.

    The offending character offset is kept on ``position`` and is also
    included in the exception text.
    """

    def __init__(self, message, position):
        self.position = position
        text = "Token error at position {}: {}".format(position, message)
        super().__init__(text)
|
||||
|
||||
|
||||
# ---------- Character classification ----------
|
||||
|
||||
def classify(ch: str):
    """
    Map a single character to its CharClass.

    The checks run in a fixed order and the first match wins; a character
    that fits no category is reported as CharClass.UNKNOWN. CharClass.EOF
    is never returned from here.

    Digits are recognised with ``str.isdecimal()`` rather than
    ``str.isdigit()``: ``isdigit()`` is also true for characters such as
    superscript '²' or circled digits, which are not decimal digits and
    would otherwise be accumulated into NUMBER tokens that Python's numeric
    constructors reject.
    """
    if ch.isdecimal():
        return CharClass.DIGIT
    if ch == '.':
        return CharClass.DOT
    if ch in OPERATOR_MAP:
        return CharClass.OPERATOR
    if ch == '(':
        return CharClass.LPAREN
    if ch == ')':
        return CharClass.RPAREN
    if ch.isspace():
        return CharClass.SPACE
    return CharClass.UNKNOWN
|
||||
|
||||
|
||||
# ---------- Main tokenize function ----------
|
||||
|
||||
def _flush_number(buffer, start, tokens):
    """Emit the buffered characters as one NUMBER token and empty the buffer."""
    text = ''.join(buffer)
    # The table lets a '.' open a number (so ".5" works), which means a lone
    # "." would otherwise reach this point as a NUMBER token with no digits.
    if text == '.':
        raise TokenError("expected a digit before or after '.'", start)
    tokens.append(Token(TokenType.NUMBER, text, start))
    buffer.clear()


def tokenize(expression: str) -> list:
    """
    Process an expression string through the state machine, producing tokens.

    The main loop:
      1. Classify the current character
      2. Look up (state, char_class) in the transition table
      3. Execute the action (accumulate, emit, skip, etc.)
      4. Move to the next state
      5. Advance to the next character

    After all tokens are emitted, a post-processing step resolves
    unary minus: if a MINUS token appears at the start, after an operator,
    or after LPAREN, it is re-classified as UNARY_MINUS.

    Returns:
        The list of Token objects in source order, always terminated by an
        EOF token whose position is len(expression).

    Raises:
        TokenError: on a character that classifies as UNKNOWN, on a
            (state, input) pair missing from the table, on a transition
            whose action is ERROR, or on a number consisting of "." alone.
    """
    # Actions grouped by what they have to do. Built locally so this function
    # only depends on names at call time.
    flushes_number = frozenset({
        Action.EMIT_NUMBER,
        Action.EMIT_NUMBER_THEN_OP,
        Action.EMIT_NUMBER_THEN_LPAREN,
        Action.EMIT_NUMBER_THEN_RPAREN,
        Action.EMIT_NUMBER_THEN_DONE,
    })
    emits_operator = frozenset({
        Action.EMIT_OPERATOR,
        Action.EMIT_NUMBER_THEN_OP,
    })
    emits_paren = {
        Action.EMIT_LPAREN: TokenType.LPAREN,
        Action.EMIT_NUMBER_THEN_LPAREN: TokenType.LPAREN,
        Action.EMIT_RPAREN: TokenType.RPAREN,
        Action.EMIT_NUMBER_THEN_RPAREN: TokenType.RPAREN,
    }

    state = State.START
    buffer = []        # characters accumulated for the current token
    buffer_start = 0   # position where the current buffer started
    tokens = []
    end = len(expression)

    # Append a sentinel so EOF is handled uniformly in the loop
    chars = expression + '\0'

    for pos in range(end + 1):
        ch = chars[pos]
        # The sentinel slot is always EOF; a literal '\0' inside the
        # expression is classified like any other character.
        char_class = CharClass.EOF if pos == end else classify(ch)

        if char_class == CharClass.UNKNOWN:
            raise TokenError(f"unexpected character {ch!r}", pos)

        # Look up the transition
        transition = TRANSITIONS.get((state, char_class))
        if transition is None:
            raise TokenError(
                f"no transition for state={state.name}, input={char_class.name}",
                pos,
            )

        action = transition.action

        # --- Execute the action ---
        if action == Action.ERROR:
            raise TokenError(f"unexpected {ch!r} in state {state.name}", pos)

        if action == Action.ACCUMULATE:
            if not buffer:
                buffer_start = pos
            buffer.append(ch)
        else:
            # Close the pending number first so tokens stay in source order
            if action in flushes_number:
                _flush_number(buffer, buffer_start, tokens)
            if action in emits_operator:
                tokens.append(Token(OPERATOR_MAP[ch], ch, pos))
            elif action in emits_paren:
                tokens.append(Token(emits_paren[action], ch, pos))
            # SKIP and DONE produce nothing

        state = transition.next_state

    # --- Post-processing: resolve unary minus ---
    # A MINUS is unary if it appears:
    #   - at the very start of the token stream
    #   - immediately after an operator (+, -, *, /, ^) or LPAREN
    # This context-sensitivity cannot be captured by the FSM alone --
    # it requires looking at previously emitted tokens.
    _resolve_unary_minus(tokens)

    tokens.append(Token(TokenType.EOF, '', end))
    return tokens
|
||||
|
||||
|
||||
def _resolve_unary_minus(tokens):
    """
    Re-label MINUS tokens as UNARY_MINUS where they act as a sign.

    The list is updated in place and nothing is returned. A MINUS becomes
    UNARY_MINUS when it is the first token, or when the token before it is
    an operator, an opening parenthesis, or another (already re-labelled)
    unary minus.

    This cannot live in the FSM: the FSM only knows which token it is
    currently building, while the unary/binary decision depends on the
    token emitted just before. A small pass over the finished token list
    supplies that context.
    """
    sign_contexts = {
        TokenType.PLUS,
        TokenType.MINUS,
        TokenType.MULTIPLY,
        TokenType.DIVIDE,
        TokenType.POWER,
        TokenType.LPAREN,
        TokenType.UNARY_MINUS,
    }
    previous_type = None  # type of the token before the current one, after re-labelling
    for index, current in enumerate(tokens):
        is_minus = current.type == TokenType.MINUS
        if is_minus and (index == 0 or previous_type in sign_contexts):
            current = Token(TokenType.UNARY_MINUS, current.value, current.position)
            tokens[index] = current
        previous_type = current.type
|
||||
Reference in New Issue
Block a user