Educational calculator teaching FSMs (explicit transition table tokenizer) and DAGs (recursive descent parser with AST evaluation). Includes CLI with REPL, graphviz visualization, and 61 tests. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
218 lines
7.3 KiB
Python
218 lines
7.3 KiB
Python
"""
|
|
Part 2: DAG Construction -- Recursive Descent Parser
|
|
=====================================================
|
|
A parser converts a flat list of tokens into a tree structure (AST).
|
|
The AST is a DAG (Directed Acyclic Graph) where:
|
|
|
|
- Nodes are operations (BinOpNode) or values (NumberNode)
|
|
- Edges point from parent operations to their operands
|
|
- The graph is acyclic because an operation's inputs are always
|
|
"simpler" sub-expressions (no circular dependencies)
|
|
- It is a tree (a special case of DAG) because no node is shared
|
|
|
|
This is the same structure as:
|
|
- Spreadsheet dependency graphs (cell A1 depends on B1, B2...)
|
|
- Build systems (Makefile targets depend on other targets)
|
|
- Task scheduling (some tasks must finish before others start)
|
|
- Neural network computation graphs (forward pass is a DAG)
|
|
|
|
Key DAG concepts demonstrated:
|
|
- Nodes: operations and values
|
|
- Directed edges: from operation to its inputs (dependencies)
|
|
- Acyclic: no circular dependencies
|
|
- Topological ordering: natural evaluation order (leaves first)
|
|
|
|
Grammar (BNF) -- precedence is encoded by nesting depth:
|
|
expression ::= term ((PLUS | MINUS) term)* # lowest precedence
|
|
term ::= unary ((MULTIPLY | DIVIDE) unary)*
|
|
unary ::= UNARY_MINUS unary | power
|
|
power ::= atom (POWER power)? # right-associative
|
|
atom ::= NUMBER | LPAREN expression RPAREN # highest precedence
|
|
|
|
Call chain: expression -> term -> unary -> power -> atom
|
|
This means: +/- binds loosest, then *//, then unary -, then ^, then parens.
|
|
So -3^2 = -(3^2) = -9, matching standard math convention.
|
|
"""
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from tokenizer import Token, TokenType
|
|
|
|
|
|
# ---------- AST node types ----------
|
|
# These are the nodes of our DAG. Each node is either a leaf (NumberNode)
|
|
# or an interior node with edges pointing to its children (operands).
|
|
|
|
@dataclass
|
|
class NumberNode:
|
|
"""Leaf node: a numeric literal. In DAG terms, a node with no outgoing edges."""
|
|
value: float
|
|
|
|
def __repr__(self):
|
|
if self.value == int(self.value):
|
|
return f"NumberNode({int(self.value)})"
|
|
return f"NumberNode({self.value})"
|
|
|
|
|
|
@dataclass
|
|
class BinOpNode:
|
|
"""
|
|
Interior node: a binary operation with two children.
|
|
|
|
DAG edges: this node -> left, this node -> right
|
|
The edges represent "depends on": to compute this node's value,
|
|
we must first compute left and right.
|
|
"""
|
|
op: TokenType
|
|
left: 'NumberNode | BinOpNode | UnaryOpNode'
|
|
right: 'NumberNode | BinOpNode | UnaryOpNode'
|
|
|
|
def __repr__(self):
|
|
return f"BinOpNode({self.op.name}, {self.left}, {self.right})"
|
|
|
|
|
|
@dataclass
|
|
class UnaryOpNode:
|
|
"""Interior node: a unary operation (negation) with one child."""
|
|
op: TokenType
|
|
operand: 'NumberNode | BinOpNode | UnaryOpNode'
|
|
|
|
def __repr__(self):
|
|
return f"UnaryOpNode({self.op.name}, {self.operand})"
|
|
|
|
|
|
# Union type for any AST node
|
|
Node = NumberNode | BinOpNode | UnaryOpNode
|
|
|
|
|
|
# ---------- Errors ----------
|
|
|
|
class ParseError(Exception):
|
|
def __init__(self, message, position=None):
|
|
self.position = position
|
|
pos_info = f" at position {position}" if position is not None else ""
|
|
super().__init__(f"Parse error{pos_info}: {message}")
|
|
|
|
|
|
# ---------- Recursive descent parser ----------
|
|
|
|
class Parser:
|
|
"""
|
|
Converts a list of tokens into an AST (expression tree / DAG).
|
|
|
|
Each grammar rule becomes a method. The call tree mirrors the shape
|
|
of the AST being built. When a deeper method returns a node, it
|
|
becomes a child of the node built by the caller -- this is how
|
|
the DAG edges form.
|
|
|
|
Precedence is encoded by nesting: lower-precedence operators are
|
|
parsed at higher (outer) levels, so they become closer to the root
|
|
of the tree and are evaluated last.
|
|
"""
|
|
|
|
def __init__(self, tokens):
|
|
self.tokens = tokens
|
|
self.pos = 0
|
|
|
|
def peek(self):
|
|
"""Look at the current token without consuming it."""
|
|
return self.tokens[self.pos]
|
|
|
|
def consume(self, expected=None):
|
|
"""Consume and return the current token, optionally asserting its type."""
|
|
token = self.tokens[self.pos]
|
|
if expected is not None and token.type != expected:
|
|
raise ParseError(
|
|
f"expected {expected.name}, got {token.type.name}",
|
|
token.position,
|
|
)
|
|
self.pos += 1
|
|
return token
|
|
|
|
def parse(self):
|
|
"""Entry point: parse the full expression and verify we consumed everything."""
|
|
if self.peek().type == TokenType.EOF:
|
|
raise ParseError("empty expression")
|
|
node = self.expression()
|
|
self.consume(TokenType.EOF)
|
|
return node
|
|
|
|
# --- Grammar rules ---
|
|
# Each method corresponds to one production in the grammar.
|
|
# The nesting encodes operator precedence.
|
|
|
|
def expression(self):
|
|
"""expression ::= term ((PLUS | MINUS) term)*"""
|
|
node = self.term()
|
|
while self.peek().type in (TokenType.PLUS, TokenType.MINUS):
|
|
op_token = self.consume()
|
|
right = self.term()
|
|
# Build a new BinOpNode -- this creates a DAG edge from
|
|
# the new node to both 'node' (left) and 'right'
|
|
node = BinOpNode(op_token.type, node, right)
|
|
return node
|
|
|
|
def term(self):
|
|
"""term ::= unary ((MULTIPLY | DIVIDE) unary)*"""
|
|
node = self.unary()
|
|
while self.peek().type in (TokenType.MULTIPLY, TokenType.DIVIDE):
|
|
op_token = self.consume()
|
|
right = self.unary()
|
|
node = BinOpNode(op_token.type, node, right)
|
|
return node
|
|
|
|
def unary(self):
|
|
"""
|
|
unary ::= UNARY_MINUS unary | power
|
|
|
|
Unary minus is parsed here, between term and power, so it binds
|
|
looser than ^ but tighter than * and /. This gives the standard
|
|
math behavior: -3^2 = -(3^2) = -9.
|
|
|
|
The recursion (unary calls itself) handles double negation: --3 = 3.
|
|
"""
|
|
if self.peek().type == TokenType.UNARY_MINUS:
|
|
op_token = self.consume()
|
|
operand = self.unary()
|
|
return UnaryOpNode(op_token.type, operand)
|
|
return self.power()
|
|
|
|
def power(self):
|
|
"""
|
|
power ::= atom (POWER power)?
|
|
|
|
Right-recursive for right-associativity: 2^3^4 = 2^(3^4) = 2^81.
|
|
Compare with term() which uses a while loop for LEFT-associativity.
|
|
"""
|
|
node = self.atom()
|
|
if self.peek().type == TokenType.POWER:
|
|
op_token = self.consume()
|
|
right = self.power() # recurse (not loop) for right-associativity
|
|
node = BinOpNode(op_token.type, node, right)
|
|
return node
|
|
|
|
def atom(self):
|
|
"""
|
|
atom ::= NUMBER | LPAREN expression RPAREN
|
|
|
|
The base case: either a literal number or a parenthesized
|
|
sub-expression. Parentheses work by recursing back to
|
|
expression(), which restarts precedence parsing from the top.
|
|
"""
|
|
token = self.peek()
|
|
|
|
if token.type == TokenType.NUMBER:
|
|
self.consume()
|
|
return NumberNode(float(token.value))
|
|
|
|
if token.type == TokenType.LPAREN:
|
|
self.consume()
|
|
node = self.expression()
|
|
self.consume(TokenType.RPAREN)
|
|
return node
|
|
|
|
raise ParseError(
|
|
f"expected number or '(', got {token.type.name}",
|
|
token.position,
|
|
)
|