# Code/python/expression-evaluator/parser.py

"""
Part 2: DAG Construction -- Recursive Descent Parser
=====================================================
A parser converts a flat list of tokens into a tree structure (AST).
The AST is a DAG (Directed Acyclic Graph) where:

  - Nodes are operations (BinOpNode) or values (NumberNode)
  - Edges point from parent operations to their operands
  - The graph is acyclic because an operation's inputs are always
    "simpler" sub-expressions (no circular dependencies)
  - It is a tree (a special case of DAG) because no node is shared

This is the same structure as:
  - Spreadsheet dependency graphs (cell A1 depends on B1, B2...)
  - Build systems (Makefile targets depend on other targets)
  - Task scheduling (some tasks must finish before others start)
  - Neural network computation graphs (forward pass is a DAG)

Key DAG concepts demonstrated:
  - Nodes: operations and values
  - Directed edges: from operation to its inputs (dependencies)
  - Acyclic: no circular dependencies
  - Topological ordering: natural evaluation order (leaves first)

Grammar (BNF) -- precedence is encoded by nesting depth:
  expression ::= term ((PLUS | MINUS) term)*          # lowest precedence
  term       ::= unary ((MULTIPLY | DIVIDE) unary)*
  unary      ::= UNARY_MINUS unary | power
  power      ::= atom (POWER power)?                  # right-associative
  atom       ::= NUMBER | LPAREN expression RPAREN    # highest precedence

Call chain: expression -> term -> unary -> power -> atom
This means: + and - bind loosest, then * and /, then unary -, then ^, then parens.
So -3^2 = -(3^2) = -9, matching standard math convention.
"""

from dataclasses import dataclass

from tokenizer import Token, TokenType


# ---------- AST node types ----------
# These are the nodes of our DAG. Each node is either a leaf (NumberNode)
# or an interior node with edges pointing to its children (operands).

@dataclass
class NumberNode:
    """
    Leaf node: a numeric literal. In DAG terms, a node with no outgoing edges.

    Attributes:
        value: The numeric value of the literal.
    """
    value: float

    def __repr__(self):
        """
        Return a compact debug string such as ``NumberNode(3)`` or
        ``NumberNode(2.5)``.

        Whole-number values are printed without a trailing ``.0``. Values
        that ``int()`` cannot convert (infinity raises OverflowError, NaN
        raises ValueError) are printed using their own text instead, so
        that ``repr()`` never raises.
        """
        try:
            whole = int(self.value)
        except (OverflowError, ValueError):
            # inf / -inf / nan have no integer form; show them as-is.
            return f"NumberNode({self.value})"
        if self.value == whole:
            return f"NumberNode({whole})"
        return f"NumberNode({self.value})"


@dataclass
class BinOpNode:
    """
    Interior node for a binary operation: one operator, two operands.

    In DAG terms this node has two outgoing edges, one to ``left`` and one
    to ``right``. An edge means "depends on": both operands have to be
    computed before this node's own value can be.

    Attributes:
        op: Token type identifying the operator; its ``name`` is shown
            in the repr.
        left: Left operand sub-tree.
        right: Right operand sub-tree.
    """
    op: TokenType
    left: 'NumberNode | BinOpNode | UnaryOpNode'
    right: 'NumberNode | BinOpNode | UnaryOpNode'

    def __repr__(self):
        """Return ``BinOpNode(<OP>, <left>, <right>)`` with nested operands."""
        operator_name = self.op.name
        return "BinOpNode({}, {}, {})".format(
            operator_name, self.left, self.right
        )


@dataclass
class UnaryOpNode:
    """
    Interior node for a unary operation (negation) with a single child.

    Attributes:
        op: Token type identifying the operator; its ``name`` is shown
            in the repr.
        operand: Sub-tree the operator is applied to.
    """
    op: TokenType
    operand: 'NumberNode | BinOpNode | UnaryOpNode'

    def __repr__(self):
        """Return ``UnaryOpNode(<OP>, <operand>)`` with the nested operand."""
        operator_name = self.op.name
        return "UnaryOpNode({}, {})".format(operator_name, self.operand)


# Union of every concrete AST node class; use it to annotate "any AST node".
# NOTE: this `A | B` expression is evaluated at import time, and applying
# `|` to classes requires Python 3.10 or newer.
Node = NumberNode | BinOpNode | UnaryOpNode


# ---------- Errors ----------

class ParseError(Exception):
    """
    Error raised when the token stream does not match the grammar.

    Args:
        message: Human-readable description of what went wrong.
        position: Optional location of the offending token. It is kept on
            ``self.position`` and, when not None, included in the message.
    """

    def __init__(self, message, position=None):
        self.position = position
        if position is None:
            location = ""
        else:
            location = f" at position {position}"
        super().__init__(f"Parse error{location}: {message}")


# ---------- Recursive descent parser ----------

class Parser:
    """
    Converts a list of tokens into an AST (expression tree / DAG).

    Each grammar rule becomes a method. The call tree mirrors the shape
    of the AST being built. When a deeper method returns a node, it
    becomes a child of the node built by the caller -- this is how
    the DAG edges form.

    Precedence is encoded by nesting: lower-precedence operators are
    parsed at higher (outer) levels, so they become closer to the root
    of the tree and are evaluated last.

    All syntax problems are reported as ParseError, including a token
    list that is empty or runs out before an EOF token is seen.
    """

    def __init__(self, tokens):
        """
        Args:
            tokens: Indexable sequence of tokens. Each token is read via
                its ``type``, ``value`` and ``position`` attributes, and
                the sequence is expected to end with an EOF token.
        """
        self.tokens = tokens
        self.pos = 0  # index of the next token to be consumed

    def peek(self):
        """
        Look at the current token without consuming it.

        Raises:
            ParseError: if the parser has run past the end of the token
                list (the list is empty or has no terminating EOF token).
        """
        try:
            return self.tokens[self.pos]
        except IndexError:
            # Report a malformed token stream as a parse failure rather
            # than leaking a bare IndexError to the caller.
            raise ParseError(
                "unexpected end of input (token stream has no EOF token)"
            ) from None

    def consume(self, expected=None):
        """
        Consume and return the current token, optionally asserting its type.

        Args:
            expected: Token type the current token must have, or None to
                accept any token.

        Raises:
            ParseError: if the current token's type differs from
                ``expected``, or if there is no current token.
        """
        token = self.peek()
        if expected is not None and token.type != expected:
            raise ParseError(
                f"expected {expected.name}, got {token.type.name}",
                token.position,
            )
        self.pos += 1
        return token

    def parse(self):
        """
        Entry point: parse the full expression and verify we consumed everything.

        Returns:
            The root node of the AST.

        Raises:
            ParseError: if the input is empty, malformed, or has tokens
                left over after a complete expression.
        """
        if self.peek().type == TokenType.EOF:
            raise ParseError("empty expression")
        node = self.expression()
        # Anything other than EOF here means trailing, unparsed tokens.
        self.consume(TokenType.EOF)
        return node

    # --- Grammar rules ---
    # Each method corresponds to one production in the grammar.
    # The nesting encodes operator precedence.

    def expression(self):
        """expression ::= term ((PLUS | MINUS) term)*"""
        node = self.term()
        # Loop (not recursion) so that a - b - c groups as (a - b) - c.
        while self.peek().type in (TokenType.PLUS, TokenType.MINUS):
            op_token = self.consume()
            right = self.term()
            # Build a new BinOpNode -- this creates a DAG edge from
            # the new node to both 'node' (left) and 'right'
            node = BinOpNode(op_token.type, node, right)
        return node

    def term(self):
        """term ::= unary ((MULTIPLY | DIVIDE) unary)*"""
        node = self.unary()
        # Same left-associative loop as expression(), one level tighter.
        while self.peek().type in (TokenType.MULTIPLY, TokenType.DIVIDE):
            op_token = self.consume()
            right = self.unary()
            node = BinOpNode(op_token.type, node, right)
        return node

    def unary(self):
        """
        unary ::= UNARY_MINUS unary | power

        Unary minus is parsed here, between term and power, so it binds
        looser than ^ but tighter than * and /. This gives the standard
        math behavior: -3^2 = -(3^2) = -9.

        The recursion (unary calls itself) handles double negation: --3 = 3.
        """
        if self.peek().type == TokenType.UNARY_MINUS:
            op_token = self.consume()
            operand = self.unary()
            return UnaryOpNode(op_token.type, operand)
        return self.power()

    def power(self):
        """
        power ::= atom (POWER power)?

        Right-recursive for right-associativity: 2^3^4 = 2^(3^4) = 2^81.
        Compare with term() which uses a while loop for LEFT-associativity.

        NOTE(review): the exponent is parsed by power() -> atom(), which
        accepts only a number or '('. An exponent that starts with a
        unary minus (e.g. 2^-3) is therefore rejected unless it is
        parenthesized -- confirm this is intended.
        """
        node = self.atom()
        if self.peek().type == TokenType.POWER:
            op_token = self.consume()
            right = self.power()  # recurse (not loop) for right-associativity
            node = BinOpNode(op_token.type, node, right)
        return node

    def atom(self):
        """
        atom ::= NUMBER | LPAREN expression RPAREN

        The base case: either a literal number or a parenthesized
        sub-expression. Parentheses work by recursing back to
        expression(), which restarts precedence parsing from the top.

        Raises:
            ParseError: if the current token is neither a NUMBER nor an
                LPAREN, or if a parenthesized group is not closed.
        """
        token = self.peek()

        if token.type == TokenType.NUMBER:
            self.consume()
            return NumberNode(float(token.value))

        if token.type == TokenType.LPAREN:
            self.consume()
            node = self.expression()
            # No node is created for the parentheses themselves; the
            # grouping is expressed purely by the shape of the tree.
            self.consume(TokenType.RPAREN)
            return node

        raise ParseError(
            f"expected number or '(', got {token.type.name}",
            token.position,
        )