Files
Code/python/expression-evaluator/tokenizer.py
dl92 01d5532823 Add expression-evaluator: DAGs & state machines tutorial project
Educational calculator teaching FSMs (explicit transition table tokenizer)
and DAGs (recursive descent parser with AST evaluation). Includes CLI with
REPL, graphviz visualization, and 61 tests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-08 18:09:42 +00:00

307 lines
11 KiB
Python

"""
Part 1: State Machine Tokenizer
================================
A tokenizer (lexer) converts raw text into a stream of tokens.
This implementation uses an EXPLICIT finite state machine (FSM):
- States are named values (an enum), not implicit control flow
- A transition table maps (current_state, input_class) -> (next_state, action)
- The main loop reads one character at a time and consults the table
This is the same pattern used in:
- Network protocol parsers (HTTP, TCP state machines)
- Regular expression engines
- Compiler front-ends (lexers for C, Python, etc.)
- Game AI (enemy behavior states)
Key FSM concepts demonstrated:
- States: the "memory" of what we're currently building
- Transitions: rules for moving between states based on input
- Actions: side effects (emit a token, accumulate a character)
- Mealy machine: outputs depend on both state AND input
"""
from dataclasses import dataclass
from enum import Enum
# ---------- Token types ----------
class TokenType(Enum):
    """Kinds of tokens the tokenizer emits (see OPERATOR_MAP for the operators)."""

    NUMBER = "NUMBER"            # integer or decimal literal, e.g. "42", "3.14"
    PLUS = "PLUS"                # '+'
    MINUS = "MINUS"              # binary '-' (subtraction)
    MULTIPLY = "MULTIPLY"        # '*'
    DIVIDE = "DIVIDE"            # '/'
    POWER = "POWER"              # '^'
    LPAREN = "LPAREN"            # '('
    RPAREN = "RPAREN"            # ')'
    UNARY_MINUS = "UNARY_MINUS"  # '-' as negation; assigned by the post-pass, never by the FSM
    EOF = "EOF"                  # sentinel appended by tokenize() after all input is consumed
@dataclass
class Token:
    """A single lexical unit produced by the tokenizer."""

    type: TokenType  # which kind of token this is
    value: str       # raw text: "42", "+", "(", etc.
    position: int    # character offset in original expression

    def __repr__(self):
        return "Token({}, {!r}, pos={})".format(
            self.type.name, self.value, self.position
        )
# The single-character binary operators and the token type each maps to.
# Doubles as the membership test classify() uses for CharClass.OPERATOR.
OPERATOR_MAP = {
    '+': TokenType.PLUS,
    '-': TokenType.MINUS,
    '*': TokenType.MULTIPLY,
    '/': TokenType.DIVIDE,
    '^': TokenType.POWER,
}
# ---------- FSM state definitions ----------
class State(Enum):
    """
    The tokenizer's finite set of states.

    START   -- idle / between tokens, deciding what comes next
    INTEGER -- accumulating digits of an integer (e.g. "12" so far)
    DECIMAL -- accumulating digits after a decimal point (e.g. "12.3" so far)
    """

    START = "START"
    INTEGER = "INTEGER"
    DECIMAL = "DECIMAL"
class CharClass(Enum):
    """
    Character classification -- groups raw characters into categories
    so the transition table stays small and readable.
    """

    DIGIT = "DIGIT"        # anything str.isdigit() accepts
    DOT = "DOT"            # '.'
    OPERATOR = "OPERATOR"  # '+', '-', '*', '/', '^' (the keys of OPERATOR_MAP)
    LPAREN = "LPAREN"      # '('
    RPAREN = "RPAREN"      # ')'
    SPACE = "SPACE"        # anything str.isspace() accepts
    EOF = "EOF"            # synthetic class for "past the end of input"
    UNKNOWN = "UNKNOWN"    # anything else; tokenize() raises TokenError on it
class Action(Enum):
    """
    What the FSM does on a transition. In a Mealy machine, the output
    (action) depends on both the current state AND the input.
    """

    ACCUMULATE = "ACCUMULATE"          # append the character to the number buffer
    EMIT_NUMBER = "EMIT_NUMBER"        # flush the buffer as one NUMBER token
    EMIT_OPERATOR = "EMIT_OPERATOR"    # emit the operator token for the character
    EMIT_LPAREN = "EMIT_LPAREN"        # emit a '(' token
    EMIT_RPAREN = "EMIT_RPAREN"        # emit a ')' token
    # EMIT_NUMBER_THEN_*: the current character both terminates a number
    # and is itself a token -- flush the buffer, then emit the character.
    EMIT_NUMBER_THEN_OP = "EMIT_NUMBER_THEN_OP"
    EMIT_NUMBER_THEN_LPAREN = "EMIT_NUMBER_THEN_LPAREN"
    EMIT_NUMBER_THEN_RPAREN = "EMIT_NUMBER_THEN_RPAREN"
    EMIT_NUMBER_THEN_DONE = "EMIT_NUMBER_THEN_DONE"  # flush the buffer at end of input
    SKIP = "SKIP"    # ignore the character (whitespace between tokens)
    DONE = "DONE"    # clean end of input with nothing pending
    ERROR = "ERROR"  # invalid input for this state (e.g. a second '.' in a number)
@dataclass(frozen=True)
class Transition:
    """One entry of the FSM transition table: where to go and what to do."""

    next_state: State  # state the machine is in after consuming the input
    action: Action     # side effect performed while taking the transition
# ---------- Transition table ----------
# This is the heart of the state machine. Every (state, char_class) pair
# maps to exactly one transition: a next state and an action to perform.
# Making this a data structure (not nested if/else) means we can:
# 1. Inspect it programmatically (e.g. to generate a diagram)
# 2. Verify completeness (every combination is covered)
# 3. Understand the FSM at a glance
# Keyed by (current State, CharClass of the input). CharClass.UNKNOWN never
# reaches this table -- tokenize() raises TokenError before the lookup -- so
# each state only needs entries for the seven remaining classes.
TRANSITIONS = {
    # --- START: between tokens, dispatch based on character class ---
    (State.START, CharClass.DIGIT): Transition(State.INTEGER, Action.ACCUMULATE),
    (State.START, CharClass.DOT): Transition(State.DECIMAL, Action.ACCUMULATE),
    (State.START, CharClass.OPERATOR): Transition(State.START, Action.EMIT_OPERATOR),
    (State.START, CharClass.LPAREN): Transition(State.START, Action.EMIT_LPAREN),
    (State.START, CharClass.RPAREN): Transition(State.START, Action.EMIT_RPAREN),
    (State.START, CharClass.SPACE): Transition(State.START, Action.SKIP),
    (State.START, CharClass.EOF): Transition(State.START, Action.DONE),
    # --- INTEGER: accumulating digits like "123" ---
    (State.INTEGER, CharClass.DIGIT): Transition(State.INTEGER, Action.ACCUMULATE),
    (State.INTEGER, CharClass.DOT): Transition(State.DECIMAL, Action.ACCUMULATE),
    (State.INTEGER, CharClass.OPERATOR): Transition(State.START, Action.EMIT_NUMBER_THEN_OP),
    (State.INTEGER, CharClass.LPAREN): Transition(State.START, Action.EMIT_NUMBER_THEN_LPAREN),
    (State.INTEGER, CharClass.RPAREN): Transition(State.START, Action.EMIT_NUMBER_THEN_RPAREN),
    (State.INTEGER, CharClass.SPACE): Transition(State.START, Action.EMIT_NUMBER),
    (State.INTEGER, CharClass.EOF): Transition(State.START, Action.EMIT_NUMBER_THEN_DONE),
    # --- DECIMAL: accumulating digits after "." like "123.45" ---
    (State.DECIMAL, CharClass.DIGIT): Transition(State.DECIMAL, Action.ACCUMULATE),
    # A second '.' inside a number ("1.2.3") is malformed input.
    (State.DECIMAL, CharClass.DOT): Transition(State.START, Action.ERROR),
    (State.DECIMAL, CharClass.OPERATOR): Transition(State.START, Action.EMIT_NUMBER_THEN_OP),
    (State.DECIMAL, CharClass.LPAREN): Transition(State.START, Action.EMIT_NUMBER_THEN_LPAREN),
    (State.DECIMAL, CharClass.RPAREN): Transition(State.START, Action.EMIT_NUMBER_THEN_RPAREN),
    (State.DECIMAL, CharClass.SPACE): Transition(State.START, Action.EMIT_NUMBER),
    (State.DECIMAL, CharClass.EOF): Transition(State.START, Action.EMIT_NUMBER_THEN_DONE),
}
# ---------- Errors ----------
class TokenError(Exception):
    """Raised when the tokenizer cannot make sense of the input expression."""

    def __init__(self, message, position):
        # Keep the raw offset available so callers can point at the bad spot.
        self.position = position
        full_message = "Token error at position {}: {}".format(position, message)
        super().__init__(full_message)
# ---------- Character classification ----------
def classify(ch):
    """Map a single character to its CharClass.

    The categories are mutually exclusive, so lookup order is irrelevant:
    punctuation is resolved via a small table, operators via OPERATOR_MAP,
    and the open-ended classes (digits, whitespace) via str predicates.
    """
    punctuation = {
        '.': CharClass.DOT,
        '(': CharClass.LPAREN,
        ')': CharClass.RPAREN,
    }
    if ch in punctuation:
        return punctuation[ch]
    if ch in OPERATOR_MAP:
        return CharClass.OPERATOR
    if ch.isdigit():
        return CharClass.DIGIT
    if ch.isspace():
        return CharClass.SPACE
    return CharClass.UNKNOWN
# ---------- Main tokenize function ----------
def tokenize(expression):
    """
    Process an expression string through the state machine, producing tokens.

    The main loop:
      1. Classify the current character (a synthetic EOF class is used one
         position past the end of the string)
      2. Look up (state, char_class) in the transition table
      3. Execute the action (accumulate, emit, skip, etc.)
      4. Move to the next state and advance to the next character

    After all tokens are emitted, a post-processing step resolves
    unary minus: if a MINUS token appears at the start, after an operator,
    or after LPAREN, it is re-classified as UNARY_MINUS.

    Returns the token list, always terminated by an EOF token.
    Raises TokenError on an unknown character, on a second '.' inside a
    number, or on a bare '.' with no digits at all.
    """
    state = State.START
    buffer = []       # characters accumulated for the current NUMBER token
    buffer_start = 0  # position where the current buffer started
    tokens = []

    def emit_number():
        # Flush the accumulated buffer as one NUMBER token. A lone '.' has
        # no digits and is not a valid number -- reject it here instead of
        # letting float() fail much later, far from the position info.
        text = ''.join(buffer)
        if text == '.':
            raise TokenError("'.' is not a valid number", buffer_start)
        tokens.append(Token(TokenType.NUMBER, text, buffer_start))
        buffer.clear()

    length = len(expression)
    pos = 0
    while pos <= length:  # one extra iteration delivers the EOF input
        if pos == length:
            ch = ''  # never inspected: EOF transitions ignore the character
            char_class = CharClass.EOF
        else:
            ch = expression[pos]
            char_class = classify(ch)
            if char_class == CharClass.UNKNOWN:
                raise TokenError(f"unexpected character {ch!r}", pos)

        # Look up the transition for (state, input class).
        transition = TRANSITIONS.get((state, char_class))
        if transition is None:
            raise TokenError(f"no transition for state={state.name}, input={char_class.name}", pos)
        action = transition.action

        # --- Execute the action ---
        if action == Action.ACCUMULATE:
            if not buffer:
                buffer_start = pos  # remember where this number began
            buffer.append(ch)
        elif action in (Action.EMIT_NUMBER, Action.EMIT_NUMBER_THEN_DONE):
            emit_number()
        elif action == Action.EMIT_OPERATOR:
            tokens.append(Token(OPERATOR_MAP[ch], ch, pos))
        elif action == Action.EMIT_LPAREN:
            tokens.append(Token(TokenType.LPAREN, ch, pos))
        elif action == Action.EMIT_RPAREN:
            tokens.append(Token(TokenType.RPAREN, ch, pos))
        elif action == Action.EMIT_NUMBER_THEN_OP:
            emit_number()
            tokens.append(Token(OPERATOR_MAP[ch], ch, pos))
        elif action == Action.EMIT_NUMBER_THEN_LPAREN:
            emit_number()
            tokens.append(Token(TokenType.LPAREN, ch, pos))
        elif action == Action.EMIT_NUMBER_THEN_RPAREN:
            emit_number()
            tokens.append(Token(TokenType.RPAREN, ch, pos))
        elif action == Action.ERROR:
            raise TokenError(f"unexpected {ch!r} in state {state.name}", pos)
        # Action.SKIP (whitespace) and Action.DONE (clean EOF): no side effect.

        state = transition.next_state
        pos += 1

    # --- Post-processing: resolve unary minus ---
    # Whether '-' is unary depends on the PREVIOUS TOKEN, context the
    # character-at-a-time FSM does not track, so it is a separate pass.
    _resolve_unary_minus(tokens)
    tokens.append(Token(TokenType.EOF, '', length))
    return tokens
def _resolve_unary_minus(tokens):
    """
    Re-classify binary MINUS tokens as UNARY_MINUS, in place.

    A '-' is a negation rather than a subtraction when nothing that could
    act as a left operand precedes it: at the very start of the stream, or
    right after an operator, an open parenthesis, or an already-resolved
    unary minus (so "--5" yields two unary minuses).

    Why this isn't in the FSM: the FSM processes characters one at a time
    and only knows what kind of token it is currently building. The unary
    decision needs the previously emitted token, so it runs as a
    lightweight post-pass -- a common real-world lexer pattern.
    """
    no_left_operand = {
        TokenType.PLUS, TokenType.MINUS, TokenType.MULTIPLY,
        TokenType.DIVIDE, TokenType.POWER, TokenType.LPAREN,
        TokenType.UNARY_MINUS,
    }
    previous = None  # the token before the current one, post-replacement
    for i, tok in enumerate(tokens):
        if tok.type == TokenType.MINUS:
            if previous is None or previous.type in no_left_operand:
                tokens[i] = Token(TokenType.UNARY_MINUS, tok.value, tok.position)
        previous = tokens[i]