"""
Part 1: State Machine Tokenizer
================================

A tokenizer (lexer) converts raw text into a stream of tokens.

This implementation uses an EXPLICIT finite state machine (FSM):
- States are named values (an enum), not implicit control flow
- A transition table maps (current_state, input_class) -> (next_state, action)
- The main loop reads one character at a time and consults the table

This is the same pattern used in:
- Network protocol parsers (HTTP, TCP state machines)
- Regular expression engines
- Compiler front-ends (lexers for C, Python, etc.)
- Game AI (enemy behavior states)

Key FSM concepts demonstrated:
- States: the "memory" of what we're currently building
- Transitions: rules for moving between states based on input
- Actions: side effects (emit a token, accumulate a character)
- Mealy machine: outputs depend on both state AND input
"""

from dataclasses import dataclass
from enum import Enum


# ---------- Token types ----------

class TokenType(Enum):
    NUMBER = "NUMBER"
    PLUS = "PLUS"
    MINUS = "MINUS"
    MULTIPLY = "MULTIPLY"
    DIVIDE = "DIVIDE"
    POWER = "POWER"
    LPAREN = "LPAREN"
    RPAREN = "RPAREN"
    UNARY_MINUS = "UNARY_MINUS"
    EOF = "EOF"


@dataclass
class Token:
    type: TokenType
    value: str     # raw text: "42", "+", "(", etc.
    position: int  # character offset in original expression

    def __repr__(self):
        return f"Token({self.type.name}, {self.value!r}, pos={self.position})"


OPERATOR_MAP = {
    '+': TokenType.PLUS,
    '-': TokenType.MINUS,
    '*': TokenType.MULTIPLY,
    '/': TokenType.DIVIDE,
    '^': TokenType.POWER,
}


# ---------- FSM state definitions ----------

class State(Enum):
    """
    The tokenizer's finite set of states.

    START   -- idle / between tokens, deciding what comes next
    INTEGER -- accumulating digits of an integer (e.g. "12" so far)
    DECIMAL -- accumulating digits after a decimal point (e.g. "12.3" so far)
    """
    START = "START"
    INTEGER = "INTEGER"
    DECIMAL = "DECIMAL"


class CharClass(Enum):
    """
    Character classification -- groups raw characters into categories
    so the transition table stays small and readable.
    """
    DIGIT = "DIGIT"
    DOT = "DOT"
    OPERATOR = "OPERATOR"
    LPAREN = "LPAREN"
    RPAREN = "RPAREN"
    SPACE = "SPACE"
    EOF = "EOF"
    UNKNOWN = "UNKNOWN"


class Action(Enum):
    """
    What the FSM does on a transition.

    In a Mealy machine, the output (action) depends on both the
    current state AND the input.
    """
    ACCUMULATE = "ACCUMULATE"
    EMIT_NUMBER = "EMIT_NUMBER"
    EMIT_OPERATOR = "EMIT_OPERATOR"
    EMIT_LPAREN = "EMIT_LPAREN"
    EMIT_RPAREN = "EMIT_RPAREN"
    EMIT_NUMBER_THEN_OP = "EMIT_NUMBER_THEN_OP"
    EMIT_NUMBER_THEN_LPAREN = "EMIT_NUMBER_THEN_LPAREN"
    EMIT_NUMBER_THEN_RPAREN = "EMIT_NUMBER_THEN_RPAREN"
    EMIT_NUMBER_THEN_DONE = "EMIT_NUMBER_THEN_DONE"
    SKIP = "SKIP"
    DONE = "DONE"
    ERROR = "ERROR"


@dataclass(frozen=True)
class Transition:
    next_state: State
    action: Action


# ---------- Transition table ----------
# This is the heart of the state machine. Every (state, char_class) pair
# maps to exactly one transition: a next state and an action to perform.
# Making this a data structure (not nested if/else) means we can:
#   1. Inspect it programmatically (e.g. to generate a diagram)
#   2. Verify completeness (every combination is covered)
#   3. Understand the FSM at a glance

TRANSITIONS = {
    # --- START: between tokens, dispatch based on character class ---
    (State.START, CharClass.DIGIT): Transition(State.INTEGER, Action.ACCUMULATE),
    (State.START, CharClass.DOT): Transition(State.DECIMAL, Action.ACCUMULATE),
    (State.START, CharClass.OPERATOR): Transition(State.START, Action.EMIT_OPERATOR),
    (State.START, CharClass.LPAREN): Transition(State.START, Action.EMIT_LPAREN),
    (State.START, CharClass.RPAREN): Transition(State.START, Action.EMIT_RPAREN),
    (State.START, CharClass.SPACE): Transition(State.START, Action.SKIP),
    (State.START, CharClass.EOF): Transition(State.START, Action.DONE),

    # --- INTEGER: accumulating digits like "123" ---
    (State.INTEGER, CharClass.DIGIT): Transition(State.INTEGER, Action.ACCUMULATE),
    (State.INTEGER, CharClass.DOT): Transition(State.DECIMAL, Action.ACCUMULATE),
    (State.INTEGER, CharClass.OPERATOR): Transition(State.START, Action.EMIT_NUMBER_THEN_OP),
    (State.INTEGER, CharClass.LPAREN): Transition(State.START, Action.EMIT_NUMBER_THEN_LPAREN),
    (State.INTEGER, CharClass.RPAREN): Transition(State.START, Action.EMIT_NUMBER_THEN_RPAREN),
    (State.INTEGER, CharClass.SPACE): Transition(State.START, Action.EMIT_NUMBER),
    (State.INTEGER, CharClass.EOF): Transition(State.START, Action.EMIT_NUMBER_THEN_DONE),

    # --- DECIMAL: accumulating digits after "." like "123.45" ---
    (State.DECIMAL, CharClass.DIGIT): Transition(State.DECIMAL, Action.ACCUMULATE),
    (State.DECIMAL, CharClass.DOT): Transition(State.START, Action.ERROR),
    (State.DECIMAL, CharClass.OPERATOR): Transition(State.START, Action.EMIT_NUMBER_THEN_OP),
    (State.DECIMAL, CharClass.LPAREN): Transition(State.START, Action.EMIT_NUMBER_THEN_LPAREN),
    (State.DECIMAL, CharClass.RPAREN): Transition(State.START, Action.EMIT_NUMBER_THEN_RPAREN),
    (State.DECIMAL, CharClass.SPACE): Transition(State.START, Action.EMIT_NUMBER),
    (State.DECIMAL, CharClass.EOF): Transition(State.START, Action.EMIT_NUMBER_THEN_DONE),
}


# ---------- Errors ----------

class TokenError(Exception):
    def __init__(self, message, position):
        self.position = position
        super().__init__(f"Token error at position {position}: {message}")


# ---------- Character classification ----------

def classify(ch):
    """Map a single character to its CharClass."""
    if ch.isdigit():
        return CharClass.DIGIT
    if ch == '.':
        return CharClass.DOT
    if ch in OPERATOR_MAP:
        return CharClass.OPERATOR
    if ch == '(':
        return CharClass.LPAREN
    if ch == ')':
        return CharClass.RPAREN
    if ch.isspace():
        return CharClass.SPACE
    return CharClass.UNKNOWN


# ---------- Main tokenize function ----------

def tokenize(expression):
    """
    Process an expression string through the state machine, producing tokens.

    The main loop:
      1. Classify the current character
      2. Look up (state, char_class) in the transition table
      3. Execute the action (accumulate, emit, skip, etc.)
      4. Move to the next state
      5. Advance to the next character

    After all tokens are emitted, a post-processing step resolves unary
    minus: if a MINUS token appears at the start, after an operator, or
    after LPAREN, it is re-classified as UNARY_MINUS.

    Raises TokenError for unknown characters, a second '.' inside a
    number, or a number with no digits (a bare ".").
    """
    state = State.START
    buffer = []       # characters accumulated for the current token
    buffer_start = 0  # position where the current buffer started
    tokens = []
    pos = 0

    def flush_number():
        """Emit the buffered NUMBER token, rejecting a digitless '.'."""
        text = ''.join(buffer)
        # A '.' with no digits would otherwise be emitted as NUMBER "."
        # and blow up later when converted to float.
        if not any(c.isdigit() for c in text):
            raise TokenError(f"malformed number {text!r}", buffer_start)
        tokens.append(Token(TokenType.NUMBER, text, buffer_start))
        buffer.clear()

    # Append a sentinel so EOF is handled uniformly in the loop
    chars = expression + '\0'

    while pos <= len(expression):
        ch = chars[pos]
        char_class = CharClass.EOF if pos == len(expression) else classify(ch)

        if char_class == CharClass.UNKNOWN:
            raise TokenError(f"unexpected character {ch!r}", pos)

        # Look up the transition
        transition = TRANSITIONS.get((state, char_class))
        if transition is None:
            raise TokenError(
                f"no transition for state={state.name}, input={char_class.name}", pos)

        action = transition.action

        # --- Execute the action ---
        if action == Action.ACCUMULATE:
            if not buffer:
                buffer_start = pos
            buffer.append(ch)
        elif action == Action.EMIT_NUMBER:
            flush_number()
        elif action == Action.EMIT_OPERATOR:
            tokens.append(Token(OPERATOR_MAP[ch], ch, pos))
        elif action == Action.EMIT_LPAREN:
            tokens.append(Token(TokenType.LPAREN, ch, pos))
        elif action == Action.EMIT_RPAREN:
            tokens.append(Token(TokenType.RPAREN, ch, pos))
        elif action == Action.EMIT_NUMBER_THEN_OP:
            flush_number()
            tokens.append(Token(OPERATOR_MAP[ch], ch, pos))
        elif action == Action.EMIT_NUMBER_THEN_LPAREN:
            flush_number()
            tokens.append(Token(TokenType.LPAREN, ch, pos))
        elif action == Action.EMIT_NUMBER_THEN_RPAREN:
            flush_number()
            tokens.append(Token(TokenType.RPAREN, ch, pos))
        elif action == Action.EMIT_NUMBER_THEN_DONE:
            flush_number()
        elif action in (Action.SKIP, Action.DONE):
            pass
        elif action == Action.ERROR:
            raise TokenError(f"unexpected {ch!r} in state {state.name}", pos)

        state = transition.next_state
        pos += 1

    # --- Post-processing: resolve unary minus ---
    # A MINUS is unary if it appears:
    #   - at the very start of the token stream
    #   - immediately after an operator (+, -, *, /, ^) or LPAREN
    # This context-sensitivity cannot be captured by the FSM alone --
    # it requires looking at previously emitted tokens.
    _resolve_unary_minus(tokens)

    tokens.append(Token(TokenType.EOF, '', len(expression)))
    return tokens


def _resolve_unary_minus(tokens):
    """
    Convert binary MINUS tokens to UNARY_MINUS where appropriate.

    Why this isn't in the FSM: the FSM processes characters one at a
    time and only tracks what kind of token it's currently building (its
    state). But whether '-' is unary or binary depends on the PREVIOUS
    TOKEN -- information the FSM doesn't track.

    This is a common real-world pattern: the lexer handles most work,
    then a lightweight post-pass adds context.
    """
    unary_predecessor = {
        TokenType.PLUS, TokenType.MINUS, TokenType.MULTIPLY,
        TokenType.DIVIDE, TokenType.POWER, TokenType.LPAREN,
        TokenType.UNARY_MINUS,
    }
    for i, token in enumerate(tokens):
        if token.type != TokenType.MINUS:
            continue
        if i == 0 or tokens[i - 1].type in unary_predecessor:
            tokens[i] = Token(TokenType.UNARY_MINUS, token.value, token.position)