# pdfaggregator.py
"""Helpers for pulling page ranges and outline sections out of PDFs.

The functions here parse ``file.pdf:section1,section2`` style inputs, locate
sections through the PDF outline, crop and normalise the selected pages onto
a shared page box, append them to a writer, and render a table-of-contents
PDF for the collected entries.
"""

import re
from io import BytesIO

from pypdf import PdfReader, PdfWriter, PageObject
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import LETTER

# Leading section numbering such as "1 " or "1.3.2 " (must be followed by
# whitespace, so "1. Intro" is intentionally left untouched).
_NUMBERING_RE = re.compile(r'^\d+(\.\d+)*\s+')

# Windows drive prefix ("C:\" or "C:/") whose colon is not a field separator.
_DRIVE_PREFIX_RE = re.compile(r'^[A-Za-z]:[\\/]')


# -----------------------------
# Parsing / Section Utilities
# -----------------------------
def parse_page_range(entry):
    """Return zero-based page indices for an ``@start-end`` entry.

    ``@1-10`` yields indices 0..9 (both ends are one-based and inclusive in
    the entry). A bare ``@7`` selects just page 7.

    Returns:
        A list of indices, or ``None`` when *entry* does not start with
        ``@`` or cannot be parsed (a warning is printed in the latter case).
        The list is not range-checked against any document; it may be empty
        (reversed range) or contain negative indices (``@0-...``).
    """
    if not entry.startswith("@"):
        return None

    first, dash, last = entry[1:].partition("-")
    try:
        start = int(first) - 1
        # Without a dash the entry names a single page.
        end = int(last) if dash else start + 1
    except ValueError:
        print(f"[WARN] Invalid page range: {entry}")
        return None
    return list(range(start, end))


def strip_numbering(title):
    """Remove leading numbering like '1.3 Background' -> 'Background'."""
    return _NUMBERING_RE.sub('', title)


def crop_page(page, top_ratio=0.0, bottom_ratio=0.0):
    """Set the page's crop box by trimming fractions of the media box height.

    Args:
        page: Page object exposing ``mediabox`` and ``cropbox``.
        top_ratio: Fraction of the media box height removed from the top.
        bottom_ratio: Fraction of the media box height removed from the
            bottom.

    The page is modified in place; nothing is returned.
    """
    llx, lly, urx, ury = page.mediabox
    height = ury - lly
    new_lly = lly + height * bottom_ratio
    new_ury = ury - height * top_ratio
    page.cropbox.lower_left = (llx, new_lly)
    page.cropbox.upper_right = (urx, new_ury)


# -----------------------------
# Outline / Section Tree
# -----------------------------
def find_section_with_level(nodes, prefix, level=0):
    """Find the first outline node whose title starts with *prefix*.

    The search is depth-first in document order.

    Returns:
        ``(node, level)`` where *level* is the nesting depth of the match
        (0 for top-level nodes), or ``(None, None)`` when nothing matches.
    """
    for node in nodes:
        if node["title"].startswith(prefix):
            return node, level
        found = find_section_with_level(
            node.get("children", []), prefix, level + 1)
        if found[0]:
            return found
    return None, None


def collect_subtree_pages(node, pages=None):
    """Recursively collect the page of *node* and of all its descendants."""
    if pages is None:
        pages = []
    pages.append(node["page"])
    for child in node.get("children", []):
        collect_subtree_pages(child, pages)
    return pages


def flatten_outline_pages(nodes, pages=None):
    """Collect the page of every node in the outline tree, depth-first."""
    if pages is None:
        pages = []
    for node in nodes:
        pages.append(node["page"])
        flatten_outline_pages(node.get("children", []), pages)
    return pages


def find_end_page(target_node, outline_tree, total_pages):
    """Return the exclusive end page index of a section and its subsections.

    The result is the smallest outline page index that lies after every page
    referenced by *target_node*'s subtree, or *total_pages* when no outline
    entry starts later.

    Outline entries whose page could not be resolved (``None``) are ignored
    so they cannot break the comparisons.
    """
    subtree_pages = [
        p for p in collect_subtree_pages(target_node) if p is not None
    ]
    if not subtree_pages:
        return total_pages

    last_page = max(subtree_pages)
    later_pages = (
        p for p in flatten_outline_pages(outline_tree)
        if p is not None and p > last_page
    )
    return min(later_pages, default=total_pages)


def build_outline_tree(reader):
    """
    Build a normalized outline tree from pypdf's reader.outline.

    Each node:
    {
        "title": str,
        "page": int,
        "children": [ ... ]
    }

    ``page`` is whatever ``reader.get_destination_page_number`` returns for
    the outline item. Returns an empty list when the outline is missing or
    cannot be read.
    """
    def walk(items):
        tree = []
        for item in items:
            if isinstance(item, list):
                # A nested list holds the children of the previous item.
                if tree:
                    tree[-1]["children"] = walk(item)
            else:
                tree.append({
                    # Guard against outline items without a title.
                    "title": (item.title or "").strip(),
                    "page": reader.get_destination_page_number(item),
                    "children": []
                })
        return tree

    try:
        outline = reader.outline
    except Exception:
        # Deliberate best-effort: an unreadable outline is treated the same
        # as a document without one.
        return []

    if not outline:
        return []
    return walk(outline)


# -----------------------------
# TOC Generation
# -----------------------------
def create_toc_pdf(toc_entries, heading="Table of Contents"):
    """Generate a flat, unnumbered TOC PDF in memory.

    Args:
        toc_entries: Iterable of dicts with ``"title"`` and ``"page"`` keys.
        heading: Text drawn at the top of the first page.

    Returns:
        A ``PdfReader`` over the generated document.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=LETTER)
    c.setFont("Helvetica-Bold", 16)
    c.drawString(50, 750, heading)

    c.setFont("Helvetica", 12)
    y = 720
    for entry in toc_entries:
        # Break the page *before* drawing. Breaking after the last entry
        # would leave font state on a fresh page, which save() would then
        # flush as a trailing blank page.
        if y < 50:
            c.showPage()
            c.setFont("Helvetica", 12)
            y = 750
        line = f"{strip_numbering(entry['title'])} ........................ {entry['page']}"
        c.drawString(50, y, line)
        y -= 18

    c.save()
    buffer.seek(0)
    return PdfReader(buffer)


def _append_normalized_page(page, content_writer, reference_box,
                            top_ratio, bottom_ratio):
    """Crop *page*, force it onto the shared box, and append it to the writer.

    The first page processed (``reference_box is None``) establishes the
    reference box from its crop box. Every page then has both its media box
    and its crop box overwritten with the reference box, so the per-page
    crop ratios only matter for the page that establishes the box.

    Returns:
        The reference box in effect after processing this page.
    """
    crop_page(page, top_ratio, bottom_ratio)
    if reference_box is None:
        reference_box = (page.cropbox.lower_left, page.cropbox.upper_right)

    lower_left, upper_right = reference_box
    page.mediabox.lower_left = lower_left
    page.mediabox.upper_right = upper_right
    page.cropbox.lower_left = lower_left
    page.cropbox.upper_right = upper_right

    content_writer.add_page(page)
    return reference_box


def extract_page_range(entry, reader, content_writer, current_page,
                       REFERENCE_BOX, header_crop=0.05, footer_crop=0.03):
    """Extract pages from an explicit @page-range entry.

    Args:
        entry: Range entry such as ``@3-7`` (see ``parse_page_range``).
        reader: Source reader exposing ``pages``.
        content_writer: Writer the selected pages are appended to.
        current_page: Number of pages already written.
        REFERENCE_BOX: Shared ``(lower_left, upper_right)`` box, or ``None``
            if no page has been written yet.
        header_crop: Top crop ratio applied to each page.
        footer_crop: Bottom crop ratio applied to each page.

    Returns:
        ``(current_page, REFERENCE_BOX, toc_entry)``. ``toc_entry`` is
        ``None`` when nothing was extracted.
    """
    page_indices = parse_page_range(entry)
    if not page_indices:
        return current_page, REFERENCE_BOX, None  # nothing extracted

    first_output_page = current_page + 1
    total_pages = len(reader.pages)
    for p in page_indices:
        # Indices outside the document are skipped silently.
        if p < 0 or p >= total_pages:
            continue
        REFERENCE_BOX = _append_normalized_page(
            reader.pages[p], content_writer, REFERENCE_BOX,
            header_crop, footer_crop)
        current_page += 1

    if current_page + 1 == first_output_page:
        # Every requested page was out of range; a TOC entry would point at
        # content that was never written.
        print(f"[WARN] Page range {entry} is outside the document")
        return current_page, REFERENCE_BOX, None

    toc_entry = {"title": f"Pages {entry[1:]}",
                 "page": first_output_page, "level": 0}
    return current_page, REFERENCE_BOX, toc_entry


def extract_section_prefix(entry, reader, content_writer, current_page,
                           REFERENCE_BOX, outline_tree,
                           header_crop=0.05, footer_crop=0.03):
    """Extract pages from a section prefix entry in the PDF outline.

    Args:
        entry: Title prefix used to locate the section in *outline_tree*.
        reader: Source reader exposing ``pages`` and ``stream``.
        content_writer: Writer the selected pages are appended to.
        current_page: Number of pages already written.
        REFERENCE_BOX: Shared ``(lower_left, upper_right)`` box, or ``None``
            if no page has been written yet.
        outline_tree: Tree as produced by ``build_outline_tree``.
        header_crop: Top crop ratio, applied to the section's first page
            only.
        footer_crop: Bottom crop ratio applied to each page.

    Returns:
        ``(current_page, REFERENCE_BOX, toc_entry)``. ``toc_entry`` is
        ``None`` when the section was not found or has no resolvable start
        page.
    """
    target, level = find_section_with_level(outline_tree, entry)
    if not target:
        print(
            f"[WARN] Section {entry} not found in PDF {getattr(reader.stream, 'name', '')}")
        return current_page, REFERENCE_BOX, None

    start_page = target["page"]
    if start_page is None:
        # The outline destination could not be mapped to a page.
        print(f"[WARN] Section {entry} has no resolvable start page")
        return current_page, REFERENCE_BOX, None

    end_page = find_end_page(target, outline_tree, len(reader.pages))

    toc_entry = {"title": target["title"],
                 "page": current_page + 1, "level": level}
    for i, p in enumerate(range(start_page, end_page)):
        REFERENCE_BOX = _append_normalized_page(
            reader.pages[p], content_writer, REFERENCE_BOX,
            header_crop if i == 0 else 0, footer_crop)
        current_page += 1

    return current_page, REFERENCE_BOX, toc_entry


def parse_inputs(args):
    """
    Parse CLI positional arguments into PDF_INPUTS structure.

    Each item of ``args.inputs`` has the form ``file.pdf:section1,section2``.
    A leading Windows drive prefix (``C:\\`` or ``C:/``) is treated as part
    of the file path rather than as the separator.

    Returns:
        A list of ``{"file": str, "sections": [str, ...]}`` dicts; section
        names are stripped and empty ones dropped.

    Raises:
        ValueError: If an item has no ``:`` separating file and sections.
    """
    pdf_inputs = []

    for item in args.inputs:
        # Start the separator search after a drive prefix, if any.
        search_from = 2 if _DRIVE_PREFIX_RE.match(item) else 0
        sep = item.find(":", search_from)
        if sep == -1:
            raise ValueError(
                f"Invalid input '{item}'. Expected format: file.pdf:section1,section2"
            )

        file_path = item[:sep]
        sections = item[sep + 1:]
        section_list = [s.strip() for s in sections.split(",") if s.strip()]

        pdf_inputs.append({
            "file": file_path,
            "sections": section_list
        })

    return pdf_inputs