From 370e97d08df03305fe936ece3632287cafb5e9f6 Mon Sep 17 00:00:00 2001 From: local Date: Mon, 19 Jan 2026 23:07:59 +0000 Subject: [PATCH] tidyup --- python/pdfcreator/main.py | 270 -------------------------------------- 1 file changed, 270 deletions(-) delete mode 100644 python/pdfcreator/main.py diff --git a/python/pdfcreator/main.py b/python/pdfcreator/main.py deleted file mode 100644 index 3448bb2..0000000 --- a/python/pdfcreator/main.py +++ /dev/null @@ -1,270 +0,0 @@ -import re -from pypdf import PdfReader, PdfWriter -from reportlab.pdfgen import canvas -from reportlab.lib.pagesizes import LETTER -from io import BytesIO - -# ================= CONFIG ================= - -PDF_INPUTS = [ - {"file": "pdfcreator/input.pdf", "sections": ["1.3", "2.1", "@111-114"]}, - {"file": "pdfcreator/input2.pdf", "sections": ["3.2"]}, -] - -OUTPUT_PDF = "pdfcreator/extracted_sections.pdf" - -HEADER_CROP = 0.12 # top of first page of section -FOOTER_CROP = 0.06 # bottom of all pages - -# ========================================= - - -def strip_numbering(title): - """ - Remove leading numbering from a string like '1.3 Background' - Returns 'Background'. - """ - return re.sub(r'^\d+(\.\d+)*\s+', '', title) - - -# ---------- Outline utilities ------------ -def parse_page_range(entry): - """ - Returns a list of zero-based page indices if entry is a page range. - Page ranges must be prefixed with '@', e.g., "@1-10". - Otherwise returns None (treated as section prefix). - """ - if entry.startswith("@"): - s = entry[1:] # remove the @ - try: - start, end = s.split("-") - start = int(start) - 1 # zero-based - end = int(end) # inclusive in range - return list(range(start, end)) - except ValueError: - print(f"[WARN] Invalid page range: {entry}") - return None - return None # not a page range - - -def build_outline_tree(reader): - def _build(outline): - tree = [] - for item in outline: - if isinstance(item, list): - tree[-1]["children"] = _build(item) - else: - tree.append({ - "title": item.title.strip(), - "page": reader.get_destination_page_number(item), - "children": [] - }) - return tree - return _build(reader.outline) - - -def find_section_with_level(nodes, prefix, level=0): - for node in nodes: - if node["title"].startswith(prefix): - return node, level - found = find_section_with_level(node["children"], prefix, level + 1) - if found[0]: - return found - return None, None - - -def collect_subtree_pages(node, pages=None): - if pages is None: - pages = [] - pages.append(node["page"]) - for child in node["children"]: - collect_subtree_pages(child, pages) - return pages - - -def flatten_outline_pages(nodes, pages=None): - if pages is None: - pages = [] - for node in nodes: - pages.append(node["page"]) - flatten_outline_pages(node["children"], pages) - return pages - - -def find_end_page(target_node, outline_tree, total_pages): - subtree_pages = collect_subtree_pages(target_node) - last_page = max(subtree_pages) - - all_pages = sorted(set(flatten_outline_pages(outline_tree))) - for p in all_pages: - if p > last_page: - return p - return total_pages - - -# ---------- Page manipulation ------------ - -def crop_page(page, top_ratio=0.0, bottom_ratio=0.0): - llx, lly, urx, ury = page.mediabox - height = ury - lly - - new_lly = lly + height * bottom_ratio - new_ury = ury - height * top_ratio - - page.cropbox.lower_left = (llx, new_lly) - page.cropbox.upper_right = (urx, new_ury) - - -# ---------- TOC generation --------------- -def create_toc_pdf(toc_entries, heading): - buffer = BytesIO() - c = canvas.Canvas(buffer, pagesize=LETTER) - - c.setFont("Helvetica-Bold", 16) - c.drawString(50, 750, heading) - - c.setFont("Helvetica", 12) - y = 720 - - for entry in toc_entries: - line = f"{strip_numbering(entry['title'])} ........................ {entry['page']}" - c.drawString(50, y, line) # flat: no indentation - y -= 18 - - if y < 50: - c.showPage() - c.setFont("Helvetica", 12) - y = 750 - - c.save() - buffer.seek(0) - return PdfReader(buffer) - - -# ================= MAIN =================== - -content_writer = PdfWriter() -toc_entries = [] -current_page = 0 -REFERENCE_BOX = None - -for pdf_info in PDF_INPUTS: - reader = PdfReader(pdf_info["file"]) - outline_tree = build_outline_tree(reader) - total_pages = len(reader.pages) - - for entry in pdf_info["sections"]: - - page_indices = parse_page_range(entry) - - if page_indices: - # --- Explicit page range --- - toc_entries.append({ - "title": f"Pages {entry[1:]}", # remove '@' for display - "page": current_page + 1, - "level": 0 - }) - - for i, p in enumerate(page_indices): - if p < 0 or p >= total_pages: - print( - f"[WARN] Page {p+1} out of range in {pdf_info['file']}") - continue - page = reader.pages[p] - - crop_page(page, top_ratio=HEADER_CROP, - bottom_ratio=FOOTER_CROP) - if REFERENCE_BOX is None: - REFERENCE_BOX = ( - page.cropbox.lower_left, - page.cropbox.upper_right - ) - page.mediabox.lower_left = REFERENCE_BOX[0] - page.mediabox.upper_right = REFERENCE_BOX[1] - page.cropbox.lower_left = REFERENCE_BOX[0] - page.cropbox.upper_right = REFERENCE_BOX[1] - - content_writer.add_page(page) - current_page += 1 - else: - - target, level = find_section_with_level( - outline_tree, entry) - if not target: - print( - f"[WARN] Section {entry} not found in {pdf_info['file']}") - continue - - start_page = target["page"] - end_page = find_end_page(target, outline_tree, total_pages) - - toc_entries.append({ - "title": target["title"], # EXACT heading text - "page": current_page + 1, # 1-based - "level": level - }) - - for i, p in enumerate(range(start_page, end_page)): - page = reader.pages[p] - - if i == 0: - crop_page(page, HEADER_CROP, FOOTER_CROP) - else: - crop_page(page, bottom_ratio=FOOTER_CROP) - - # Capture reference AFTER cropping - if REFERENCE_BOX is None: - REFERENCE_BOX = ( - page.cropbox.lower_left, - page.cropbox.upper_right - ) - - # Normalize page size - page.mediabox.lower_left = REFERENCE_BOX[0] - page.mediabox.upper_right = REFERENCE_BOX[1] - page.cropbox.lower_left = REFERENCE_BOX[0] - page.cropbox.upper_right = REFERENCE_BOX[1] - - content_writer.add_page(page) - current_page += 1 - - -# ---------- Build final PDF --------------- - -final_writer = PdfWriter() - -# Derive TOC heading from first source document -first_reader = PdfReader(PDF_INPUTS[0]["file"]) -toc_heading = "Contents" if first_reader.outline else "Table of Contents" - -# Visible TOC pages -toc_pdf = create_toc_pdf(toc_entries, toc_heading) -toc_page_count = len(toc_pdf.pages) - -for page in toc_pdf.pages: - final_writer.add_page(page) - -# Content pages -for page in content_writer.pages: - final_writer.add_page(page) - -bookmark_stack = {} - -for entry in toc_entries: - parent = bookmark_stack.get(entry["level"] - 1) - - bm = final_writer.add_outline_item( - title=entry["title"], # exact heading text - page_number=(entry["page"] - 1) + toc_page_count, - parent=parent - ) - - bookmark_stack[entry["level"]] = bm - - -# ---------- Write output ------------------ - -with open(OUTPUT_PDF, "wb") as f: - final_writer.write(f) - -print(f"[OK] Created {OUTPUT_PDF}")