from pypdf import PdfReader from io import BytesIO from reportlab.lib.pagesizes import LETTER from reportlab.pdfgen import canvas from pypdf import PdfReader, PdfWriter, PageObject # ----------- CONFIG ------------- PDF_INPUTS = [ {"file": "pdfcreator/input.pdf", "sections": ["1", "2.2", "3"]}, {"file": "pdfcreator/input2.pdf", "sections": ["3", "4"]}, ] OUTPUT_PDF = "pdfcreator/combined_sections.pdf" # Cropping ratios HEADER_CROP = 0.1 # top of first page of section FOOTER_CROP = 0.0 # bottom of pages # -------------------------------- # ----- HELPER FUNCTIONS --------- def build_outline_tree(reader): def _build(outline): tree = [] for item in outline: if isinstance(item, list): tree[-1]["children"] = _build(item) else: tree.append({ "title": item.title.strip(), "page": reader.get_destination_page_number(item), "children": [] }) return tree return _build(reader.outline) def find_section(nodes, title): for node in nodes: if node["title"] == title or node["title"].startswith(title + " "): return node found = find_section(node["children"], title) if found: return found return None def collect_subtree_pages(node, pages=None): if pages is None: pages = [] pages.append(node["page"]) for child in node["children"]: collect_subtree_pages(child, pages) return pages def flatten_outline_pages(nodes, pages=None): if pages is None: pages = [] for node in nodes: pages.append(node["page"]) flatten_outline_pages(node["children"], pages) return pages def find_end_page(target_node, outline_tree, total_pages): subtree_pages = collect_subtree_pages(target_node) last_section_page = max(subtree_pages) all_outline_pages = flatten_outline_pages(outline_tree) all_outline_pages = sorted(set(all_outline_pages)) for page in all_outline_pages: if page > last_section_page: return page return total_pages def crop_page(page, top_ratio=0.0, bottom_ratio=0.0): llx, lly, urx, ury = page.mediabox height = ury - lly new_lly = lly + height * bottom_ratio new_ury = ury - height * top_ratio if new_ury <= new_lly: raise ValueError("Invalid crop ratios: page height would be negative") page.cropbox.lower_left = (llx, new_lly) page.cropbox.upper_right = (urx, new_ury) def normalize_page_size(page, reference_box): """ Force page MediaBox and CropBox to match reference. """ page.mediabox.lower_left = reference_box.lower_left page.mediabox.upper_right = reference_box.upper_right page.cropbox.lower_left = reference_box.lower_left page.cropbox.upper_right = reference_box.upper_right # -------------------------------- # --------- MAIN PROCESS ---------- writer = PdfWriter() toc_entries = [] # To build TOC later current_page_index = 0 for pdf_info in PDF_INPUTS: file_path = pdf_info["file"] sections_to_extract = pdf_info["sections"] reader = PdfReader(file_path) outline_tree = build_outline_tree(reader) total_pages = len(reader.pages) for section_title in sections_to_extract: target = find_section(outline_tree, section_title) if not target: print(f"[WARN] Section '{section_title}' not found in {file_path}") continue start_page = target["page"] end_page = find_end_page(target, outline_tree, total_pages) REFERENCE_BOX = None # Add pages to combined PDF for i, p in enumerate(range(start_page, end_page)): page = reader.pages[p] # Crop first page header+footer if i == 0: crop_page(page, top_ratio=HEADER_CROP, bottom_ratio=FOOTER_CROP) else: crop_page(page, top_ratio=HEADER_CROP, bottom_ratio=FOOTER_CROP) # crop_page(page, bottom_ratio=FOOTER_CROP) if REFERENCE_BOX is None: # Make a copy, not a reference REFERENCE_BOX = ( page.cropbox.lower_left, page.cropbox.upper_right ) # Step 3: Normalize page size page.mediabox.lower_left = REFERENCE_BOX[0] page.mediabox.upper_right = REFERENCE_BOX[1] page.cropbox.lower_left = REFERENCE_BOX[0] page.cropbox.upper_right = REFERENCE_BOX[1] writer.add_page(page) # Track TOC toc_entries.append({ "title": f"{section_title} ({file_path})", "page": current_page_index + 1 # 1-based page number }) current_page_index += (end_page - start_page) # --------- ADD TOC PAGE(S) ---------- def create_toc_pdf(toc_entries): packet = BytesIO() c = canvas.Canvas(packet, pagesize=LETTER) c.setFont("Helvetica-Bold", 16) c.drawString(50, 750, "Table of Contents") c.setFont("Helvetica", 12) y = 720 for entry in toc_entries: text = f"{entry['title']} .... {entry['page']}" c.drawString(50, y, text) y -= 20 if y < 50: c.showPage() y = 750 c.save() packet.seek(0) return PdfReader(packet) toc_pdf = create_toc_pdf(toc_entries) # Combine TOC + extracted sections final_writer = PdfWriter() # TOC first for page in toc_pdf.pages: final_writer.add_page(page) # Then extracted content for page in writer.pages: final_writer.add_page(page) # Save with open(OUTPUT_PDF, "wb") as f: final_writer.write(f) # --------- WRITE OUTPUT ----------- with open(OUTPUT_PDF, "wb") as f: final_writer.write(f) print(f"[INFO] Combined PDF written to {OUTPUT_PDF} with TOC.")