diff --git a/python/pdfcreator/main.py b/python/pdfcreator/main.py
--- a/python/pdfcreator/main.py
+++ b/python/pdfcreator/main.py
@@ -1,28 +1,24 @@
-from pypdf import PdfReader, PdfWriter
+from io import BytesIO
+
+from pypdf import PageObject, PdfReader, PdfWriter
+from pypdf.generic import RectangleObject
+from reportlab.lib.pagesizes import LETTER
+from reportlab.pdfgen import canvas
 
-INPUT_PDF = "pdfcreator/input.pdf"
-OUTPUT_PDF = "pdfcreator/extracted_section.pdf"
-TARGET_SECTION_TITLE = "1.3"
+# ----------- CONFIG -------------
+PDF_INPUTS = [
+    {"file": "pdfcreator/input.pdf", "sections": ["1", "2.2", "3"]},
+    {"file": "pdfcreator/input2.pdf", "sections": ["3", "4"]},
+]
+OUTPUT_PDF = "pdfcreator/combined_sections.pdf"
 
 
-def crop_page(page, top_ratio=0.12, bottom_ratio=0.12):
-    """
-    Crop the visible area of a PDF page.
+# Cropping ratios
+HEADER_CROP = 0.1  # fraction cut from the top of every extracted page
+FOOTER_CROP = 0.0  # fraction cut from the bottom of every extracted page
+# --------------------------------
 
-    top_ratio: fraction of page height to remove from the top
-    bottom_ratio: fraction of page height to remove from the bottom
-    """
-    llx, lly, urx, ury = page.mediabox
-    height = ury - lly
-
-    new_lly = lly + height * bottom_ratio
-    new_ury = ury - height * top_ratio
-
-    if new_ury <= new_lly:
-        raise ValueError("Invalid crop ratios: page height would be negative")
-
-    page.cropbox.lower_left = (llx, new_lly)
-    page.cropbox.upper_right = (urx, new_ury)
+# ----- HELPER FUNCTIONS ---------
 
 
 def build_outline_tree(reader):
@@ -58,72 +54,175 @@ def collect_subtree_pages(node, pages=None):
     """
     if pages is None:
         pages = []
-
     pages.append(node["page"])
     for child in node["children"]:
         collect_subtree_pages(child, pages)
-
     return pages
 
 
 def flatten_outline_pages(nodes, pages=None):
     """
     Collect all outline entry page numbers in document order.
     """
     if pages is None:
         pages = []
-
     for node in nodes:
         pages.append(node["page"])
         flatten_outline_pages(node["children"], pages)
-
     return pages
 
 
 def find_end_page(target_node, outline_tree, total_pages):
     """
     End page = first outline page after the last descendant page.
     """
     subtree_pages = collect_subtree_pages(target_node)
     last_section_page = max(subtree_pages)
-
     all_outline_pages = flatten_outline_pages(outline_tree)
     all_outline_pages = sorted(set(all_outline_pages))
-
     for page in all_outline_pages:
         if page > last_section_page:
             return page
-
     return total_pages
 
 
-def extract_section():
-    reader = PdfReader(INPUT_PDF)
-    writer = PdfWriter()
-
-    outline_tree = build_outline_tree(reader)
-    total_pages = len(reader.pages)
-
-    target = find_section(outline_tree, TARGET_SECTION_TITLE)
-    if not target:
-        raise ValueError(f"Section '{TARGET_SECTION_TITLE}' not found")
-
-    start_page = target["page"]
-    end_page = find_end_page(target, outline_tree, total_pages)
-
-    for p in range(start_page, end_page):
-        page = reader.pages[p]
-        crop_page(page)
-        writer.add_page(page)
-
-    with open(OUTPUT_PDF, "wb") as f:
-        writer.write(f)
-
-    print(
-        f"Extracted '{target['title']}' "
-        f"(pages {start_page + 1}–{end_page})"
-    )
+def crop_page(page: PageObject, top_ratio=0.0, bottom_ratio=0.0):
+    """
+    Crop the visible area of a PDF page.
+
+    top_ratio: fraction of page height to remove from the top
+    bottom_ratio: fraction of page height to remove from the bottom
+    """
+    llx, lly, urx, ury = page.mediabox
+    height = ury - lly
+    new_lly = lly + height * bottom_ratio
+    new_ury = ury - height * top_ratio
+    if new_ury <= new_lly:
+        raise ValueError("Invalid crop ratios: page height would be negative")
+    page.cropbox.lower_left = (llx, new_lly)
+    page.cropbox.upper_right = (urx, new_ury)
+
+
+def normalize_page_size(page: PageObject, reference_box):
+    """
+    Force page MediaBox and CropBox to match reference.
+    """
+    page.mediabox.lower_left = reference_box.lower_left
+    page.mediabox.upper_right = reference_box.upper_right
+
+    page.cropbox.lower_left = reference_box.lower_left
+    page.cropbox.upper_right = reference_box.upper_right
+
+# --------------------------------
+
+
+# --------- MAIN PROCESS ----------
+def extract_sections(pdf_inputs):
+    """
+    Copy every requested section of every input PDF into one writer.
+
+    Returns (writer, toc_entries). Each TOC entry carries the section title
+    and its 1-based start page counted within the extracted pages only.
+    """
+    writer = PdfWriter()
+    toc_entries = []  # To build TOC later
+    current_page_index = 0
+
+    for pdf_info in pdf_inputs:
+        file_path = pdf_info["file"]
+        sections_to_extract = pdf_info["sections"]
+
+        reader = PdfReader(file_path)
+        outline_tree = build_outline_tree(reader)
+        total_pages = len(reader.pages)
+
+        for section_title in sections_to_extract:
+            target = find_section(outline_tree, section_title)
+            if not target:
+                print(f"[WARN] Section '{section_title}' "
+                      f"not found in {file_path}")
+                continue
+
+            start_page = target["page"]
+            end_page = find_end_page(target, outline_tree, total_pages)
+
+            # The first page of a section sets the size for the rest of it
+            reference_box = None
+            for p in range(start_page, end_page):
+                page = reader.pages[p]
+                crop_page(page, top_ratio=HEADER_CROP,
+                          bottom_ratio=FOOTER_CROP)
+
+                if reference_box is None:
+                    # Snapshot the box so it is a copy, not a reference
+                    reference_box = RectangleObject(page.cropbox)
+                normalize_page_size(page, reference_box)
+
+                writer.add_page(page)
+
+            # Track TOC
+            toc_entries.append({
+                "title": f"{section_title} ({file_path})",
+                "page": current_page_index + 1  # 1-based page number
+            })
+            current_page_index += (end_page - start_page)
+
+    return writer, toc_entries
+
+
+# --------- ADD TOC PAGE(S) ----------
+def create_toc_pdf(toc_entries, page_offset=0):
+    """
+    Draw the table of contents and return it as a PdfReader.
+
+    page_offset: number added to every listed page, for pages that will sit
+    in front of the extracted content in the final document
+    """
+    packet = BytesIO()
+    c = canvas.Canvas(packet, pagesize=LETTER)
+    c.setFont("Helvetica-Bold", 16)
+    c.drawString(50, 750, "Table of Contents")
+    c.setFont("Helvetica", 12)
+    y = 720
+    for entry in toc_entries:
+        text = f"{entry['title']} .... {entry['page'] + page_offset}"
+        c.drawString(50, y, text)
+        y -= 20
+        if y < 50:
+            c.showPage()
+            # showPage() forgets the font, so select it again
+            c.setFont("Helvetica", 12)
+            y = 750
+    c.save()
+    packet.seek(0)
+    return PdfReader(packet)
+
+
+# --------- WRITE OUTPUT -----------
+def main():
+    """
+    Write TOC pages followed by the extracted sections to OUTPUT_PDF.
+    """
+    writer, toc_entries = extract_sections(PDF_INPUTS)
+
+    # The TOC goes in front, so its own length shifts every listed page.
+    # A first render without the shift is enough to learn that length,
+    # because the layout does not depend on the numbers printed.
+    toc_page_count = len(create_toc_pdf(toc_entries).pages)
+    toc_pdf = create_toc_pdf(toc_entries, page_offset=toc_page_count)
+
+    # Combine TOC + extracted sections
+    final_writer = PdfWriter()
+    for page in toc_pdf.pages:
+        final_writer.add_page(page)
+    for page in writer.pages:
+        final_writer.add_page(page)
+
+    with open(OUTPUT_PDF, "wb") as f:
+        final_writer.write(f)
+
+    print(f"[INFO] Combined PDF written to {OUTPUT_PDF} with TOC.")
 
 
 if __name__ == "__main__":
-    extract_section()
+    main()