Update PDF creator

Add a table of contents (TOC) and harmonize page sizes
This commit is contained in:
local
2026-01-19 17:37:16 +00:00
parent 7891956d52
commit 6c4b78f274

View File

@@ -1,28 +1,23 @@
from pypdf import PdfReader, PdfWriter from pypdf import PdfReader
from io import BytesIO
from reportlab.lib.pagesizes import LETTER
from reportlab.pdfgen import canvas
from pypdf import PdfReader, PdfWriter, PageObject
# ----------- CONFIG -------------
# Each entry names one source PDF and the outline section titles to pull
# from it; sections are emitted in the order listed here.
PDF_INPUTS = [
    {"file": "pdfcreator/input.pdf", "sections": ["1", "2.2", "3"]},
    {"file": "pdfcreator/input2.pdf", "sections": ["3", "4"]},
]

# Destination of the combined document.
OUTPUT_PDF = "pdfcreator/combined_sections.pdf"

# Cropping ratios, as fractions of the page height.
HEADER_CROP = 0.1  # removed from the top of every extracted page
FOOTER_CROP = 0.0  # removed from the bottom of every extracted page
# --------------------------------

# ----- HELPER FUNCTIONS ---------
def build_outline_tree(reader): def build_outline_tree(reader):
@@ -38,7 +33,6 @@ def build_outline_tree(reader):
"children": [] "children": []
}) })
return tree return tree
return _build(reader.outline) return _build(reader.outline)
@@ -53,77 +47,157 @@ def find_section(nodes, title):
def collect_subtree_pages(node, pages=None):
    """
    Collect all page numbers belonging to this section and its descendants.

    Args:
        node: Outline node; a dict with a "page" entry and a "children"
            list holding nodes of the same shape.
        pages: Optional list to append to. A fresh list is created when
            omitted, so separate calls never share state.

    Returns:
        The list of page numbers in pre-order: the node itself first,
        then each child subtree in turn.
    """
    if pages is None:
        pages = []

    pages.append(node["page"])
    for child in node["children"]:
        collect_subtree_pages(child, pages)

    return pages
def flatten_outline_pages(nodes, pages=None):
    """
    Collect all outline entry page numbers in document order.

    Args:
        nodes: List of outline nodes; each is a dict with a "page" entry
            and a "children" list holding nodes of the same shape.
        pages: Optional list to append to. A fresh list is created when
            omitted, so separate calls never share state.

    Returns:
        The list of page numbers in pre-order (each node before its
        children). Duplicates are kept.
    """
    if pages is None:
        pages = []

    for node in nodes:
        pages.append(node["page"])
        flatten_outline_pages(node["children"], pages)

    return pages
def find_end_page(target_node, outline_tree, total_pages):
    """
    End page = first outline page after the last descendant page.

    Args:
        target_node: Outline node of the section being extracted.
        outline_tree: The full list of top-level outline nodes.
        total_pages: Number of pages in the document; returned when no
            outline entry starts after the section.

    Returns:
        The exclusive end page index for the section: the smallest
        outline page strictly greater than every page in the section's
        subtree, or ``total_pages`` if there is none.
    """
    last_section_page = max(collect_subtree_pages(target_node))

    # The smallest later outline page is the first one a sorted scan
    # would hit, so min() with a default replaces sort + loop.
    return min(
        (
            page
            for page in flatten_outline_pages(outline_tree)
            if page > last_section_page
        ),
        default=total_pages,
    )
def crop_page(page, top_ratio=0.0, bottom_ratio=0.0):
    """
    Crop the visible area of a PDF page.

    The page's MediaBox is only read; the crop is applied by rewriting
    the two corners of its CropBox.

    top_ratio: fraction of page height to remove from the top
    bottom_ratio: fraction of page height to remove from the bottom

    Raises:
        ValueError: if a ratio is negative, or if the two ratios together
            leave no visible height.
    """
    # A negative ratio would push the CropBox outside the MediaBox rather
    # than crop anything, so reject it up front.
    if top_ratio < 0 or bottom_ratio < 0:
        raise ValueError("Invalid crop ratios: ratios must not be negative")

    llx, lly, urx, ury = page.mediabox
    height = ury - lly

    new_lly = lly + height * bottom_ratio
    new_ury = ury - height * top_ratio

    if new_ury <= new_lly:
        raise ValueError("Invalid crop ratios: page height would be negative")

    page.cropbox.lower_left = (llx, new_lly)
    page.cropbox.upper_right = (urx, new_ury)
def normalize_page_size(page, reference_box):
    """
    Force page MediaBox and CropBox to match reference.

    Args:
        page: Page whose ``mediabox`` and ``cropbox`` corners are
            overwritten in place.
        reference_box: Box exposing ``lower_left`` and ``upper_right``;
            both corners are copied onto the page's two boxes.
    """
    # Read both corners before writing anything, so the outcome does not
    # depend on whether reference_box is one of this page's own boxes.
    lower_left = reference_box.lower_left
    upper_right = reference_box.upper_right

    page.mediabox.lower_left = lower_left
    page.mediabox.upper_right = upper_right

    page.cropbox.lower_left = lower_left
    page.cropbox.upper_right = upper_right
# --------------------------------
# --------- MAIN PROCESS ----------
writer = PdfWriter()
toc_entries = []  # To build TOC later
current_page_index = 0  # number of content pages added to `writer` so far

for pdf_info in PDF_INPUTS:
    file_path = pdf_info["file"]
    sections_to_extract = pdf_info["sections"]

    reader = PdfReader(file_path)
    outline_tree = build_outline_tree(reader)
    total_pages = len(reader.pages)

    for section_title in sections_to_extract:
        target = find_section(outline_tree, section_title)
        if not target:
            # A missing section is reported and skipped, not fatal.
            print(f"[WARN] Section '{section_title}' not found in {file_path}")
            continue

        start_page = target["page"]
        end_page = find_end_page(target, outline_tree, total_pages)

        # Reset per section: the first page of each section defines the
        # box that the rest of that section is normalized to.
        REFERENCE_BOX = None

        # Add pages to combined PDF
        for p in range(start_page, end_page):
            page = reader.pages[p]

            # Every page gets the same header/footer crop; the first-page
            # and later-page branches were identical, so one call suffices.
            crop_page(page, top_ratio=HEADER_CROP, bottom_ratio=FOOTER_CROP)

            if REFERENCE_BOX is None:
                # Store the corner values, not the box object itself.
                REFERENCE_BOX = (
                    page.cropbox.lower_left,
                    page.cropbox.upper_right,
                )

            # Normalize page size: MediaBox and CropBox both take the
            # section's reference corners.
            page.mediabox.lower_left = REFERENCE_BOX[0]
            page.mediabox.upper_right = REFERENCE_BOX[1]
            page.cropbox.lower_left = REFERENCE_BOX[0]
            page.cropbox.upper_right = REFERENCE_BOX[1]

            writer.add_page(page)

        # Track TOC
        toc_entries.append({
            "title": f"{section_title} ({file_path})",
            "page": current_page_index + 1,  # 1-based page number
        })
        current_page_index += end_page - start_page
# --------- ADD TOC PAGE(S) ----------
def create_toc_pdf(toc_entries):
    """
    Render the table of contents into an in-memory PDF.

    Args:
        toc_entries: Iterable of dicts with a "title" and a "page" entry;
            each becomes one line of the form "<title> .... <page>".

    Returns:
        A PdfReader over the rendered TOC, which has one or more
        LETTER-sized pages.
    """
    packet = BytesIO()
    c = canvas.Canvas(packet, pagesize=LETTER)

    c.setFont("Helvetica-Bold", 16)
    c.drawString(50, 750, "Table of Contents")

    c.setFont("Helvetica", 12)
    y = 720
    for entry in toc_entries:
        # Break the page only when another entry still has to be drawn,
        # so a page that is exactly full is not followed by an extra
        # page break.
        if y < 50:
            c.showPage()
            # showPage() discards canvas state such as the current font,
            # so select the entry font again explicitly.
            c.setFont("Helvetica", 12)
            y = 750

        text = f"{entry['title']} .... {entry['page']}"
        c.drawString(50, y, text)
        y -= 20

    c.save()
    packet.seek(0)
    return PdfReader(packet)
# The TOC is placed in front of the extracted content, so each recorded
# page number (counted from the first content page) must be shifted by
# the number of TOC pages. The TOC uses one line per entry, so its page
# count does not depend on the numbers printed; a first render is enough
# to measure it.
toc_page_count = len(create_toc_pdf(toc_entries).pages)
shifted_toc_entries = [
    {**entry, "page": entry["page"] + toc_page_count}
    for entry in toc_entries
]
toc_pdf = create_toc_pdf(shifted_toc_entries)

# Combine TOC + extracted sections
final_writer = PdfWriter()

# TOC first
for page in toc_pdf.pages:
    final_writer.add_page(page)

# Then extracted content
for page in writer.pages:
    final_writer.add_page(page)

# --------- WRITE OUTPUT -----------
# Written once; a second identical write of the same file was redundant.
with open(OUTPUT_PDF, "wb") as f:
    final_writer.write(f)

print(f"[INFO] Combined PDF written to {OUTPUT_PDF} with TOC.")