# Files
# Code/python/pdfcreator/main.py
# 2026-01-19 15:21:44 +00:00

# 130 lines
# 3.3 KiB
# Python
# Raw Blame History

# This file contains ambiguous Unicode characters
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from pypdf import PdfReader, PdfWriter
# Path of the PDF that extract_section() opens for reading.
INPUT_PDF = "pdfcreator/input.pdf"
# Path that extract_section() writes the extracted pages to.
OUTPUT_PDF = "pdfcreator/extracted_section.pdf"
# Outline title to extract. find_section() accepts an outline entry whose
# title equals this string or starts with it followed by a space.
TARGET_SECTION_TITLE = "1.3"
def crop_page(page, top_ratio=0.12, bottom_ratio=0.12):
    """
    Trim a band off the top and the bottom of a page's crop box.

    Both bands are measured as fractions of the media box height. The
    page's ``cropbox`` is changed in place; nothing is returned.

    top_ratio: fraction of the page height cut away at the top
    bottom_ratio: fraction of the page height cut away at the bottom

    Raises ValueError when the two bands leave no visible height.
    """
    left, bottom, right, top = page.mediabox
    full_height = top - bottom

    # Edges of the area that stays visible after trimming.
    visible_bottom = bottom + full_height * bottom_ratio
    visible_top = top - full_height * top_ratio

    if visible_bottom >= visible_top:
        raise ValueError("Invalid crop ratios: page height would be negative")

    # Horizontal extent is kept; only the vertical edges move.
    page.cropbox.lower_left = (left, visible_bottom)
    page.cropbox.upper_right = (right, visible_top)
def build_outline_tree(reader):
    """
    Convert ``reader.outline`` into a tree of plain dictionaries.

    The outline is read as a list in which a nested list holds the children
    of the entry immediately before it. Each entry becomes a node::

        {"title": <stripped title>, "page": <page number>, "children": [...]}

    where ``page`` is whatever ``reader.get_destination_page_number`` returns
    for the entry (NOTE(review): pypdf documents this as possibly ``None``
    when the destination cannot be resolved - confirm for the version in use).

    Returns the list of top-level nodes in outline order.
    """
    def _build(outline):
        tree = []
        for item in outline:
            if isinstance(item, list):
                children = _build(item)
                if tree:
                    # Attach to the preceding entry; extend so that several
                    # consecutive groups do not overwrite one another.
                    tree[-1]["children"].extend(children)
                else:
                    # A group with no preceding entry has no parent to hang
                    # on; keep its entries at this level instead of failing
                    # with IndexError.
                    tree.extend(children)
            else:
                title = item.title
                tree.append({
                    # An entry without a title is stored with an empty title
                    # rather than crashing on None.strip().
                    "title": title.strip() if title else "",
                    "page": reader.get_destination_page_number(item),
                    "children": [],
                })
        return tree

    return _build(reader.outline)
def find_section(nodes, title):
    """
    Return the first outline node whose title matches ``title``.

    A node matches when its title equals ``title`` or begins with ``title``
    followed by a space. Nodes are visited parent first, then that parent's
    children, then the next sibling. Returns None when nothing matches.
    """
    # Explicit stack; items are pushed reversed so they pop in outline order.
    pending = list(nodes)[::-1]
    while pending:
        candidate = pending.pop()
        heading = candidate["title"]
        if heading == title or heading.startswith(title + " "):
            return candidate
        pending.extend(reversed(candidate["children"]))
    return None
def collect_subtree_pages(node, pages=None):
    """
    Gather the page numbers of ``node`` and every node beneath it.

    Numbers are appended parent first, then each child's subtree in order.
    If ``pages`` is given, it is appended to and returned; otherwise a new
    list is created.
    """
    collected = [] if pages is None else pages
    # Explicit stack; children are pushed reversed so they pop in order.
    pending = [node]
    while pending:
        current = pending.pop()
        collected.append(current["page"])
        pending.extend(reversed(current["children"]))
    return collected
def flatten_outline_pages(nodes, pages=None):
    """
    List the page number of every outline node in document order.

    Each node's page is followed by the pages of its descendants, then the
    next sibling. If ``pages`` is given, it is appended to and returned;
    otherwise a new list is created.
    """
    ordered = [] if pages is None else pages
    # Explicit stack; items are pushed reversed so they pop in outline order.
    pending = list(nodes)[::-1]
    while pending:
        entry = pending.pop()
        ordered.append(entry["page"])
        pending.extend(reversed(entry["children"]))
    return ordered
def find_end_page(target_node, outline_tree, total_pages):
    """
    Return the exclusive end page index of the section at ``target_node``.

    End page = the smallest outline page that is strictly greater than the
    largest page in the target's subtree. When no outline entry lies beyond
    the section, ``total_pages`` is returned.

    Entries whose page is ``None`` are ignored, so an outline entry without
    a usable page number cannot break the comparisons.

    Raises ValueError if no node in the target's subtree has a page number.
    """
    section_pages = [
        page for page in collect_subtree_pages(target_node) if page is not None
    ]
    if not section_pages:
        raise ValueError("Target section has no resolvable page numbers")
    last_section_page = max(section_pages)

    # The nearest outline page past the section marks where it stops.
    later_pages = (
        page
        for page in flatten_outline_pages(outline_tree)
        if page is not None and page > last_section_page
    )
    return min(later_pages, default=total_pages)
def extract_section(input_pdf=None, output_pdf=None, section_title=None):
    """
    Copy one outline section of a PDF into a new, cropped PDF.

    The section is located via the document outline, its page range runs
    from the section's own page up to (not including) the page returned by
    ``find_end_page``, and every copied page is passed through ``crop_page``
    with its default ratios before being written.

    input_pdf: path of the PDF to read (defaults to INPUT_PDF)
    output_pdf: path of the PDF to write (defaults to OUTPUT_PDF)
    section_title: outline title to look for (defaults to TARGET_SECTION_TITLE)

    Raises ValueError if the section is not found in the outline or its
    outline entry has no page number.
    """
    # Defaults are resolved at call time so the module constants stay the
    # single source of truth for a plain ``extract_section()`` call.
    if input_pdf is None:
        input_pdf = INPUT_PDF
    if output_pdf is None:
        output_pdf = OUTPUT_PDF
    if section_title is None:
        section_title = TARGET_SECTION_TITLE

    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    outline_tree = build_outline_tree(reader)
    total_pages = len(reader.pages)

    target = find_section(outline_tree, section_title)
    if not target:
        raise ValueError(f"Section '{section_title}' not found")

    start_page = target["page"]
    if start_page is None:
        # Without a start page there is no range to copy; fail with a clear
        # message instead of a TypeError from range().
        raise ValueError(
            f"Section '{target['title']}' does not point to a page"
        )
    end_page = find_end_page(target, outline_tree, total_pages)

    for p in range(start_page, end_page):
        page = reader.pages[p]
        crop_page(page)
        writer.add_page(page)

    with open(output_pdf, "wb") as f:
        writer.write(f)

    # start_page is 0-based and end_page is exclusive, so the 1-based
    # inclusive range shown to the user is start_page + 1 .. end_page.
    print(
        f"Extracted '{target['title']}' "
        f"(pages {start_page + 1}-{end_page})"
    )
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    extract_section()