130 lines
3.3 KiB
Python
130 lines
3.3 KiB
Python
from pypdf import PdfReader, PdfWriter
|
||
|
||
# Path of the PDF that extract_section() opens for reading.
INPUT_PDF = "pdfcreator/input.pdf"
# Path the extracted, cropped pages are written to.
OUTPUT_PDF = "pdfcreator/extracted_section.pdf"
# Outline title to look up. An outline entry matches when its title equals
# this value or starts with it followed by a space (see find_section()).
TARGET_SECTION_TITLE = "1.3"
|
||
|
||
|
||
def crop_page(page, top_ratio=0.12, bottom_ratio=0.12):
    """
    Crop the visible area of a PDF page.

    The page's media box is read, a horizontal band is trimmed from the top
    and from the bottom, and the result is written to ``page.cropbox``. The
    horizontal extent is left unchanged. The page is modified in place and
    nothing is returned.

    top_ratio: fraction of page height to remove from the top
    bottom_ratio: fraction of page height to remove from the bottom

    Raises ValueError if either ratio is negative, or if the two ratios
    together leave no visible height.
    """
    # A negative ratio would push the crop box outside the media box instead
    # of trimming it, so reject it before touching the page.
    if top_ratio < 0 or bottom_ratio < 0:
        raise ValueError("Invalid crop ratios: ratios must not be negative")

    llx, lly, urx, ury = page.mediabox
    height = ury - lly

    new_lly = lly + height * bottom_ratio
    new_ury = ury - height * top_ratio

    # Equal edges mean a zero-height box, which is rejected as well.
    if new_ury <= new_lly:
        raise ValueError("Invalid crop ratios: page height would be negative")

    page.cropbox.lower_left = (llx, new_lly)
    page.cropbox.upper_right = (urx, new_ury)
|
||
|
||
|
||
def build_outline_tree(reader):
    """
    Convert ``reader.outline`` into a nested list of plain dicts.

    The outline is walked as a list in which a nested list holds the
    children of the entry immediately before it. Every entry becomes::

        {"title": <title with surrounding whitespace stripped>,
         "page": reader.get_destination_page_number(entry),
         "children": [<same structure>]}

    NOTE(review): pypdf documents get_destination_page_number() as returning
    Optional[int], so "page" may be None for an unresolvable destination --
    callers should be prepared for that.

    Returns the list of top-level nodes in outline order.
    """
    def _build(outline):
        tree = []
        for item in outline:
            if not isinstance(item, list):
                tree.append({
                    "title": item.title.strip(),
                    "page": reader.get_destination_page_number(item),
                    "children": [],
                })
                continue

            children = _build(item)
            if tree:
                # Extend instead of assign so that two nested lists in a row
                # do not discard the children collected from the first one.
                tree[-1]["children"].extend(children)
            else:
                # A nested list with no preceding entry has no parent to
                # attach to; keep its entries at this level rather than
                # failing with IndexError on tree[-1].
                tree.extend(children)
        return tree

    return _build(reader.outline)
|
||
|
||
|
||
def find_section(nodes, title):
    """
    Search an outline tree for a section, depth-first in document order.

    A node matches when its "title" equals ``title`` exactly or begins with
    ``title`` followed by a single space. Each node is tested before its
    children, and children before the following sibling.

    Returns the first matching node dict, or None when nothing matches.
    """
    # Explicit stack; entries are pushed reversed so they pop in list order.
    pending = list(nodes)[::-1]
    while pending:
        candidate = pending.pop()
        heading = candidate["title"]
        if heading == title or heading.startswith(title + " "):
            return candidate
        pending.extend(candidate["children"][::-1])
    return None
|
||
|
||
|
||
def collect_subtree_pages(node, pages=None):
    """
    Gather the "page" value of ``node`` and of every node beneath it.

    Values are appended parent-first, then each child subtree in order.
    When ``pages`` is supplied the values are appended to that list;
    otherwise a new list is created. The list is returned.
    """
    collected = [] if pages is None else pages

    # Explicit stack; children are pushed reversed so they pop in order.
    stack = [node]
    while stack:
        current = stack.pop()
        collected.append(current["page"])
        stack.extend(current["children"][::-1])

    return collected
|
||
|
||
|
||
def flatten_outline_pages(nodes, pages=None):
    """
    List the "page" value of every outline node in document order.

    Each node's page is emitted before the pages of its children. Values
    are appended to ``pages`` when it is supplied, otherwise to a new list.
    The list is returned.
    """
    def _walk(level):
        # Parent first, then its whole subtree, then the next sibling.
        for entry in level:
            yield entry["page"]
            yield from _walk(entry["children"])

    result = pages if pages is not None else []
    result.extend(_walk(nodes))
    return result
|
||
|
||
|
||
def find_end_page(target_node, outline_tree, total_pages):
    """
    End page = first outline page after the last descendant page.

    target_node: outline node of the section being extracted
    outline_tree: full outline tree the node belongs to
    total_pages: value returned when no outline entry lies after the section

    The largest page number found in ``target_node`` and its descendants is
    taken as the section's last known page. The smallest outline page number
    strictly greater than that is returned, or ``total_pages`` if there is
    none.

    Outline entries whose page is None are ignored.
    NOTE(review): None is expected for destinations pypdf cannot resolve --
    confirm against the get_destination_page_number() documentation.

    Raises ValueError if neither the section nor any of its descendants has
    a resolvable page.
    """
    subtree_pages = [
        page for page in collect_subtree_pages(target_node) if page is not None
    ]
    if not subtree_pages:
        raise ValueError("Section has no resolvable page destination")
    last_section_page = max(subtree_pages)

    # The smallest later page equals the first hit in the sorted, de-duplicated
    # list; min() with a default avoids building that intermediate list.
    later_pages = (
        page
        for page in flatten_outline_pages(outline_tree)
        if page is not None and page > last_section_page
    )
    return min(later_pages, default=total_pages)
|
||
|
||
|
||
def extract_section():
    """
    Copy the pages of the configured outline section into a new PDF.

    Reads INPUT_PDF, looks up the outline entry matching
    TARGET_SECTION_TITLE, crops every page from the section's start page up
    to (not including) the page returned by find_end_page(), writes them to
    OUTPUT_PDF and prints a one-line summary.

    Raises ValueError if no outline entry matches TARGET_SECTION_TITLE.
    """
    source = PdfReader(INPUT_PDF)
    output = PdfWriter()

    outline = build_outline_tree(source)
    page_count = len(source.pages)

    section = find_section(outline, TARGET_SECTION_TITLE)
    if not section:
        raise ValueError(f"Section '{TARGET_SECTION_TITLE}' not found")

    first = section["page"]
    stop = find_end_page(section, outline, page_count)

    # Page indices are zero-based; `stop` is exclusive.
    for index in range(first, stop):
        sheet = source.pages[index]
        crop_page(sheet)
        output.add_page(sheet)

    with open(OUTPUT_PDF, "wb") as handle:
        output.write(handle)

    # Reported range is one-based and inclusive, hence first + 1 .. stop.
    summary = (
        f"Extracted '{section['title']}' "
        f"(pages {first + 1}–{stop})"
    )
    print(summary)
|
||
|
||
|
||
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    extract_section()
|