from pypdf import PdfReader, PdfWriter

INPUT_PDF = "pdfcreator/input.pdf"
OUTPUT_PDF = "pdfcreator/extracted_section.pdf"
TARGET_SECTION_TITLE = "1.3"


def crop_page(page, top_ratio=0.12, bottom_ratio=0.12):
    """Shrink the crop box of *page* to hide a top and bottom margin.

    The page object is modified in place; nothing is returned.

    Args:
        page: Page object exposing ``mediabox`` (unpackable into four
            coordinates) and ``cropbox`` with ``lower_left`` /
            ``upper_right`` attributes.
        top_ratio: Fraction of the media-box height removed from the top.
        bottom_ratio: Fraction of the media-box height removed from the
            bottom.

    Raises:
        ValueError: If the two ratios leave no visible height.
    """
    llx, lly, urx, ury = page.mediabox
    height = ury - lly

    new_lly = lly + height * bottom_ratio
    new_ury = ury - height * top_ratio

    if new_ury <= new_lly:
        raise ValueError("Invalid crop ratios: page height would be negative")

    page.cropbox.lower_left = (llx, new_lly)
    page.cropbox.upper_right = (urx, new_ury)


def build_outline_tree(reader):
    """Convert ``reader.outline`` into a tree of plain dictionaries.

    ``reader.outline`` is walked as a list in which a nested list holds the
    children of the entry that precedes it.

    Args:
        reader: Object exposing ``outline`` and
            ``get_destination_page_number(item)``.

    Returns:
        A list of nodes, each ``{"title": str, "page": ..., "children": list}``.
        ``"page"`` is whatever ``get_destination_page_number`` returned, so it
        may be ``None`` when the destination cannot be resolved.
    """

    def _build(outline):
        tree = []
        for item in outline:
            if isinstance(item, list):
                children = _build(item)
                if tree:
                    # Extend rather than assign so that several nested lists
                    # following one entry do not overwrite each other.
                    tree[-1]["children"].extend(children)
                else:
                    # A nested list with no preceding entry has no parent to
                    # attach to; keep its entries at the current level
                    # instead of failing with IndexError on tree[-1].
                    tree.extend(children)
            else:
                tree.append({
                    # A missing title is treated as an empty string.
                    "title": (item.title or "").strip(),
                    "page": reader.get_destination_page_number(item),
                    "children": [],
                })
        return tree

    return _build(reader.outline)


def find_section(nodes, title):
    """Return the first node, depth first, whose title matches *title*.

    A node matches when its title equals *title* exactly or starts with
    *title* followed by a single space (so ``"1.3"`` matches ``"1.3 Scope"``
    but not ``"1.30"``).

    Returns:
        The matching node dictionary, or ``None`` if nothing matches.
    """
    for node in nodes:
        if node["title"] == title or node["title"].startswith(title + " "):
            return node
        found = find_section(node["children"], title)
        if found is not None:
            return found
    return None


def collect_subtree_pages(node, pages=None):
    """Collect the page of *node* and of all its descendants.

    Args:
        node: Outline node as produced by ``build_outline_tree``.
        pages: Optional list to append to; a new list is created if omitted.

    Returns:
        The list of page values (unresolved ``None`` entries included).
    """
    if pages is None:
        pages = []
    pages.append(node["page"])
    for child in node["children"]:
        collect_subtree_pages(child, pages)
    return pages


def flatten_outline_pages(nodes, pages=None):
    """Collect the page of every outline entry in document order.

    Args:
        nodes: List of outline nodes as produced by ``build_outline_tree``.
        pages: Optional list to append to; a new list is created if omitted.

    Returns:
        The list of page values (unresolved ``None`` entries included).
    """
    if pages is None:
        pages = []
    for node in nodes:
        pages.append(node["page"])
        flatten_outline_pages(node["children"], pages)
    return pages


def _flatten_outline_nodes(nodes, flat=None):
    """Return every outline node (not just its page) in document order."""
    if flat is None:
        flat = []
    for node in nodes:
        flat.append(node)
        _flatten_outline_nodes(node["children"], flat)
    return flat


def find_end_page(target_node, outline_tree, total_pages):
    """Return the exclusive end page index for the target section.

    Normally this is the first outline page strictly after the last page
    referenced by the section or its descendants. If an outline entry that
    follows the section in document order starts on that same last page, the
    section ends on the shared page, so the end is the page right after it.
    When no later outline entry exists, *total_pages* is returned.

    Args:
        target_node: Node of the section being extracted.
        outline_tree: Full tree as produced by ``build_outline_tree``.
        total_pages: Number of pages in the document.

    Raises:
        ValueError: If neither the section nor its descendants have a
            resolved page number.
    """
    # Unresolved destinations are stored as None; they cannot be compared
    # with integers, so leave them out of every page computation.
    subtree_pages = [
        page for page in collect_subtree_pages(target_node) if page is not None
    ]
    if not subtree_pages:
        raise ValueError("Section has no resolvable page numbers")
    last_section_page = max(subtree_pages)

    # In document order a subtree is one contiguous run starting at its root,
    # so everything past that run follows the section.
    all_nodes = _flatten_outline_nodes(outline_tree)
    subtree_size = len(_flatten_outline_nodes([target_node]))
    position = next(
        (i for i, node in enumerate(all_nodes) if node is target_node),
        None,
    )
    if position is not None:
        following = all_nodes[position + subtree_size:]
        if any(node["page"] == last_section_page for node in following):
            # The next section begins on the page where this one ends:
            # include that shared page and stop there.
            return last_section_page + 1

    later_pages = [
        page
        for page in flatten_outline_pages(outline_tree)
        if page is not None and page > last_section_page
    ]
    return min(later_pages, default=total_pages)


def extract_section(input_pdf=None, output_pdf=None, section_title=None):
    """Extract one outline section into a new, cropped PDF.

    Args:
        input_pdf: Path of the source PDF; defaults to ``INPUT_PDF``.
        output_pdf: Path of the PDF to write; defaults to ``OUTPUT_PDF``.
        section_title: Section title or numeric prefix to look up; defaults
            to ``TARGET_SECTION_TITLE``.

    Raises:
        ValueError: If the section is not found in the outline or its start
            page cannot be resolved.
    """
    # Resolve defaults at call time so that reassigning the module-level
    # constants before calling still takes effect.
    if input_pdf is None:
        input_pdf = INPUT_PDF
    if output_pdf is None:
        output_pdf = OUTPUT_PDF
    if section_title is None:
        section_title = TARGET_SECTION_TITLE

    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    outline_tree = build_outline_tree(reader)
    total_pages = len(reader.pages)

    target = find_section(outline_tree, section_title)
    if not target:
        raise ValueError(f"Section '{section_title}' not found")

    start_page = target["page"]
    if start_page is None:
        raise ValueError(
            f"Section '{section_title}' has no resolvable start page"
        )
    end_page = find_end_page(target, outline_tree, total_pages)

    for p in range(start_page, end_page):
        page = reader.pages[p]
        crop_page(page)
        writer.add_page(page)

    with open(output_pdf, "wb") as f:
        writer.write(f)

    # start_page is 0-based and end_page is exclusive, so end_page is also
    # the 1-based number of the last extracted page.
    print(
        f"Extracted '{target['title']}' "
        f"(pages {start_page + 1}–{end_page})"
    )


if __name__ == "__main__":
    extract_section()