"""Extract outline sections and explicit page ranges from PDFs.

For every entry in ``PDF_INPUTS`` the requested sections (matched against the
PDF outline by title prefix) or page ranges (``"@first-last"``, 1-based and
inclusive) are cropped, normalised to a common page box, and appended to a
single output document that starts with a generated table of contents and
carries matching bookmarks.
"""
import re
from io import BytesIO

from pypdf import PdfReader, PdfWriter
from reportlab.lib.pagesizes import LETTER
from reportlab.pdfgen import canvas

# ================= CONFIG =================
PDF_INPUTS = [
    {"file": "pdfcreator/input.pdf", "sections": ["1.3", "2.1", "@111-114"]},
    {"file": "pdfcreator/input2.pdf", "sections": ["3.2"]},
]
OUTPUT_PDF = "pdfcreator/extracted_sections.pdf"
HEADER_CROP = 0.12  # top of first page of section
FOOTER_CROP = 0.06  # bottom of all pages
# =========================================


def strip_numbering(title):
    """Remove leading numbering from a string like '1.3 Background'.

    Returns 'Background'. Titles without a leading ``N(.N)*`` number followed
    by whitespace are returned unchanged.
    """
    return re.sub(r'^\d+(\.\d+)*\s+', '', title)


# ---------- Outline utilities ------------
def parse_page_range(entry):
    """Parse an ``"@first-last"`` entry into zero-based page indices.

    Page ranges must be prefixed with '@', e.g. "@1-10" (1-based, inclusive).

    Returns:
        A non-empty list of zero-based indices, or ``None`` when *entry* is
        not a page range (no '@' prefix) or is malformed / reversed. A
        warning is printed for malformed or reversed ranges.
    """
    if not entry.startswith("@"):
        return None  # not a page range

    try:
        # Unpacking raises ValueError for a wrong number of parts as well as
        # for non-integer parts, so one handler covers both.
        first, last = (int(part) for part in entry[1:].split("-"))
    except ValueError:
        print(f"[WARN] Invalid page range: {entry}")
        return None

    if last < first:
        # A reversed range would silently produce an empty list.
        print(f"[WARN] Invalid page range: {entry}")
        return None

    return list(range(first - 1, last))  # zero-based start, inclusive end


def build_outline_tree(reader):
    """Convert ``reader.outline`` into a tree of plain dicts.

    Each node is ``{"title": str, "page": int | None, "children": [...]}``.
    ``page`` is whatever ``reader.get_destination_page_number`` returns; pypdf
    documents that this can be ``None`` when the destination cannot be
    resolved, so consumers must tolerate it.
    """
    def _build(outline):
        tree = []
        for item in outline:
            if isinstance(item, list):
                children = _build(item)
                if tree:
                    # A nested list holds the children of the preceding item.
                    tree[-1]["children"].extend(children)
                else:
                    # Nested list with no preceding item: keep its entries at
                    # this level instead of failing on tree[-1].
                    tree.extend(children)
            else:
                tree.append({
                    "title": (item.title or "").strip(),
                    "page": reader.get_destination_page_number(item),
                    "children": [],
                })
        return tree

    return _build(reader.outline)


def find_section_with_level(nodes, prefix, level=0):
    """Depth-first search for the first node whose title starts with *prefix*.

    Returns ``(node, level)`` where *level* is the nesting depth at which the
    node was found, or ``(None, None)`` if nothing matches.

    NOTE: matching is a plain ``startswith``, so "1.3" also matches a title
    such as "1.30 ..." if no earlier node matches first.
    """
    for node in nodes:
        if node["title"].startswith(prefix):
            return node, level
        found = find_section_with_level(node["children"], prefix, level + 1)
        if found[0] is not None:
            return found
    return None, None


def collect_subtree_pages(node, pages=None):
    """Return the ``page`` of *node* and of all its descendants (pre-order)."""
    if pages is None:
        pages = []
    pages.append(node["page"])
    for child in node["children"]:
        collect_subtree_pages(child, pages)
    return pages


def flatten_outline_pages(nodes, pages=None):
    """Return the ``page`` of every node in the forest *nodes* (pre-order)."""
    if pages is None:
        pages = []
    for node in nodes:
        pages.append(node["page"])
        flatten_outline_pages(node["children"], pages)
    return pages


def find_end_page(target_node, outline_tree, total_pages):
    """Return the exclusive end page index for the section *target_node*.

    The section ends at the first outline page strictly after the last page
    referenced anywhere in the target's subtree, or at *total_pages* when no
    later outline entry exists. Outline entries whose page is ``None`` are
    ignored.
    """
    subtree_pages = [
        p for p in collect_subtree_pages(target_node) if p is not None
    ]
    if not subtree_pages:
        # Nothing to anchor on; treat the section as running to the end.
        return total_pages

    last_page = max(subtree_pages)
    later_pages = (
        p for p in flatten_outline_pages(outline_tree)
        if p is not None and p > last_page
    )
    return min(later_pages, default=total_pages)


# ---------- Page manipulation ------------
def crop_page(page, top_ratio=0.0, bottom_ratio=0.0):
    """Set the page's crop box by trimming fractions of the media box height.

    *top_ratio* and *bottom_ratio* are fractions of the media box height
    removed from the top and bottom respectively. The page is modified in
    place.
    """
    llx, lly, urx, ury = page.mediabox
    height = ury - lly

    new_lly = lly + height * bottom_ratio
    new_ury = ury - height * top_ratio

    page.cropbox.lower_left = (llx, new_lly)
    page.cropbox.upper_right = (urx, new_ury)


# ---------- TOC generation ---------------
def create_toc_pdf(toc_entries, heading):
    """Render a flat table of contents and return it as a ``PdfReader``.

    Each entry needs a ``"title"`` (leading numbering is stripped for
    display) and a ``"page"`` value that is printed as-is.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=LETTER)

    c.setFont("Helvetica-Bold", 16)
    c.drawString(50, 750, heading)

    c.setFont("Helvetica", 12)
    y = 720

    for entry in toc_entries:
        # Break the page lazily, right before drawing. Breaking after the
        # last entry would leave a font command on a fresh page and emit a
        # blank trailing TOC page.
        if y < 50:
            c.showPage()
            c.setFont("Helvetica", 12)
            y = 750

        line = f"{strip_numbering(entry['title'])} ........................ {entry['page']}"
        c.drawString(50, y, line)  # flat: no indentation
        y -= 18

    c.save()
    buffer.seek(0)
    return PdfReader(buffer)


# ---------- Main helpers -----------------
def _add_normalized_page(writer, page, reference_box):
    """Force *page* to the shared box, append it, and return that box.

    The first page handled defines the reference box (its crop box after
    cropping); every page, including that one, gets its media and crop boxes
    set to it.

    NOTE(review): because every page is forced to the first page's box, the
    footer-only crop applied to later pages of a section is overridden here.
    This mirrors the original behaviour -- confirm it is intended.
    """
    if reference_box is None:
        reference_box = (page.cropbox.lower_left, page.cropbox.upper_right)

    page.mediabox.lower_left = reference_box[0]
    page.mediabox.upper_right = reference_box[1]
    page.cropbox.lower_left = reference_box[0]
    page.cropbox.upper_right = reference_box[1]

    writer.add_page(page)
    return reference_box


def _is_descendant(ancestor, node):
    """Return True if *node* lies strictly below *ancestor* in its tree."""
    if ancestor is None or node is None:
        return False
    return any(
        child is node or _is_descendant(child, node)
        for child in ancestor["children"]
    )


# ================= MAIN ===================
def main():
    """Build ``OUTPUT_PDF`` from the sections listed in ``PDF_INPUTS``."""
    content_writer = PdfWriter()
    toc_entries = []
    current_page = 0
    reference_box = None
    first_has_outline = None  # decided by the first input document

    for pdf_info in PDF_INPUTS:
        reader = PdfReader(pdf_info["file"])
        outline_tree = build_outline_tree(reader)
        total_pages = len(reader.pages)
        if first_has_outline is None:
            first_has_outline = bool(reader.outline)

        for entry in pdf_info["sections"]:
            if entry.startswith("@"):
                # --- Explicit page range ---
                page_indices = parse_page_range(entry)
                if page_indices is None:
                    continue  # parse_page_range already warned

                valid_indices = []
                for p in page_indices:
                    if 0 <= p < total_pages:
                        valid_indices.append(p)
                    else:
                        print(
                            f"[WARN] Page {p+1} out of range in {pdf_info['file']}")
                if not valid_indices:
                    # No page will be added, so a TOC entry / bookmark would
                    # point at the wrong (or a non-existent) page.
                    print(
                        f"[WARN] No valid pages for {entry} in {pdf_info['file']}")
                    continue

                toc_entries.append({
                    "title": f"Pages {entry[1:]}",  # remove '@' for display
                    "page": current_page + 1,
                    "level": 0,
                    "node": None,
                })
                for p in valid_indices:
                    page = reader.pages[p]
                    crop_page(page, top_ratio=HEADER_CROP,
                              bottom_ratio=FOOTER_CROP)
                    reference_box = _add_normalized_page(
                        content_writer, page, reference_box)
                    current_page += 1
                continue

            # --- Outline section ---
            target, level = find_section_with_level(outline_tree, entry)
            if target is None:
                print(
                    f"[WARN] Section {entry} not found in {pdf_info['file']}")
                continue

            start_page = target["page"]
            if start_page is None:
                print(
                    f"[WARN] Section {entry} has no resolvable page in {pdf_info['file']}")
                continue
            end_page = find_end_page(target, outline_tree, total_pages)

            toc_entries.append({
                "title": target["title"],   # EXACT heading text
                "page": current_page + 1,   # 1-based
                "level": level,
                "node": target,
            })

            for i, p in enumerate(range(start_page, end_page)):
                page = reader.pages[p]
                if i == 0:
                    crop_page(page, HEADER_CROP, FOOTER_CROP)
                else:
                    crop_page(page, bottom_ratio=FOOTER_CROP)
                reference_box = _add_normalized_page(
                    content_writer, page, reference_box)
                current_page += 1

    # ---------- Build final PDF ---------------
    final_writer = PdfWriter()

    # Derive TOC heading from first source document
    toc_heading = "Contents" if first_has_outline else "Table of Contents"

    # Visible TOC pages
    # NOTE(review): the page numbers printed in the TOC are relative to the
    # first content page and do not include the TOC pages themselves, unlike
    # the bookmarks below -- confirm this is the intended numbering.
    toc_pdf = create_toc_pdf(toc_entries, toc_heading)
    toc_page_count = len(toc_pdf.pages)
    for page in toc_pdf.pages:
        final_writer.add_page(page)

    # Content pages
    for page in content_writer.pages:
        final_writer.add_page(page)

    # Bookmarks: nest an entry only under a previously emitted entry that is
    # a real ancestor in the same source outline. Keying parents purely by
    # level would attach entries to stale, unrelated bookmarks.
    open_ancestors = []  # stack of (outline node, bookmark)
    for entry in toc_entries:
        node = entry.get("node")
        while open_ancestors and not _is_descendant(open_ancestors[-1][0], node):
            open_ancestors.pop()
        parent = open_ancestors[-1][1] if open_ancestors else None

        bm = final_writer.add_outline_item(
            title=entry["title"],  # exact heading text
            page_number=(entry["page"] - 1) + toc_page_count,
            parent=parent
        )
        open_ancestors.append((node, bm))

    # ---------- Write output ------------------
    with open(OUTPUT_PDF, "wb") as f:
        final_writer.write(f)

    print(f"[OK] Created {OUTPUT_PDF}")


if __name__ == "__main__":
    main()