# driver.py
"""Command-line driver that aggregates parts of one or more PDFs.

For every input PDF the requested sections (outline prefixes) and explicit
page ranges are extracted through the ``pdfaggregator`` helpers, a table of
contents is generated, and everything is written to a single output PDF
with matching bookmarks.
"""
import argparse

from pypdf import PageObject
# The wildcard import supplies build_outline_tree, parse_page_range,
# extract_page_range, extract_section_prefix, create_toc_pdf and
# parse_inputs, all of which are used below.
from pdfaggregator import *
# Imported after the wildcard import (as in the original ordering) so these
# names always refer to the pypdf classes.
from pypdf import PdfReader, PdfWriter

# Example input specification: each item names a source file and the
# sections / "@start-end" page ranges to pull from it.
# NOTE(review): PDF_INPUTS and OUTPUT_PDF are not referenced anywhere in this
# file; the command-line arguments are used instead - confirm whether they
# are still needed.
PDF_INPUTS = [
    {"file": "pdfcreator/input.pdf",
     "sections": ["5", "@20-30"]},
    # {"file": "pdfcreator/input2.pdf",
    #  "sections": ["3.1"]}
]

OUTPUT_PDF = "pdfcreator/extracted_sections.pdf"

# crop ratios (passed unchanged to the pdfaggregator extraction helpers)
HEADER_CROP = 0.1
FOOTER_CROP = 0.03


def _extract_all(pdf_inputs, content_writer):
    """Extract every requested section/range into *content_writer*.

    Args:
        pdf_inputs: Iterable of mappings with a ``"file"`` path and a
            ``"sections"`` list of section prefixes or page-range strings.
        content_writer: ``PdfWriter`` handed to the extraction helpers.

    Returns:
        The list of truthy TOC entries returned by the extraction helpers,
        in extraction order.
    """
    toc_entries = []
    # Running page counter threaded through every extraction call, across
    # all input files.
    current_page = 0

    for pdf_info in pdf_inputs:
        # The reference box is opaque state that the extraction helpers
        # receive and return; it starts from None for each input file.
        reference_box = None
        reader = PdfReader(pdf_info["file"])
        outline_tree = build_outline_tree(reader)

        for entry in pdf_info["sections"]:
            if parse_page_range(entry):
                # Explicit page range
                current_page, reference_box, toc_entry = extract_page_range(
                    entry, reader, content_writer, current_page,
                    reference_box, HEADER_CROP, FOOTER_CROP
                )
            else:
                # Section prefix
                current_page, reference_box, toc_entry = extract_section_prefix(
                    entry, reader, content_writer, current_page,
                    reference_box, outline_tree, HEADER_CROP, FOOTER_CROP
                )

            if toc_entry:
                toc_entries.append(toc_entry)

    return toc_entries


def _add_bookmarks(writer, toc_entries, page_offset):
    """Add one outline item per TOC entry, nested by ``entry["level"]``.

    Args:
        writer: ``PdfWriter`` that already contains the TOC and content pages.
        toc_entries: Entries with ``"title"``, ``"page"`` and ``"level"`` keys.
        page_offset: Number of pages placed in front of the content pages.
    """
    # Most recent bookmark per level that is still an ancestor candidate.
    open_levels = {}
    for entry in toc_entries:
        level = entry["level"]
        bookmark = writer.add_outline_item(
            title=entry["title"],
            # entry["page"] is shifted down by one and then past the TOC pages.
            page_number=(entry["page"] - 1) + page_offset,
            parent=open_levels.get(level - 1),
        )
        # Once the outline returns to this level, deeper bookmarks belong to
        # a finished branch and must not become parents of later entries.
        for stale in [lvl for lvl in open_levels if lvl > level]:
            del open_levels[stale]
        open_levels[level] = bookmark


def main(pdf_inputs, output_pdf):
    """Build the aggregated PDF and write it to *output_pdf*.

    Args:
        pdf_inputs: Iterable of mappings with a ``"file"`` path and a
            ``"sections"`` list (section prefixes or page-range strings).
        output_pdf: Path of the PDF file to create; it is opened in binary
            write mode.
    """
    content_writer = PdfWriter()
    toc_entries = _extract_all(pdf_inputs, content_writer)

    # Create TOC PDF
    toc_pdf = create_toc_pdf(toc_entries)
    toc_page_count = len(toc_pdf.pages)

    final_writer = PdfWriter()

    # TOC pages first, then the extracted content pages
    for page in toc_pdf.pages:
        final_writer.add_page(page)
    for page in content_writer.pages:
        final_writer.add_page(page)

    _add_bookmarks(final_writer, toc_entries, toc_page_count)

    with open(output_pdf, "wb") as f:
        final_writer.write(f)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Extract sections and page ranges from PDFs and aggregate into a new PDF with TOC and bookmarks."
    )
    parser.add_argument(
        "-o", "--output",
        required=True,
        help="Output PDF file"
    )
    # NOTE(review): --inputs is optional, so args.inputs may be None when it
    # is omitted; confirm that parse_inputs handles that case.
    parser.add_argument(
        "-i", "--inputs",
        nargs="+",
        help="Inputs in the form file.pdf:section1,section2 or file.pdf:@1-10"
    )
    args = parser.parse_args()
    pdf_inputs = parse_inputs(args)
    main(pdf_inputs, args.output)