Files
Code/python/pdfcreator/driver.py
local a5934e45b2 improved pdfcreator
1- use CLI
2- refactor code
2026-01-19 23:06:56 +00:00

100 lines
2.7 KiB
Python

# driver.py
from pypdf import PageObject
from pdfaggregator import *
from pypdf import PdfReader, PdfWriter
import argparse
# Example input specification in the shape main() consumes: a list of dicts,
# each with a "file" path and a list of "sections" entries. Going by the CLI
# help text, an entry such as "@20-30" is a page range; other entries are
# presumably section prefixes -- confirm against parse_page_range().
# NOTE(review): not referenced anywhere else in the visible part of this file.
PDF_INPUTS = [
{"file": "pdfcreator/input.pdf", "sections": ["5", "@20-30"]},
# A further input would follow the same shape, for example:
# {"file": "pdfcreator/input2.pdf",
# "sections": ["3.1"]}
]
# Example output path.
# NOTE(review): not referenced in the visible code; main() receives the
# output path from its caller (the CLI passes the --output value).
OUTPUT_PDF = "pdfcreator/extracted_sections.pdf"
# Crop ratios forwarded by main() to extract_page_range() and
# extract_section_prefix(). Presumably the fraction of the page trimmed from
# the top (header) and bottom (footer) -- TODO confirm in pdfaggregator.
HEADER_CROP = 0.1
FOOTER_CROP = 0.03
def main(pdf_inputs, output_pdf):
    """Aggregate requested sections/page ranges into one PDF with TOC and bookmarks.

    Every entry of every input PDF is extracted into a shared content writer.
    A table-of-contents PDF is then built from the collected TOC entries,
    placed in front of the content pages, and an outline (bookmark) item is
    added for each TOC entry before the result is written to disk.

    Args:
        pdf_inputs: Iterable of mappings, each providing a ``"file"`` key
            (path passed to ``PdfReader``) and a ``"sections"`` key (iterable
            of entries). An entry for which ``parse_page_range`` returns a
            truthy value is handled as an explicit page range; any other
            entry is handled as a section prefix resolved via the outline.
        output_pdf: Path of the PDF file to write.
    """
    content_writer = PdfWriter()
    toc_entries = []
    current_page = 0

    for pdf_info in pdf_inputs:
        # The reference box is reset for each input file and threaded through
        # the extract helpers, which hand back the value to use next.
        reference_box = None
        reader = PdfReader(pdf_info["file"])
        outline_tree = build_outline_tree(reader)

        for entry in pdf_info["sections"]:
            if parse_page_range(entry):
                # Explicit page range
                current_page, reference_box, toc_entry = extract_page_range(
                    entry, reader, content_writer, current_page, reference_box,
                    HEADER_CROP, FOOTER_CROP
                )
            else:
                # Section prefix
                current_page, reference_box, toc_entry = extract_section_prefix(
                    entry, reader, content_writer, current_page, reference_box,
                    outline_tree, HEADER_CROP, FOOTER_CROP
                )
            # The helpers may return a falsy TOC entry; such entries are skipped.
            if toc_entry:
                toc_entries.append(toc_entry)

    # Create TOC PDF
    toc_pdf = create_toc_pdf(toc_entries)
    toc_page_count = len(toc_pdf.pages)

    final_writer = PdfWriter()
    # TOC pages come first, followed by the extracted content pages.
    for page in toc_pdf.pages:
        final_writer.add_page(page)
    for page in content_writer.pages:
        final_writer.add_page(page)

    # Add bookmarks. The most recent bookmark per level is remembered so that
    # an entry can be nested under the bookmark one level above it.
    bookmark_by_level = {}
    for entry in toc_entries:
        level = entry["level"]
        # Forget bookmarks deeper than this entry so that a later entry cannot
        # be attached to a parent left over from an earlier, unrelated branch.
        for stale_level in [lvl for lvl in bookmark_by_level if lvl > level]:
            del bookmark_by_level[stale_level]
        bookmark_by_level[level] = final_writer.add_outline_item(
            title=entry["title"],
            # entry["page"] is shifted by -1 (presumably it is 1-based within
            # the content pages -- confirm in pdfaggregator) and then offset
            # by the TOC pages that precede the content.
            page_number=(entry["page"] - 1) + toc_page_count,
            parent=bookmark_by_level.get(level - 1),
        )

    with open(output_pdf, "wb") as f:
        final_writer.write(f)
if __name__ == "__main__":
    # Command-line entry point: collect the output path and the input
    # specifications, convert them with parse_inputs(), and run main().
    cli = argparse.ArgumentParser(
        description="Extract sections and page ranges from PDFs and aggregate into a new PDF with TOC and bookmarks.",
    )
    cli.add_argument("-o", "--output", required=True, help="Output PDF file")
    cli.add_argument(
        "-i",
        "--inputs",
        nargs="+",
        help="Inputs in the form file.pdf:section1,section2 or file.pdf:@1-10",
    )
    options = cli.parse_args()
    main(parse_inputs(options), options.output)