Improved pdfcreator
1. Use a CLI; 2. Refactor the code
This commit is contained in:
99
python/pdfcreator/driver.py
Normal file
99
python/pdfcreator/driver.py
Normal file
@@ -0,0 +1,99 @@
|
||||
# driver.py
|
||||
from pypdf import PageObject
|
||||
from pdfaggregator import *
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
import argparse
|
||||
|
||||
|
||||
# Sample input specification: one dict per source PDF. "file" is the path of
# the PDF and "sections" lists the entries to extract from it. An entry such
# as "@20-30" uses the "@start-end" form that the CLI help describes for page
# ranges; other entries (e.g. "5") are handled as section prefixes by main().
# NOTE(review): PDF_INPUTS and OUTPUT_PDF are not referenced by the code
# visible in this file — confirm whether they are still needed as defaults.
PDF_INPUTS = [
    {"file": "pdfcreator/input.pdf", "sections": ["5", "@20-30"]},
    # Example of an additional source PDF:
    # {"file": "pdfcreator/input2.pdf", "sections": ["3.1"]},
]

OUTPUT_PDF = "pdfcreator/extracted_sections.pdf"

# Crop ratios forwarded by main() to the extraction helpers.
# NOTE(review): presumably fractions of the page height trimmed from the top
# (header) and bottom (footer) — confirm against pdfaggregator.
HEADER_CROP = 0.1
FOOTER_CROP = 0.03
|
||||
|
||||
|
||||
def main(pdf_inputs, output_pdf):
    """Aggregate selected parts of several PDFs into one PDF with a TOC.

    Every requested entry of every input PDF is extracted into a shared
    content writer. A table-of-contents PDF built from the collected TOC
    entries is then placed in front of the content pages, outline bookmarks
    are added for the TOC entries, and the result is written to
    ``output_pdf``.

    Args:
        pdf_inputs: Iterable of dicts with a ``"file"`` key (path handed to
            ``PdfReader``) and a ``"sections"`` key (iterable of entry
            strings). An entry for which ``parse_page_range`` returns a
            truthy value is extracted as an explicit page range; any other
            entry is passed to ``extract_section_prefix`` together with the
            PDF's outline tree.
        output_pdf: Path of the PDF file to write (opened in binary write
            mode).
    """
    content_writer = PdfWriter()
    toc_entries = []
    current_page = 0

    for pdf_info in pdf_inputs:
        # The reference box starts empty for every input file and is threaded
        # through the extraction helpers, which return its updated value.
        reference_box = None
        reader = PdfReader(pdf_info["file"])
        outline_tree = build_outline_tree(reader)

        for entry in pdf_info["sections"]:
            if parse_page_range(entry):
                # Explicit page range
                current_page, reference_box, toc_entry = extract_page_range(
                    entry, reader, content_writer, current_page, reference_box,
                    HEADER_CROP, FOOTER_CROP
                )
            else:
                # Section prefix
                current_page, reference_box, toc_entry = extract_section_prefix(
                    entry, reader, content_writer, current_page, reference_box,
                    outline_tree, HEADER_CROP, FOOTER_CROP
                )

            # The helpers may return a falsy TOC entry; only keep real ones.
            if toc_entry:
                toc_entries.append(toc_entry)

    # Create TOC PDF
    toc_pdf = create_toc_pdf(toc_entries)
    toc_page_count = len(toc_pdf.pages)

    # Final document: TOC pages first, then the extracted content pages.
    final_writer = PdfWriter()
    for source in (toc_pdf, content_writer):
        for page in source.pages:
            final_writer.add_page(page)

    # Add bookmarks. bookmark_stack maps an outline level to the most recent
    # bookmark created at that level on the current branch.
    bookmark_stack = {}
    for entry in toc_entries:
        level = entry["level"]
        bookmark = final_writer.add_outline_item(
            title=entry["title"],
            # entry["page"] is treated as a 1-based page number within the
            # content pages; shift it past the TOC pages to obtain the
            # 0-based index in the final document.
            page_number=(entry["page"] - 1) + toc_page_count,
            parent=bookmark_stack.get(level - 1),
        )
        # Forget bookmarks at this level or deeper: they belong to a branch
        # that has just been closed and must not become the parent of a
        # later, unrelated entry.
        bookmark_stack = {
            known_level: known_bookmark
            for known_level, known_bookmark in bookmark_stack.items()
            if known_level < level
        }
        bookmark_stack[level] = bookmark

    with open(output_pdf, "wb") as f:
        final_writer.write(f)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, turn them into the
    # input structure consumed by main() and run the aggregation.
    parser = argparse.ArgumentParser(
        description=(
            "Extract sections and page ranges from PDFs and aggregate "
            "into a new PDF with TOC and bookmarks."
        )
    )

    parser.add_argument(
        "-o", "--output",
        required=True,
        help="Output PDF file"
    )

    # NOTE(review): --inputs is optional, so args.inputs is None when it is
    # omitted; how parse_inputs() handles that is not visible in this file —
    # confirm before relying on it.
    parser.add_argument(
        "-i", "--inputs",
        nargs="+",
        help="Inputs in the form file.pdf:section1,section2 or file.pdf:@1-10"
    )

    args = parser.parse_args()
    pdf_inputs = parse_inputs(args)

    main(pdf_inputs, args.output)
|
||||
Reference in New Issue
Block a user