improved pdfcreator
1- use CLI 2- refactor code
This commit is contained in:
231
python/pdfcreator/pdfaggregator.py
Normal file
231
python/pdfcreator/pdfaggregator.py
Normal file
@@ -0,0 +1,231 @@
|
||||
# pdfaggregator.py
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pypdf import PdfReader, PdfWriter, PageObject
|
||||
from reportlab.pdfgen import canvas
|
||||
from reportlab.lib.pagesizes import LETTER
|
||||
|
||||
# -----------------------------
|
||||
# Parsing / Section Utilities
|
||||
# -----------------------------
|
||||
|
||||
|
||||
def parse_page_range(entry):
    """Return zero-based page indices for an explicit ``@`` page-range entry.

    Accepted forms are ``@A-B`` (1-based, inclusive on both ends, e.g.
    ``@1-10``) and ``@A`` (a single page).

    Args:
        entry: Raw section entry string.

    Returns:
        A list of zero-based page indices, or ``None`` when ``entry`` does not
        start with ``@`` or the range is malformed. A ``[WARN]`` line is
        printed for malformed ranges.
    """
    if not entry.startswith("@"):
        return None

    first, dash, last = entry[1:].partition("-")
    try:
        start = int(first)
        # No dash means a single page: "@5" is the same as "@5-5".
        end = int(last) if dash else start
    except ValueError:
        print(f"[WARN] Invalid page range: {entry}")
        return None

    # Pages are 1-based on the command line; a start below 1 would turn into
    # a negative index (i.e. "count from the end") and a reversed range would
    # silently select nothing.
    if start < 1 or end < start:
        print(f"[WARN] Invalid page range: {entry}")
        return None

    return list(range(start - 1, end))
|
||||
|
||||
|
||||
def strip_numbering(title):
    """Remove a leading section number from a heading.

    Handles dotted numbers with or without a trailing dot, e.g.
    ``'1.3 Background'``, ``'1.3. Background'`` and ``'2. Methods'``.
    The number must be followed by whitespace, so titles such as
    ``'3D Graphics'`` are returned unchanged.

    Args:
        title: Heading text.

    Returns:
        The heading without its leading numbering.
    """
    return re.sub(r'^\s*\d+(\.\d+)*\.?\s+', '', title)
|
||||
|
||||
|
||||
def crop_page(page, top_ratio=0.0, bottom_ratio=0.0):
    """Crop the top and bottom of a page by fractions of its height.

    The crop box is derived from the page's media box and written to
    ``page.cropbox`` in place; the media box itself is not modified.

    Args:
        page: Page object exposing ``mediabox`` (unpackable into
            ``llx, lly, urx, ury``) and ``cropbox`` with assignable
            ``lower_left`` / ``upper_right`` corners.
        top_ratio: Fraction of the page height to remove from the top.
        bottom_ratio: Fraction of the page height to remove from the bottom.

    Raises:
        ValueError: If a ratio is negative or the two ratios together remove
            the whole page (sum >= 1), which would yield an inverted or
            empty crop box.
    """
    if top_ratio < 0 or bottom_ratio < 0 or top_ratio + bottom_ratio >= 1:
        raise ValueError(
            f"Invalid crop ratios: top={top_ratio}, bottom={bottom_ratio}"
        )

    llx, lly, urx, ury = page.mediabox
    height = ury - lly
    new_lly = lly + height * bottom_ratio
    new_ury = ury - height * top_ratio
    page.cropbox.lower_left = (llx, new_lly)
    page.cropbox.upper_right = (urx, new_ury)
|
||||
|
||||
# -----------------------------
|
||||
# Outline / Section Tree
|
||||
# -----------------------------
|
||||
|
||||
|
||||
def find_section_with_level(nodes, prefix, level=0):
    """Locate the first outline node whose title starts with ``prefix``.

    The tree is searched depth-first in document order: a node is tested
    before its children, and its children before its next sibling.

    Args:
        nodes: List of outline nodes (dicts with ``"title"`` and optional
            ``"children"``).
        prefix: Title prefix to look for.
        level: Depth assigned to the nodes in ``nodes``.

    Returns:
        ``(node, depth)`` for the first match, or ``(None, None)`` when no
        title matches.
    """
    exhausted = object()
    # Each frame is (iterator over one sibling list, depth of that list).
    frontier = [(iter(nodes), level)]
    while frontier:
        siblings, depth = frontier[-1]
        candidate = next(siblings, exhausted)
        if candidate is exhausted:
            frontier.pop()
            continue
        if candidate["title"].startswith(prefix):
            return candidate, depth
        # Descend before moving on to the next sibling.
        frontier.append((iter(candidate.get("children", [])), depth + 1))
    return None, None
|
||||
|
||||
|
||||
def collect_subtree_pages(node, pages=None):
    """Gather the ``"page"`` value of ``node`` and of every descendant.

    Values are appended in pre-order (a node before its children, children
    in their listed order).

    Args:
        node: Outline node (dict with ``"page"`` and optional ``"children"``).
        pages: Optional list to append to; a new list is created when omitted.

    Returns:
        The list the pages were appended to.
    """
    collected = [] if pages is None else pages
    pending = [node]
    while pending:
        current = pending.pop()
        collected.append(current["page"])
        # Push children reversed so they are popped in their original order.
        pending.extend(reversed(current.get("children", [])))
    return collected
|
||||
|
||||
|
||||
def flatten_outline_pages(nodes, pages=None):
    """List the ``"page"`` value of every node in an outline tree.

    Values are appended in pre-order (a node before its children, siblings
    in their listed order).

    Args:
        nodes: List of outline nodes (dicts with ``"page"`` and optional
            ``"children"``).
        pages: Optional list to append to; a new list is created when omitted.

    Returns:
        The list the pages were appended to.
    """
    flat = [] if pages is None else pages
    # Reverse so that popping from the end yields nodes in document order.
    pending = list(nodes)[::-1]
    while pending:
        current = pending.pop()
        flat.append(current["page"])
        pending.extend(reversed(current.get("children", [])))
    return flat
|
||||
|
||||
|
||||
def find_end_page(target_node, outline_tree, total_pages):
    """Return the exclusive end page of a section including its subsections.

    The end is the smallest outline page that lies after every page of the
    section's own subtree, i.e. where the next section starts.

    Args:
        target_node: Outline node of the section.
        outline_tree: Complete outline tree (list of nodes).
        total_pages: Number of pages in the document; used when no later
            outline entry exists.

    Returns:
        Zero-based page index to stop at (exclusive).
    """
    # A page of None (an outline entry whose destination could not be
    # resolved) cannot be ordered against integers, so such entries are
    # ignored rather than letting max()/comparison raise TypeError.
    subtree_pages = [
        p for p in collect_subtree_pages(target_node) if p is not None
    ]
    if not subtree_pages:
        return total_pages

    last_page = max(subtree_pages)
    later_starts = (
        p for p in flatten_outline_pages(outline_tree)
        if p is not None and p > last_page
    )
    return min(later_starts, default=total_pages)
|
||||
|
||||
|
||||
def build_outline_tree(reader):
    """Build a normalized outline tree from ``reader.outline``.

    ``reader.outline`` is expected to be a list in which a nested list holds
    the children of the item that precedes it.

    Each node has the form::

        {
            "title": str,       # stripped title, "" when the item has none
            "page": int,        # value of reader.get_destination_page_number
            "children": [ ... ]
        }

    Args:
        reader: Object exposing ``outline`` and
            ``get_destination_page_number(item)`` (presumably a pypdf
            ``PdfReader``).

    Returns:
        List of top-level nodes; an empty list when the outline is missing,
        empty, or cannot be read (a ``[WARN]`` line is printed in the last
        case).
    """
    def walk(items):
        tree = []
        for item in items:
            if isinstance(item, list):
                children = walk(item)
                if tree:
                    # Extend rather than assign so that several consecutive
                    # nested lists do not overwrite each other.
                    tree[-1]["children"].extend(children)
                else:
                    # No preceding item to attach to: keep the entries at
                    # this level instead of silently dropping them.
                    tree.extend(children)
            else:
                tree.append({
                    # An item without a title would otherwise crash .strip().
                    "title": (item.title or "").strip(),
                    "page": reader.get_destination_page_number(item),
                    "children": [],
                })
        return tree

    try:
        outline = reader.outline
    except Exception as exc:
        # Best effort: a broken outline must not abort the whole run, but
        # the failure should at least be visible.
        print(f"[WARN] Could not read PDF outline: {exc}")
        return []

    if not outline:
        return []

    return walk(outline)
|
||||
|
||||
|
||||
# -----------------------------
|
||||
# TOC Generation
|
||||
# -----------------------------
|
||||
|
||||
|
||||
def create_toc_pdf(toc_entries, heading="Table of Contents"):
    """Generate a flat, unnumbered table-of-contents PDF in memory.

    The heading is drawn on the first page, followed by one line per entry
    (title with its leading numbering stripped, a dot leader, and the page
    number). A new page is started only when another entry still has to be
    drawn, so the result never ends with a page that contains no entry.

    Args:
        toc_entries: Iterable of dicts with ``"title"`` and ``"page"`` keys.
        heading: Text drawn at the top of the first page.

    Returns:
        A ``PdfReader`` over the generated in-memory PDF.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=LETTER)

    c.setFont("Helvetica-Bold", 16)
    c.drawString(50, 750, heading)

    c.setFont("Helvetica", 12)
    y = 720

    for entry in toc_entries:
        # Break the page lazily, just before an entry that no longer fits;
        # breaking after drawing would leave an empty trailing page whenever
        # the last entry happens to fill the page.
        if y < 50:
            c.showPage()
            # Re-select the body font on the new page.
            c.setFont("Helvetica", 12)
            y = 750

        line = f"{strip_numbering(entry['title'])} ........................ {entry['page']}"
        c.drawString(50, y, line)
        y -= 18

    c.save()
    buffer.seek(0)
    return PdfReader(buffer)
|
||||
|
||||
|
||||
def extract_page_range(entry, reader, content_writer, current_page, REFERENCE_BOX, header_crop=0.05, footer_crop=0.03):
    """Extract the pages of an explicit ``@page-range`` entry.

    Every page in range is cropped, resized to the shared reference box and
    appended to ``content_writer``. Indices outside the document are skipped.

    Args:
        entry: Section entry, e.g. ``"@1-10"``.
        reader: Source document exposing ``pages``.
        content_writer: Writer exposing ``add_page(page)``.
        current_page: Number of pages already written to ``content_writer``.
        REFERENCE_BOX: ``(lower_left, upper_right)`` applied to every page,
            or ``None`` to take it from the first page extracted here.
        header_crop: Fraction of the page height cropped from the top.
        footer_crop: Fraction of the page height cropped from the bottom.

    Returns:
        ``(current_page, REFERENCE_BOX, toc_entry)``. ``toc_entry`` is
        ``None`` when ``entry`` is not a valid page range or when none of the
        requested pages exists in ``reader``.
    """
    page_indices = parse_page_range(entry)
    if not page_indices:
        return current_page, REFERENCE_BOX, None  # nothing extracted

    first_output_page = current_page + 1
    total_pages = len(reader.pages)

    for p in page_indices:
        if p < 0 or p >= total_pages:
            continue
        page = reader.pages[p]
        crop_page(page, header_crop, footer_crop)
        if REFERENCE_BOX is None:
            REFERENCE_BOX = (page.cropbox.lower_left, page.cropbox.upper_right)
        # Force every page onto the reference box so that all output pages
        # share one size; this overrides the per-page crop computed above.
        page.mediabox.lower_left = REFERENCE_BOX[0]
        page.mediabox.upper_right = REFERENCE_BOX[1]
        page.cropbox.lower_left = REFERENCE_BOX[0]
        page.cropbox.upper_right = REFERENCE_BOX[1]
        content_writer.add_page(page)
        current_page += 1

    if current_page + 1 == first_output_page:
        # Every requested page was out of range: a TOC entry would point at
        # whatever content happens to be written next.
        print(f"[WARN] Page range {entry} is outside the document ({total_pages} pages)")
        return current_page, REFERENCE_BOX, None

    toc_entry = {"title": f"Pages {entry[1:]}",
                 "page": first_output_page, "level": 0}
    return current_page, REFERENCE_BOX, toc_entry
|
||||
|
||||
|
||||
def extract_section_prefix(entry, reader, content_writer, current_page, REFERENCE_BOX, outline_tree, header_crop=0.05, footer_crop=0.03):
    """Extract the pages of an outline section selected by title prefix.

    The section runs from its own start page up to (but excluding) the page
    returned by ``find_end_page``. Each page is resized to the shared
    reference box and appended to ``content_writer``.

    Args:
        entry: Title prefix of the section, e.g. ``"1.3"``.
        reader: Source document exposing ``pages``.
        content_writer: Writer exposing ``add_page(page)``.
        current_page: Number of pages already written to ``content_writer``.
        REFERENCE_BOX: ``(lower_left, upper_right)`` applied to every page,
            or ``None`` to take it from the first page extracted here.
        outline_tree: Outline tree as produced by ``build_outline_tree``.
        header_crop: Fraction of the page height cropped from the top of the
            section's first page.
        footer_crop: Fraction of the page height cropped from the bottom.

    Returns:
        ``(current_page, REFERENCE_BOX, toc_entry)``. ``toc_entry`` is
        ``None`` when the section is not found, has no usable start page, or
        contributes no pages.
    """
    target, level = find_section_with_level(outline_tree, entry)
    if not target:
        # reader may be backed by an in-memory stream without a name.
        source_name = getattr(getattr(reader, "stream", None), "name", "")
        print(
            f"[WARN] Section {entry} not found in PDF {source_name}")
        return current_page, REFERENCE_BOX, None

    start_page = target["page"]
    if start_page is None:
        # Without a start page range() below would raise TypeError.
        print(f"[WARN] Section {entry} has no resolvable start page")
        return current_page, REFERENCE_BOX, None

    end_page = find_end_page(target, outline_tree, len(reader.pages))
    first_output_page = current_page + 1

    for i, p in enumerate(range(start_page, end_page)):
        page = reader.pages[p]
        crop_page(page, header_crop if i == 0 else 0, footer_crop)
        if REFERENCE_BOX is None:
            REFERENCE_BOX = (page.cropbox.lower_left, page.cropbox.upper_right)
        # Force every page onto the reference box so that all output pages
        # share one size; this overrides the per-page crop computed above.
        page.mediabox.lower_left = REFERENCE_BOX[0]
        page.mediabox.upper_right = REFERENCE_BOX[1]
        page.cropbox.lower_left = REFERENCE_BOX[0]
        page.cropbox.upper_right = REFERENCE_BOX[1]
        content_writer.add_page(page)
        current_page += 1

    if current_page + 1 == first_output_page:
        # Empty page span: do not emit a TOC entry that points at nothing.
        return current_page, REFERENCE_BOX, None

    toc_entry = {"title": target["title"],
                 "page": first_output_page, "level": level}
    return current_page, REFERENCE_BOX, toc_entry
|
||||
|
||||
|
||||
def parse_inputs(args):
    """Parse CLI positional arguments into the PDF_INPUTS structure.

    Each item of ``args.inputs`` has the form ``file.pdf:section1,section2``.
    The file path ends at the first colon, except that the colon of a Windows
    drive prefix (``C:\\`` or ``C:/``) is treated as part of the path.

    Args:
        args: Object exposing ``inputs``, an iterable of strings.

    Returns:
        A list of ``{"file": str, "sections": [str, ...]}`` dicts, one per
        input item. Section names are stripped and empty names are dropped.

    Raises:
        ValueError: If an item has no ``:`` separator or an empty file path.
    """
    pdf_inputs = []

    for item in args.inputs:
        # Skip over a drive prefix so "C:\docs\a.pdf:1.2" is not split
        # right after the drive letter.
        has_drive = (
            len(item) > 2
            and item[0].isalpha()
            and item[1] == ":"
            and item[2] in "\\/"
        )
        separator = item.find(":", 2 if has_drive else 0)
        file_path = item[:separator] if separator != -1 else ""

        if not file_path.strip():
            raise ValueError(
                f"Invalid input '{item}'. Expected format: file.pdf:section1,section2"
            )

        sections = item[separator + 1:]
        section_list = [s.strip() for s in sections.split(",") if s.strip()]

        pdf_inputs.append({
            "file": file_path,
            "sections": section_list,
        })

    return pdf_inputs
|
||||
Reference in New Issue
Block a user