improved pdfcreator

1- use CLI
2- refactor code
This commit is contained in:
local
2026-01-19 23:06:56 +00:00
parent 6c4b78f274
commit a5934e45b2
8 changed files with 810 additions and 113 deletions

View File

@@ -1,23 +1,50 @@
from pypdf import PdfReader
from io import BytesIO
from reportlab.lib.pagesizes import LETTER
import re
from pypdf import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from pypdf import PdfReader, PdfWriter, PageObject
from reportlab.lib.pagesizes import LETTER
from io import BytesIO
# ================= CONFIG =================
# ----------- CONFIG -------------
# Each PDF_INPUTS item names a source "file" and the "sections" to pull from it.
# A section entry starting with '@' is parsed by parse_page_range as an explicit
# 1-based page range; any other entry is looked up in the PDF outline.
# NOTE(review): the list mixes entries from two revisions of this script (the
# same input files appear twice with different sections) - confirm that all
# four entries are intended.
PDF_INPUTS = [
{"file": "pdfcreator/input.pdf", "sections": ["1", "2.2", "3"]},
{"file": "pdfcreator/input2.pdf", "sections": ["3", "4"]},
{"file": "pdfcreator/input.pdf", "sections": ["1.3", "2.1", "@111-114"]},
{"file": "pdfcreator/input2.pdf", "sections": ["3.2"]},
]
# NOTE(review): OUTPUT_PDF is assigned twice; the second assignment is the one
# in effect when the output file is opened.
OUTPUT_PDF = "pdfcreator/combined_sections.pdf"
OUTPUT_PDF = "pdfcreator/extracted_sections.pdf"
# Crop ratios are fractions of the page height passed to crop_page.
# NOTE(review): both constants are reassigned a few lines below, so the values
# on the next two lines are overridden.
HEADER_CROP = 0.12 # top of first page of section
FOOTER_CROP = 0.06 # bottom of all pages
# =========================================
# Cropping ratios
HEADER_CROP = 0.1 # top of first page of section
FOOTER_CROP = 0.0 # bottom of pages
# --------------------------------
def strip_numbering(title):
    """
    Remove leading section numbering from a heading string.

    Accepts dotted numbers with or without a trailing period and ignores
    leading whitespace, so '1.3 Background', '2. Methods' and
    '  4.1.2 Results' become 'Background', 'Methods' and 'Results'.

    A title without numbering, or one that consists only of a number with no
    following text (e.g. '1.3'), is returned unchanged.
    """
    # The number must be followed by whitespace so that titles such as
    # '3D Printing' are not mistaken for numbered headings.
    return re.sub(r'^\s*\d+(?:\.\d+)*\.?\s+', '', title)
# ----- HELPER FUNCTIONS ---------
# ---------- Outline utilities ------------
def parse_page_range(entry):
    """
    Parse an explicit page-range entry into zero-based page indices.

    Page ranges must be prefixed with '@'. Both '@1-10' (pages 1 through 10,
    inclusive) and a single page such as '@7' are accepted; page numbers in
    the entry are 1-based.

    Returns a list of zero-based indices for a valid range. Returns None when
    the entry has no '@' prefix (the caller then treats it as a section
    prefix) or when the range is malformed or reversed; the latter two cases
    also print a warning.
    """
    if not entry.startswith("@"):
        return None  # not a page range

    spec = entry[1:].strip()  # drop the '@'
    first, dash, last = spec.partition("-")
    try:
        start = int(first)
        # No dash means a single page: the range starts and ends on it.
        end = int(last) if dash else start
    except ValueError:
        print(f"[WARN] Invalid page range: {entry}")
        return None

    # A reversed range would yield an empty list, which callers testing the
    # result for truthiness cannot tell apart from "not a page range".
    if end < start:
        print(f"[WARN] Invalid page range: {entry}")
        return None

    # start becomes zero-based; end stays as-is because range() excludes it.
    return list(range(start - 1, end))
def build_outline_tree(reader):
@@ -36,14 +63,14 @@ def build_outline_tree(reader):
return _build(reader.outline)
def find_section(nodes, title):
    """
    Depth-first search for the first outline node matching ``title``.

    A node matches when its "title" equals ``title`` exactly or starts with
    ``title`` followed by a space. Each node is a dict with at least "title"
    and "children" keys.

    Returns the matching node, or None when nothing matches.
    """
    for node in nodes:
        if node["title"] == title or node["title"].startswith(title + " "):
            return node
        found = find_section(node["children"], title)
        if found:
            return found
    return None


def find_section_with_level(nodes, prefix, level=0):
    """
    Depth-first search for the first outline node whose title starts with
    ``prefix``, also reporting how deep in the tree it was found.

    ``level`` is the depth of ``nodes`` itself (0 for the top of the outline)
    and is incremented for each descent into "children".

    Returns a ``(node, level)`` tuple, or ``(None, None)`` when nothing
    matches, so the result can always be unpacked into two names.
    """
    for node in nodes:
        if node["title"].startswith(prefix):
            return node, level
        found = find_section_with_level(node["children"], prefix, level + 1)
        # Only the node half of the tuple says whether the subtree matched.
        if found[0] is not None:
            return found
    return None, None
def collect_subtree_pages(node, pages=None):
@@ -66,138 +93,178 @@ def flatten_outline_pages(nodes, pages=None):
def find_end_page(target_node, outline_tree, total_pages):
    """
    Return the page index at which extraction of ``target_node`` should stop.

    The result is the smallest page referenced anywhere in the outline that is
    greater than the last page referenced inside the target's own subtree. If
    no outline entry lies beyond the subtree, ``total_pages`` is returned.
    Callers use the value as the exclusive end of a ``range``.
    """
    last_page = max(collect_subtree_pages(target_node))
    # The smallest later outline page is the same value the previous
    # sort-and-scan loop returned; it just needs no intermediate sorted list.
    return min(
        (p for p in flatten_outline_pages(outline_tree) if p > last_page),
        default=total_pages,
    )
# ---------- Page manipulation ------------
def crop_page(page, top_ratio=0.0, bottom_ratio=0.0):
    """
    Shrink a page's crop box by a fraction of its media-box height.

    ``top_ratio`` is removed from the top edge and ``bottom_ratio`` from the
    bottom edge; both are fractions of the media-box height. The horizontal
    extent is left unchanged. The page is modified in place and nothing is
    returned.

    Raises ValueError if either ratio is negative or if the two ratios
    together leave no visible height.
    """
    # A negative ratio would push the crop box outside the media box instead
    # of trimming the page, so reject it up front.
    if top_ratio < 0 or bottom_ratio < 0:
        raise ValueError("Invalid crop ratios: ratios must not be negative")

    llx, lly, urx, ury = page.mediabox
    height = ury - lly
    new_lly = lly + height * bottom_ratio
    new_ury = ury - height * top_ratio
    if new_ury <= new_lly:
        raise ValueError("Invalid crop ratios: page height would be negative")

    page.cropbox.lower_left = (llx, new_lly)
    page.cropbox.upper_right = (urx, new_ury)
def normalize_page_size(page, reference_box):
    """
    Force a page's MediaBox and CropBox to match ``reference_box``.

    ``reference_box`` must expose ``lower_left`` and ``upper_right``; both
    corners are copied onto the page's media box and crop box. The page is
    modified in place.
    """
    page.mediabox.lower_left = reference_box.lower_left
    page.mediabox.upper_right = reference_box.upper_right
    page.cropbox.lower_left = reference_box.lower_left
    page.cropbox.upper_right = reference_box.upper_right


# ---------- TOC generation ---------------
def create_toc_pdf(toc_entries, heading="Table of Contents"):
    """
    Render a flat table of contents and return it as a PdfReader.

    ``toc_entries`` is an iterable of dicts with "title" and "page" keys.
    Leading section numbering is stripped from each title before drawing.
    ``heading`` is drawn in bold at the top of the first page; it defaults to
    "Table of Contents" so the function can also be called with the entries
    alone.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=LETTER)

    c.setFont("Helvetica-Bold", 16)
    c.drawString(50, 750, heading)

    c.setFont("Helvetica", 12)
    y = 720
    for entry in toc_entries:
        # Break before drawing, so a new page is only started when there is
        # another entry to put on it.
        if y < 50:
            c.showPage()
            # The font is set again because it does not carry over to the
            # new page.
            c.setFont("Helvetica", 12)
            y = 750
        line = f"{strip_numbering(entry['title'])} ........................ {entry['page']}"
        c.drawString(50, y, line)  # flat: no indentation
        y -= 18

    c.save()
    buffer.seek(0)
    return PdfReader(buffer)
# --------- MAIN PROCESS ----------
# Walks every configured input PDF, selects pages for each requested entry
# (an explicit '@' page range or an outline section), crops and resizes them,
# appends them to a PdfWriter and records one TOC entry per extracted chunk.
# NOTE(review): this region carries two revisions of the loop at once
# (writer / current_page_index / find_section next to content_writer /
# current_page / find_section_with_level) and has lost its indentation, so the
# nesting cannot be read off the text - confirm which lines are live before
# changing behavior.
writer = PdfWriter()
toc_entries = [] # To build TOC later
current_page_index = 0
# ================= MAIN ===================
content_writer = PdfWriter()
toc_entries = []
# Number of pages appended to content_writer so far (incremented per page).
current_page = 0
# Box that the extracted pages are resized to; captured from the first page processed.
REFERENCE_BOX = None
for pdf_info in PDF_INPUTS:
file_path = pdf_info["file"]
sections_to_extract = pdf_info["sections"]
reader = PdfReader(file_path)
reader = PdfReader(pdf_info["file"])
outline_tree = build_outline_tree(reader)
total_pages = len(reader.pages)
for section_title in sections_to_extract:
target = find_section(outline_tree, section_title)
if not target:
print(f"[WARN] Section '{section_title}' not found in {file_path}")
continue
for entry in pdf_info["sections"]:
start_page = target["page"]
end_page = find_end_page(target, outline_tree, total_pages)
# parse_page_range returns zero-based indices for '@' entries, else None.
page_indices = parse_page_range(entry)
# NOTE(review): REFERENCE_BOX is also initialised once before the outer loop;
# resetting it here would change which page supplies the reference box -
# confirm the intended scope.
REFERENCE_BOX = None
# Add pages to combined PDF
for i, p in enumerate(range(start_page, end_page)):
page = reader.pages[p]
# NOTE(review): an empty list is falsy, so a range that yields no pages would
# fall through to the section lookup in the else branch.
if page_indices:
# --- Explicit page range ---
# "page" is the 1-based position of the chunk's first page within the
# extracted content (TOC pages are not counted here).
toc_entries.append({
"title": f"Pages {entry[1:]}", # remove '@' for display
"page": current_page + 1,
"level": 0
})
for i, p in enumerate(page_indices):
# Skip indices that fall outside the source document.
if p < 0 or p >= total_pages:
print(
f"[WARN] Page {p+1} out of range in {pdf_info['file']}")
continue
page = reader.pages[p]
# Crop first page header+footer
# NOTE(review): both branches below make the same crop_page call, so the
# `i == 0` test currently has no effect.
if i == 0:
crop_page(page, top_ratio=HEADER_CROP,
bottom_ratio=FOOTER_CROP)
else:
crop_page(page, top_ratio=HEADER_CROP,
bottom_ratio=FOOTER_CROP)
# crop_page(page, bottom_ratio=FOOTER_CROP)
# NOTE(review): the reference-box capture and normalisation block appears twice
# in a row below.
if REFERENCE_BOX is None:
REFERENCE_BOX = (
page.cropbox.lower_left,
page.cropbox.upper_right
)
page.mediabox.lower_left = REFERENCE_BOX[0]
page.mediabox.upper_right = REFERENCE_BOX[1]
page.cropbox.lower_left = REFERENCE_BOX[0]
page.cropbox.upper_right = REFERENCE_BOX[1]
if REFERENCE_BOX is None:
# Make a copy, not a reference
REFERENCE_BOX = (
page.cropbox.lower_left,
page.cropbox.upper_right
)
# Step 3: Normalize page size
# Overwrites the crop box that crop_page just set, so every page ends up with
# the reference page's rectangle.
page.mediabox.lower_left = REFERENCE_BOX[0]
page.mediabox.upper_right = REFERENCE_BOX[1]
page.cropbox.lower_left = REFERENCE_BOX[0]
page.cropbox.upper_right = REFERENCE_BOX[1]
content_writer.add_page(page)
current_page += 1
else:
writer.add_page(page)
# Outline lookup: find_section_with_level returns (node, depth) or (None, None).
target, level = find_section_with_level(
outline_tree, entry)
if not target:
print(
f"[WARN] Section {entry} not found in {pdf_info['file']}")
continue
# Track TOC
toc_entries.append({
"title": f"{section_title} ({file_path})",
"page": current_page_index + 1 # 1-based page number
})
current_page_index += (end_page - start_page)
# The section covers pages [start_page, end_page) of the source document.
start_page = target["page"]
end_page = find_end_page(target, outline_tree, total_pages)
# --------- ADD TOC PAGE(S) ----------
toc_entries.append({
"title": target["title"], # EXACT heading text
"page": current_page + 1, # 1-based
"level": level
})
for i, p in enumerate(range(start_page, end_page)):
page = reader.pages[p]
# Only the first page of a section has its header cropped; later pages get
# the footer crop alone.
if i == 0:
crop_page(page, HEADER_CROP, FOOTER_CROP)
else:
crop_page(page, bottom_ratio=FOOTER_CROP)
# Capture reference AFTER cropping
if REFERENCE_BOX is None:
REFERENCE_BOX = (
page.cropbox.lower_left,
page.cropbox.upper_right
)
# Normalize page size
# NOTE(review): this replaces the crop box computed above for every page after
# the reference page - confirm that is intended.
page.mediabox.lower_left = REFERENCE_BOX[0]
page.mediabox.upper_right = REFERENCE_BOX[1]
page.cropbox.lower_left = REFERENCE_BOX[0]
page.cropbox.upper_right = REFERENCE_BOX[1]
content_writer.add_page(page)
current_page += 1
def create_toc_pdf(toc_entries, heading="Table of Contents"):
    """
    Render a table of contents and return it as a PdfReader.

    ``toc_entries`` is an iterable of dicts with "title" and "page" keys; one
    line is drawn per entry. ``heading`` is drawn in bold at the top of the
    first page and defaults to "Table of Contents", so the function accepts
    both the one-argument and the two-argument calls made in this script.
    """
    packet = BytesIO()
    c = canvas.Canvas(packet, pagesize=LETTER)
    c.setFont("Helvetica-Bold", 16)
    c.drawString(50, 750, heading)
    c.setFont("Helvetica", 12)
    y = 720
    for entry in toc_entries:
        # Break before drawing, so a new page is only started when there is
        # another entry to put on it.
        if y < 50:
            c.showPage()
            # showPage() discards the current font, so set it again for the
            # continuation page.
            c.setFont("Helvetica", 12)
            y = 750
        text = f"{entry['title']} .... {entry['page']}"
        c.drawString(50, y, text)
        y -= 20
    c.save()
    packet.seek(0)
    return PdfReader(packet)
# ---------- Build final PDF ---------------
# Assembles the output document: TOC pages first, then the extracted content,
# then outline bookmarks, and finally writes the result to OUTPUT_PDF.
# NOTE(review): two revisions of this section are interleaved (writer vs.
# content_writer, two create_toc_pdf calls, two write blocks) and indentation
# has been lost - confirm which lines are live.
toc_pdf = create_toc_pdf(toc_entries)
# Combine TOC + extracted sections
final_writer = PdfWriter()
# TOC first
# Derive TOC heading from first source document
# "Contents" is used when the first input's outline is non-empty, otherwise
# "Table of Contents".
first_reader = PdfReader(PDF_INPUTS[0]["file"])
toc_heading = "Contents" if first_reader.outline else "Table of Contents"
# Visible TOC pages
# NOTE(review): create_toc_pdf is called here with two arguments and above with
# one; the definition in effect must accept both.
toc_pdf = create_toc_pdf(toc_entries, toc_heading)
# Needed below to shift bookmark targets past the TOC pages.
toc_page_count = len(toc_pdf.pages)
for page in toc_pdf.pages:
final_writer.add_page(page)
# Then extracted content
for page in writer.pages:
# Content pages
for page in content_writer.pages:
final_writer.add_page(page)
# Save
# Maps outline level -> most recently added bookmark at that level, so an entry
# at level N is attached under the latest level N-1 bookmark. Level 0 looks up
# key -1, which is never stored, so it gets parent=None.
# NOTE(review): entries are never removed from the map, so an entry whose
# immediate parent level was skipped attaches to an older bookmark of that
# level if one exists - confirm this is acceptable.
bookmark_stack = {}
for entry in toc_entries:
parent = bookmark_stack.get(entry["level"] - 1)
# entry["page"] is 1-based within the extracted content; subtracting 1 and
# adding the TOC page count gives the zero-based index in the final document.
bm = final_writer.add_outline_item(
title=entry["title"], # exact heading text
page_number=(entry["page"] - 1) + toc_page_count,
parent=parent
)
bookmark_stack[entry["level"]] = bm
# ---------- Write output ------------------
with open(OUTPUT_PDF, "wb") as f:
final_writer.write(f)
# --------- WRITE OUTPUT -----------
# NOTE(review): the file is written a second time here with the same writer.
with open(OUTPUT_PDF, "wb") as f:
final_writer.write(f)
print(f"[INFO] Combined PDF written to {OUTPUT_PDF} with TOC.")
print(f"[OK] Created {OUTPUT_PDF}")