improved pdfcreator
1- use CLI 2- refactor code
This commit is contained in:
@@ -1,23 +1,50 @@
|
||||
from pypdf import PdfReader
|
||||
from io import BytesIO
|
||||
from reportlab.lib.pagesizes import LETTER
|
||||
import re
|
||||
from pypdf import PdfReader, PdfWriter
|
||||
from reportlab.pdfgen import canvas
|
||||
from pypdf import PdfReader, PdfWriter, PageObject
|
||||
from reportlab.lib.pagesizes import LETTER
|
||||
from io import BytesIO
|
||||
|
||||
# ================= CONFIG =================
|
||||
|
||||
# ----------- CONFIG -------------
|
||||
PDF_INPUTS = [
|
||||
{"file": "pdfcreator/input.pdf", "sections": ["1", "2.2", "3"]},
|
||||
{"file": "pdfcreator/input2.pdf", "sections": ["3", "4"]},
|
||||
{"file": "pdfcreator/input.pdf", "sections": ["1.3", "2.1", "@111-114"]},
|
||||
{"file": "pdfcreator/input2.pdf", "sections": ["3.2"]},
|
||||
]
|
||||
OUTPUT_PDF = "pdfcreator/combined_sections.pdf"
|
||||
|
||||
OUTPUT_PDF = "pdfcreator/extracted_sections.pdf"
|
||||
|
||||
HEADER_CROP = 0.12 # top of first page of section
|
||||
FOOTER_CROP = 0.06 # bottom of all pages
|
||||
|
||||
# =========================================
|
||||
|
||||
|
||||
# Cropping ratios
|
||||
HEADER_CROP = 0.1 # top of first page of section
|
||||
FOOTER_CROP = 0.0 # bottom of pages
|
||||
# --------------------------------
|
||||
def strip_numbering(title):
    """
    Remove leading section numbering from a heading string.

    Accepts dotted numbers with or without a trailing dot and tolerates
    leading whitespace, so '1.3 Background', '2. Methods' and
    '  4.1.2. Results' all lose their numeric prefix.

    The number must be followed by whitespace; strings such as
    '3D Models' or 'Background' are returned unchanged.
    """
    # \.? covers the "1." / "1.3." style that a bare dotted-number
    # pattern would otherwise leave in place.
    return re.sub(r'^\s*\d+(?:\.\d+)*\.?\s+', '', title)
|
||||
|
||||
# ----- HELPER FUNCTIONS ---------
|
||||
|
||||
# ---------- Outline utilities ------------
|
||||
def parse_page_range(entry):
    """
    Convert an '@'-prefixed page specification into zero-based page indices.

    Supported forms (page numbers are 1-based and inclusive):
        "@1-10"  -> [0, 1, ..., 9]
        "@5"     -> [4]

    Returns None when the entry has no '@' prefix (the caller then treats
    it as a section prefix) and also when the specification is malformed
    or reversed; in the latter two cases a warning is printed.
    """
    if not entry.startswith("@"):
        return None  # not a page range

    spec = entry[1:].strip()  # drop the '@'
    first, dash, last = spec.partition("-")
    try:
        start = int(first)
        # No dash means a single page: start and end coincide.
        end = int(last) if dash else start
    except ValueError:
        print(f"[WARN] Invalid page range: {entry}")
        return None

    if end < start:
        # A reversed range would produce an empty list and be mistaken for
        # "not a range" by truthiness checks, so report it explicitly.
        print(f"[WARN] Invalid page range: {entry}")
        return None

    # start is shifted to zero-based; end stays as-is because range() is
    # exclusive at the top, which makes the 1-based end inclusive.
    return list(range(start - 1, end))
|
||||
|
||||
|
||||
def build_outline_tree(reader):
|
||||
@@ -36,14 +63,14 @@ def build_outline_tree(reader):
|
||||
return _build(reader.outline)
|
||||
|
||||
|
||||
def find_section(nodes, title):
    """
    Depth-first search for the first outline node whose title equals
    `title` or starts with `title` followed by a space.

    Each node is a dict with at least "title" and "children" keys.
    Returns the node, or None when nothing matches.
    """
    for node in nodes:
        if node["title"] == title or node["title"].startswith(title + " "):
            return node
        found = find_section(node["children"], title)
        if found:
            return found
    return None


def _title_matches_prefix(title, prefix):
    """Return True if `title` starts with `prefix` on a number boundary."""
    if not title.startswith(prefix):
        return False
    remainder = title[len(prefix):]
    # A prefix ending in a digit must not be continued by another digit,
    # otherwise "1.1" would match "1.10 ..." and "2" would match "20 ...".
    return not (prefix[-1:].isdigit() and remainder[:1].isdigit())


def find_section_with_level(nodes, prefix, level=0):
    """
    Depth-first search for the first outline node whose title starts with
    `prefix`, also reporting how deep in the tree it was found.

    Each node is a dict with at least "title" and "children" keys.
    `level` is the depth of `nodes` itself (0 for the top-level list).

    Returns a (node, level) tuple, or (None, None) when nothing matches.
    """
    for node in nodes:
        if _title_matches_prefix(node["title"], prefix):
            return node, level
        found = find_section_with_level(node["children"], prefix, level + 1)
        if found[0] is not None:
            return found
    return None, None
|
||||
|
||||
|
||||
def collect_subtree_pages(node, pages=None):
|
||||
@@ -66,138 +93,178 @@ def flatten_outline_pages(nodes, pages=None):
|
||||
|
||||
def find_end_page(target_node, outline_tree, total_pages):
    """
    Return the page index at which the section rooted at `target_node`
    stops; callers use it as the exclusive end of range(start, end).

    The result is the smallest outline page that lies strictly after the
    highest page in the target's subtree, or `total_pages` when no outline
    entry comes later.

    Relies on collect_subtree_pages() and flatten_outline_pages(), which
    are defined elsewhere in this file (presumably returning page indices;
    verify against their definitions).
    """
    section_last_page = max(collect_subtree_pages(target_node))

    later_pages = [
        page
        for page in flatten_outline_pages(outline_tree)
        if page > section_last_page
    ]
    if later_pages:
        return min(later_pages)
    return total_pages
|
||||
|
||||
|
||||
# ---------- Page manipulation ------------
|
||||
|
||||
def crop_page(page, top_ratio=0.0, bottom_ratio=0.0):
    """
    Set the page's crop box by trimming fractions of its media-box height.

    page:          object exposing `mediabox` (unpackable into
                   llx, lly, urx, ury) and a `cropbox` with assignable
                   `lower_left` / `upper_right` attributes.
    top_ratio:     fraction of the media-box height removed from the top.
    bottom_ratio:  fraction of the media-box height removed from the bottom.

    The crop box is always computed from the media box, not from any
    previous crop box. The page is modified in place; nothing is returned.

    Raises ValueError if a ratio is negative or if the two ratios together
    leave no visible height. The page is left untouched in that case.
    """
    # A negative ratio would push the crop box outside the media box.
    if top_ratio < 0 or bottom_ratio < 0:
        raise ValueError("Invalid crop ratios: ratios must not be negative")

    llx, lly, urx, ury = page.mediabox
    height = ury - lly

    new_lly = lly + height * bottom_ratio
    new_ury = ury - height * top_ratio
    if new_ury <= new_lly:
        raise ValueError("Invalid crop ratios: page height would be negative")

    page.cropbox.lower_left = (llx, new_lly)
    page.cropbox.upper_right = (urx, new_ury)
|
||||
|
||||
|
||||
def normalize_page_size(page, reference_box):
    """
    Force page MediaBox and CropBox to match reference.

    `reference_box` must expose `lower_left` and `upper_right`; both boxes
    of `page` are overwritten in place with those corners.
    """
    page.mediabox.lower_left = reference_box.lower_left
    page.mediabox.upper_right = reference_box.upper_right

    page.cropbox.lower_left = reference_box.lower_left
    page.cropbox.upper_right = reference_box.upper_right

# --------------------------------


# ---------- TOC generation ---------------
def create_toc_pdf(toc_entries, heading="Table of Contents"):
    """
    Render a flat table of contents into an in-memory PDF.

    toc_entries: iterable of dicts with "title" and "page" keys; leading
                 section numbers are removed from each title before
                 it is drawn.
    heading:     text drawn at the top of the first TOC page. Defaults to
                 "Table of Contents" so one-argument calls keep working.

    Returns a PdfReader over the generated LETTER-sized pages.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=LETTER)

    c.setFont("Helvetica-Bold", 16)
    c.drawString(50, 750, heading)

    c.setFont("Helvetica", 12)
    y = 720

    for entry in toc_entries:
        # Break the page BEFORE drawing. Breaking after the last entry and
        # then re-selecting the font would leave pending content on a fresh
        # page, which save() flushes as a trailing blank TOC page.
        if y < 50:
            c.showPage()
            c.setFont("Helvetica", 12)  # re-select the font on the new page
            y = 750

        line = f"{strip_numbering(entry['title'])} ........................ {entry['page']}"
        c.drawString(50, y, line)  # flat: no indentation
        y -= 18

    c.save()
    buffer.seek(0)
    return PdfReader(buffer)
|
||||
|
||||
|
||||
# ================= MAIN ===================
# Flat script: for every configured input PDF, pull out the requested
# sections / page ranges, crop and size-normalise their pages, then write
# a single PDF consisting of a generated TOC followed by the content.

content_writer = PdfWriter()
toc_entries = []
current_page = 0          # number of content pages collected so far
REFERENCE_BOX = None      # (lower_left, upper_right) of the first cropped page

for pdf_info in PDF_INPUTS:
    reader = PdfReader(pdf_info["file"])
    outline_tree = build_outline_tree(reader)
    total_pages = len(reader.pages)

    for entry in pdf_info["sections"]:
        page_indices = parse_page_range(entry)

        if page_indices:
            # --- Explicit page range ---
            valid_indices = []
            for p in page_indices:
                if p < 0 or p >= total_pages:
                    print(
                        f"[WARN] Page {p+1} out of range in {pdf_info['file']}")
                    continue
                valid_indices.append(p)

            # Without this guard a range that lies entirely outside the
            # document would still get a TOC entry pointing at whatever
            # content happens to come next.
            if not valid_indices:
                continue

            title = f"Pages {entry[1:]}"  # remove '@' for display
            level = 0
            # Every page of an explicit range gets the header crop.
            crop_plan = [(p, HEADER_CROP) for p in valid_indices]
        else:
            # --- Outline section ---
            target, level = find_section_with_level(outline_tree, entry)
            if not target:
                print(
                    f"[WARN] Section {entry} not found in {pdf_info['file']}")
                continue

            title = target["title"]  # EXACT heading text
            start_page = target["page"]
            end_page = find_end_page(target, outline_tree, total_pages)
            # Only the first page of a section loses its header area.
            crop_plan = [
                (p, HEADER_CROP if i == 0 else 0.0)
                for i, p in enumerate(range(start_page, end_page))
            ]

        toc_entries.append({
            "title": title,
            "page": current_page + 1,  # 1-based, relative to the content pages
            "level": level,
        })

        for p, top_ratio in crop_plan:
            page = reader.pages[p]
            crop_page(page, top_ratio=top_ratio, bottom_ratio=FOOTER_CROP)

            # Capture the reference AFTER cropping: the first page added
            # defines the size every later page is forced to.
            if REFERENCE_BOX is None:
                REFERENCE_BOX = (
                    page.cropbox.lower_left,
                    page.cropbox.upper_right,
                )

            # Normalize page size
            page.mediabox.lower_left = REFERENCE_BOX[0]
            page.mediabox.upper_right = REFERENCE_BOX[1]
            page.cropbox.lower_left = REFERENCE_BOX[0]
            page.cropbox.upper_right = REFERENCE_BOX[1]

            content_writer.add_page(page)
            current_page += 1


# ---------- Build final PDF ---------------

final_writer = PdfWriter()

# Derive TOC heading from first source document (guarded so an empty
# PDF_INPUTS list does not raise IndexError).
first_reader = PdfReader(PDF_INPUTS[0]["file"]) if PDF_INPUTS else None
if first_reader is not None and first_reader.outline:
    toc_heading = "Contents"
else:
    toc_heading = "Table of Contents"

# Visible TOC pages
toc_pdf = create_toc_pdf(toc_entries, toc_heading)
toc_page_count = len(toc_pdf.pages)

for page in toc_pdf.pages:
    final_writer.add_page(page)

# Content pages
for page in content_writer.pages:
    final_writer.add_page(page)

# Bookmarks: the most recent bookmark seen at each level acts as parent
# for the next entry one level deeper.
# NOTE(review): levels come from each source outline, so an entry can end
# up nested under an unrelated earlier entry (e.g. "3.2" from one file
# under a level-0 page range from another) - confirm this is intended.
bookmark_stack = {}

for entry in toc_entries:
    parent = bookmark_stack.get(entry["level"] - 1)

    bm = final_writer.add_outline_item(
        title=entry["title"],  # exact heading text
        # entry["page"] is 1-based within the content; shift past the TOC.
        page_number=(entry["page"] - 1) + toc_page_count,
        parent=parent,
    )

    bookmark_stack[entry["level"]] = bm


# ---------- Write output ------------------

with open(OUTPUT_PDF, "wb") as f:
    final_writer.write(f)

print(f"[OK] Created {OUTPUT_PDF}")
|
||||
|
||||
Reference in New Issue
Block a user