This commit is contained in:
local
2026-01-19 23:07:59 +00:00
parent a5934e45b2
commit 370e97d08d

View File

@@ -1,270 +0,0 @@
import re
from pypdf import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import LETTER
from io import BytesIO
# ================= CONFIG =================
# Each input names a source PDF ("file") and what to pull out of it
# ("sections"). A section entry is either an outline-title prefix such as
# "1.3", or an explicit 1-based, inclusive page range written "@first-last".
PDF_INPUTS = [
    {
        "file": "pdfcreator/input.pdf",
        "sections": ["1.3", "2.1", "@111-114"],
    },
    {
        "file": "pdfcreator/input2.pdf",
        "sections": ["3.2"],
    },
]
# Where the assembled document is written.
OUTPUT_PDF = "pdfcreator/extracted_sections.pdf"
# Crop amounts are fractions of the page height.
HEADER_CROP = 0.12  # cut from the top (first page of a section, every page of an "@" range)
FOOTER_CROP = 0.06  # cut from the bottom of every extracted page
# =========================================
def strip_numbering(title):
    """Return *title* without its leading section number.

    A leading run such as ``1``, ``1.3`` or ``2.10.4`` followed by whitespace
    is dropped, so ``'1.3 Background'`` becomes ``'Background'``. A title that
    does not start that way is returned unchanged.
    """
    numbered = re.match(r'^\d+(\.\d+)*\s+', title)
    if numbered is None:
        return title
    # Keep everything after the number and the whitespace that follows it.
    return title[numbered.end():]
# ---------- Outline utilities ------------
def parse_page_range(entry):
    """Convert an '@'-prefixed page specification into zero-based indices.

    Accepted forms are ``"@first-last"`` (1-based, inclusive, e.g. ``"@1-10"``)
    and the single-page shorthand ``"@page"``. The indices are not checked
    against any document; that is left to the caller.

    Args:
        entry: One section entry from the configuration.

    Returns:
        A non-empty list of zero-based page indices, or ``None`` when *entry*
        is not '@'-prefixed (the caller then treats it as a section prefix)
        or when the specification is malformed (a warning is printed first).
    """
    if not entry.startswith("@"):
        return None  # not a page range

    spec = entry[1:]  # drop the '@'
    first_text, dash, last_text = spec.partition("-")
    try:
        first = int(first_text)
        # No dash means a single page: "@5" is shorthand for "@5-5".
        last = int(last_text) if dash else first
    except ValueError:
        print(f"[WARN] Invalid page range: {entry}")
        return None

    if last < first:
        # A reversed range would otherwise produce an empty list with no
        # diagnostic at all.
        print(f"[WARN] Invalid page range: {entry}")
        return None

    # range() excludes its stop value, so the 1-based inclusive `last` is
    # already the correct stop for zero-based indices.
    return list(range(first - 1, last))
def build_outline_tree(reader):
    """Convert ``reader.outline`` into a tree of plain dictionaries.

    The outline is walked as a list in which a nested list holds the children
    of the item immediately before it.

    Args:
        reader: Object exposing ``outline`` and
            ``get_destination_page_number(item)``.

    Returns:
        A list of nodes shaped ``{"title": ..., "page": ..., "children": [...]}``
        where ``title`` is the item's stripped title and ``page`` is whatever
        ``get_destination_page_number`` returned for that item.
    """
    def _build(outline):
        tree = []
        for item in outline:
            if isinstance(item, list):
                children = _build(item)
                if tree:
                    # Extend rather than assign so a second nested list for
                    # the same parent does not discard the first.
                    tree[-1]["children"].extend(children)
                else:
                    # A nested list with no preceding item has no parent to
                    # attach to; keep its entries at this level instead of
                    # failing with IndexError on tree[-1].
                    tree.extend(children)
            else:
                tree.append({
                    "title": item.title.strip(),
                    "page": reader.get_destination_page_number(item),
                    "children": [],
                })
        return tree

    return _build(reader.outline)
def find_section_with_level(nodes, prefix, level=0):
    """Locate the first outline node whose title begins with *prefix*.

    The search is depth-first in document order: a node is tested before its
    children, and its children before its next sibling.

    Args:
        nodes: List of outline nodes (dicts with "title" and "children").
        prefix: Text the title must start with.
        level: Depth assigned to the entries of *nodes*.

    Returns:
        ``(node, depth)`` for the first match, where depth is *level* plus the
        number of levels descended; ``(None, None)`` when nothing matches.
    """
    # Explicit stack; items are pushed reversed so they pop in document order.
    pending = [(node, level) for node in reversed(nodes)]
    while pending:
        candidate, depth = pending.pop()
        if candidate["title"].startswith(prefix):
            return candidate, depth
        pending.extend(
            (child, depth + 1) for child in reversed(candidate["children"])
        )
    return None, None
def collect_subtree_pages(node, pages=None):
    """Gather the "page" value of *node* and of every node beneath it.

    Values are appended in pre-order (a node before its children). When
    *pages* is supplied the values are appended to that list and the same
    list is returned; otherwise a new list is created.
    """
    collected = [] if pages is None else pages
    stack = [node]
    while stack:
        current = stack.pop()
        collected.append(current["page"])
        # Push children reversed so the first child is visited next.
        stack.extend(reversed(current["children"]))
    return collected
def flatten_outline_pages(nodes, pages=None):
    """List the "page" value of every node in an outline forest.

    Values come out in pre-order (each node, then its children, then its next
    sibling). When *pages* is supplied the values are appended to that list
    and the same list is returned; otherwise a new list is created.
    """
    def _walk(level_nodes):
        for entry in level_nodes:
            yield entry["page"]
            yield from _walk(entry["children"])

    collected = [] if pages is None else pages
    collected.extend(_walk(nodes))
    return collected
def find_end_page(target_node, outline_tree, total_pages):
    """Return the exclusive end page index of the section at *target_node*.

    The section is taken to end where the next outline entry begins: the
    smallest outline page index that is strictly greater than every page
    index inside the target's subtree.

    Args:
        target_node: Outline node whose extent is wanted.
        outline_tree: The full outline forest the node belongs to.
        total_pages: Number of pages in the document; used as the fallback.

    Returns:
        The page index one past the section's last page, or *total_pages*
        when no later outline entry exists or when no page in the target's
        subtree is known.
    """
    # A node's page may be None (pypdf documents that result for a
    # destination it cannot resolve); None cannot be compared with ints, so
    # such entries are left out of both scans instead of raising TypeError.
    subtree_pages = [
        p for p in collect_subtree_pages(target_node) if p is not None
    ]
    if not subtree_pages:
        return total_pages
    last_page = max(subtree_pages)

    later_pages = (
        p for p in flatten_outline_pages(outline_tree)
        if p is not None and p > last_page
    )
    return min(later_pages, default=total_pages)
# ---------- Page manipulation ------------
def crop_page(page, top_ratio=0.0, bottom_ratio=0.0):
    """Set the page's crop box to its media box trimmed at top and bottom.

    Args:
        page: Object with a four-value ``mediabox`` (llx, lly, urx, ury) and a
            ``cropbox`` whose ``lower_left`` / ``upper_right`` can be assigned.
        top_ratio: Fraction of the media-box height removed from the top.
        bottom_ratio: Fraction of the media-box height removed from the bottom.
    """
    left, bottom, right, top = page.mediabox
    span = top - bottom  # full media-box height
    raised_bottom = bottom + span * bottom_ratio
    lowered_top = top - span * top_ratio
    # Horizontal extent is left as-is; only the vertical edges move.
    page.cropbox.lower_left = (left, raised_bottom)
    page.cropbox.upper_right = (right, lowered_top)
# ---------- TOC generation ---------------
def create_toc_pdf(toc_entries, heading):
    """Render a flat table of contents as an in-memory PDF.

    Args:
        toc_entries: Iterable of dicts; each needs a "title" (its leading
            section number is stripped for display) and a "page".
        heading: Text drawn in bold at the top of the first page.

    Returns:
        A ``PdfReader`` opened on the generated LETTER-size PDF.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=LETTER)
    c.setFont("Helvetica-Bold", 16)
    c.drawString(50, 750, heading)
    c.setFont("Helvetica", 12)
    y = 720
    for entry in toc_entries:
        # Break BEFORE drawing, and only when an entry actually needs the
        # room. Breaking after the draw started a fresh page even when no
        # entry followed, which could leave an empty trailing TOC page.
        if y < 50:
            c.showPage()
            c.setFont("Helvetica", 12)  # re-select the body font on the new page
            y = 750
        line = f"{strip_numbering(entry['title'])} ........................ {entry['page']}"
        c.drawString(50, y, line)  # flat: no indentation
        y -= 18
    c.save()
    buffer.seek(0)
    return PdfReader(buffer)
# ================= MAIN ===================
def _is_descendant(ancestor, node):
    """Return True when *node* lies strictly below *ancestor* in the outline tree."""
    return any(
        child is node or _is_descendant(child, node)
        for child in ancestor["children"]
    )


if not PDF_INPUTS:
    # Fail with a clear message instead of an IndexError further down.
    raise SystemExit("[ERROR] PDF_INPUTS is empty; nothing to extract")

content_writer = PdfWriter()
toc_entries = []
current_page = 0        # number of content pages emitted so far
REFERENCE_BOX = None    # (lower_left, upper_right) of the first emitted page
toc_heading = None      # decided from the first source document

for pdf_info in PDF_INPUTS:
    reader = PdfReader(pdf_info["file"])
    outline_tree = build_outline_tree(reader)
    total_pages = len(reader.pages)

    if toc_heading is None:
        # Derive the TOC heading from the first source document, using the
        # reader that is already open rather than parsing the file again.
        toc_heading = "Contents" if reader.outline else "Table of Contents"

    for entry in pdf_info["sections"]:
        # Each branch produces a title, an outline level, the outline node
        # (None for page ranges) and a plan of (page index, top crop ratio).
        page_indices = parse_page_range(entry)
        if page_indices:
            # --- Explicit page range ---
            title = f"Pages {entry[1:]}"  # remove '@' for display
            level = 0
            target = None
            page_plan = []
            for p in page_indices:
                if p < 0 or p >= total_pages:
                    print(
                        f"[WARN] Page {p+1} out of range in {pdf_info['file']}")
                    continue
                page_plan.append((p, HEADER_CROP))
            if not page_plan:
                # Nothing usable: do not emit a TOC entry / bookmark that
                # would point at pages belonging to something else.
                continue
        else:
            # --- Outline section looked up by title prefix ---
            target, level = find_section_with_level(outline_tree, entry)
            if not target:
                print(
                    f"[WARN] Section {entry} not found in {pdf_info['file']}")
                continue
            start_page = target["page"]
            if start_page is None:
                # Without a start page there is no range to extract.
                print(
                    f"[WARN] Section {entry} has no resolvable page in {pdf_info['file']}")
                continue
            end_page = find_end_page(target, outline_tree, total_pages)
            title = target["title"]  # EXACT heading text
            # Only the first page of a section gets the header crop.
            page_plan = [
                (p, HEADER_CROP if i == 0 else 0.0)
                for i, p in enumerate(range(start_page, end_page))
            ]

        toc_entries.append({
            "title": title,
            "page": current_page + 1,  # 1-based within the content pages
            "level": level,
            "node": target,            # used below to nest bookmarks correctly
        })

        for p, top_ratio in page_plan:
            page = reader.pages[p]
            crop_page(page, top_ratio=top_ratio, bottom_ratio=FOOTER_CROP)
            # Capture reference AFTER cropping
            if REFERENCE_BOX is None:
                REFERENCE_BOX = (
                    page.cropbox.lower_left,
                    page.cropbox.upper_right
                )
            # Normalize page size.
            # NOTE(review): every page ends up with the first emitted page's
            # box, so the per-page crop above effectively only decides
            # REFERENCE_BOX. Confirm this is intended, in particular for
            # sources whose page sizes differ.
            page.mediabox.lower_left = REFERENCE_BOX[0]
            page.mediabox.upper_right = REFERENCE_BOX[1]
            page.cropbox.lower_left = REFERENCE_BOX[0]
            page.cropbox.upper_right = REFERENCE_BOX[1]
            content_writer.add_page(page)
            current_page += 1

# ---------- Build final PDF ---------------
final_writer = PdfWriter()

# Visible TOC pages
toc_pdf = create_toc_pdf(toc_entries, toc_heading)
toc_page_count = len(toc_pdf.pages)
for page in toc_pdf.pages:
    final_writer.add_page(page)

# Content pages
for page in content_writer.pages:
    final_writer.add_page(page)

# Bookmarks. An entry is nested only under a previously emitted bookmark whose
# outline node is a real ancestor of it. Matching on level numbers alone can
# attach a section to an unrelated bookmark (a page range, a different
# chapter, or a section from another file).
open_bookmarks = []  # stack of (outline node, bookmark) along the current chain
for entry in toc_entries:
    node = entry["node"]
    while open_bookmarks and (
        node is None or not _is_descendant(open_bookmarks[-1][0], node)
    ):
        open_bookmarks.pop()
    parent = open_bookmarks[-1][1] if open_bookmarks else None
    bm = final_writer.add_outline_item(
        title=entry["title"],  # exact heading text
        # Content page numbers are shifted by the TOC pages placed in front.
        page_number=(entry["page"] - 1) + toc_page_count,
        parent=parent
    )
    if node is not None:
        open_bookmarks.append((node, bm))

# ---------- Write output ------------------
with open(OUTPUT_PDF, "wb") as f:
    final_writer.write(f)
print(f"[OK] Created {OUTPUT_PDF}")