Update PDF creator
Add a table of contents (TOC) and harmonize page sizes
This commit is contained in:
@@ -1,28 +1,23 @@
|
|||||||
from pypdf import PdfReader, PdfWriter
|
from pypdf import PdfReader
|
||||||
|
from io import BytesIO
|
||||||
|
from reportlab.lib.pagesizes import LETTER
|
||||||
|
from reportlab.pdfgen import canvas
|
||||||
|
from pypdf import PdfReader, PdfWriter, PageObject
|
||||||
|
|
||||||
# ----------- CONFIG -------------
# Settings of the earlier single-section workflow (one input file, one
# outline title). Kept so existing references to these names still resolve.
INPUT_PDF = "pdfcreator/input.pdf"
TARGET_SECTION_TITLE = "1.3"

# Each entry names a source PDF and the outline titles of the sections to
# extract from it. Files and sections are processed in the order listed.
PDF_INPUTS = [
    {"file": "pdfcreator/input.pdf", "sections": ["1", "2.2", "3"]},
    {"file": "pdfcreator/input2.pdf", "sections": ["3", "4"]},
]

# Destination of the combined document.
OUTPUT_PDF = "pdfcreator/combined_sections.pdf"

# Cropping ratios, expressed as fractions of the page height.
HEADER_CROP = 0.1  # removed from the top of every extracted page
FOOTER_CROP = 0.0  # removed from the bottom of every extracted page
# --------------------------------

# ----- HELPER FUNCTIONS ---------
|
|
||||||
|
|
||||||
|
|
||||||
def build_outline_tree(reader):
|
def build_outline_tree(reader):
|
||||||
@@ -38,7 +33,6 @@ def build_outline_tree(reader):
|
|||||||
"children": []
|
"children": []
|
||||||
})
|
})
|
||||||
return tree
|
return tree
|
||||||
|
|
||||||
return _build(reader.outline)
|
return _build(reader.outline)
|
||||||
|
|
||||||
|
|
||||||
@@ -53,77 +47,157 @@ def find_section(nodes, title):
|
|||||||
|
|
||||||
|
|
||||||
def collect_subtree_pages(node, pages=None):
    """
    Collect all page numbers belonging to this section and its descendants.

    node: outline node, a dict with a "page" entry and a "children" list
    pages: list to append to; a fresh list is created when omitted

    Returns the list of page numbers in pre-order: the node's own page first,
    then each child subtree in order.
    """
    if pages is None:
        pages = []

    pages.append(node["page"])
    for child in node["children"]:
        collect_subtree_pages(child, pages)

    return pages
|
||||||
|
|
||||||
|
|
||||||
def flatten_outline_pages(nodes, pages=None):
    """
    Collect all outline entry page numbers in document order.

    nodes: list of outline nodes, each a dict with a "page" entry and a
           "children" list
    pages: list to append to; a fresh list is created when omitted

    Returns the list of page numbers; every node is followed by the pages of
    its descendants before the next sibling is visited.
    """
    if pages is None:
        pages = []

    for node in nodes:
        pages.append(node["page"])
        flatten_outline_pages(node["children"], pages)

    return pages
|
||||||
|
|
||||||
|
|
||||||
def find_end_page(target_node, outline_tree, total_pages):
    """
    End page = first outline page after the last descendant page.

    target_node: outline node of the section being extracted
    outline_tree: list of top-level outline nodes of the whole document
    total_pages: number of pages in the document

    Returns the smallest outline page number that is greater than every page
    referenced by the section and its descendants, or total_pages when no
    outline entry lies beyond the section. Callers use the result as the
    exclusive upper bound of a page range.
    """
    last_section_page = max(collect_subtree_pages(target_node))

    # The smallest later page is what the sorted scan would hit first, so a
    # single pass with min() replaces the sort, the set and the loop.
    later_pages = (
        page
        for page in flatten_outline_pages(outline_tree)
        if page > last_section_page
    )
    return min(later_pages, default=total_pages)
|
||||||
|
|
||||||
|
|
||||||
def extract_section():
|
def crop_page(page, top_ratio=0.0, bottom_ratio=0.0):
|
||||||
reader = PdfReader(INPUT_PDF)
|
llx, lly, urx, ury = page.mediabox
|
||||||
writer = PdfWriter()
|
height = ury - lly
|
||||||
|
new_lly = lly + height * bottom_ratio
|
||||||
|
new_ury = ury - height * top_ratio
|
||||||
|
if new_ury <= new_lly:
|
||||||
|
raise ValueError("Invalid crop ratios: page height would be negative")
|
||||||
|
page.cropbox.lower_left = (llx, new_lly)
|
||||||
|
page.cropbox.upper_right = (urx, new_ury)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_page_size(page, reference_box):
|
||||||
|
"""
|
||||||
|
Force page MediaBox and CropBox to match reference.
|
||||||
|
"""
|
||||||
|
page.mediabox.lower_left = reference_box.lower_left
|
||||||
|
page.mediabox.upper_right = reference_box.upper_right
|
||||||
|
|
||||||
|
page.cropbox.lower_left = reference_box.lower_left
|
||||||
|
page.cropbox.upper_right = reference_box.upper_right
|
||||||
|
|
||||||
|
# --------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# --------- MAIN PROCESS ----------
|
||||||
|
writer = PdfWriter()
|
||||||
|
toc_entries = [] # To build TOC later
|
||||||
|
current_page_index = 0
|
||||||
|
|
||||||
|
for pdf_info in PDF_INPUTS:
|
||||||
|
file_path = pdf_info["file"]
|
||||||
|
sections_to_extract = pdf_info["sections"]
|
||||||
|
|
||||||
|
reader = PdfReader(file_path)
|
||||||
outline_tree = build_outline_tree(reader)
|
outline_tree = build_outline_tree(reader)
|
||||||
total_pages = len(reader.pages)
|
total_pages = len(reader.pages)
|
||||||
|
|
||||||
target = find_section(outline_tree, TARGET_SECTION_TITLE)
|
for section_title in sections_to_extract:
|
||||||
|
target = find_section(outline_tree, section_title)
|
||||||
if not target:
|
if not target:
|
||||||
raise ValueError(f"Section '{TARGET_SECTION_TITLE}' not found")
|
print(f"[WARN] Section '{section_title}' not found in {file_path}")
|
||||||
|
continue
|
||||||
|
|
||||||
start_page = target["page"]
|
start_page = target["page"]
|
||||||
end_page = find_end_page(target, outline_tree, total_pages)
|
end_page = find_end_page(target, outline_tree, total_pages)
|
||||||
|
|
||||||
for p in range(start_page, end_page):
|
REFERENCE_BOX = None
|
||||||
|
# Add pages to combined PDF
|
||||||
|
for i, p in enumerate(range(start_page, end_page)):
|
||||||
page = reader.pages[p]
|
page = reader.pages[p]
|
||||||
crop_page(page)
|
|
||||||
|
# Crop first page header+footer
|
||||||
|
if i == 0:
|
||||||
|
crop_page(page, top_ratio=HEADER_CROP,
|
||||||
|
bottom_ratio=FOOTER_CROP)
|
||||||
|
else:
|
||||||
|
crop_page(page, top_ratio=HEADER_CROP,
|
||||||
|
bottom_ratio=FOOTER_CROP)
|
||||||
|
# crop_page(page, bottom_ratio=FOOTER_CROP)
|
||||||
|
|
||||||
|
if REFERENCE_BOX is None:
|
||||||
|
# Make a copy, not a reference
|
||||||
|
REFERENCE_BOX = (
|
||||||
|
page.cropbox.lower_left,
|
||||||
|
page.cropbox.upper_right
|
||||||
|
)
|
||||||
|
# Step 3: Normalize page size
|
||||||
|
page.mediabox.lower_left = REFERENCE_BOX[0]
|
||||||
|
page.mediabox.upper_right = REFERENCE_BOX[1]
|
||||||
|
page.cropbox.lower_left = REFERENCE_BOX[0]
|
||||||
|
page.cropbox.upper_right = REFERENCE_BOX[1]
|
||||||
|
|
||||||
writer.add_page(page)
|
writer.add_page(page)
|
||||||
|
|
||||||
with open(OUTPUT_PDF, "wb") as f:
|
# Track TOC
|
||||||
writer.write(f)
|
toc_entries.append({
|
||||||
|
"title": f"{section_title} ({file_path})",
|
||||||
|
"page": current_page_index + 1 # 1-based page number
|
||||||
|
})
|
||||||
|
current_page_index += (end_page - start_page)
|
||||||
|
|
||||||
print(
|
# --------- ADD TOC PAGE(S) ----------
|
||||||
f"Extracted '{target['title']}' "
|
|
||||||
f"(pages {start_page + 1}–{end_page})"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
def create_toc_pdf(toc_entries):
|
||||||
extract_section()
|
packet = BytesIO()
|
||||||
|
c = canvas.Canvas(packet, pagesize=LETTER)
|
||||||
|
c.setFont("Helvetica-Bold", 16)
|
||||||
|
c.drawString(50, 750, "Table of Contents")
|
||||||
|
c.setFont("Helvetica", 12)
|
||||||
|
y = 720
|
||||||
|
for entry in toc_entries:
|
||||||
|
text = f"{entry['title']} .... {entry['page']}"
|
||||||
|
c.drawString(50, y, text)
|
||||||
|
y -= 20
|
||||||
|
if y < 50:
|
||||||
|
c.showPage()
|
||||||
|
y = 750
|
||||||
|
c.save()
|
||||||
|
packet.seek(0)
|
||||||
|
return PdfReader(packet)
|
||||||
|
|
||||||
|
|
||||||
|
toc_pdf = create_toc_pdf(toc_entries)
|
||||||
|
|
||||||
|
# Combine TOC + extracted sections
|
||||||
|
final_writer = PdfWriter()
|
||||||
|
|
||||||
|
# TOC first
|
||||||
|
for page in toc_pdf.pages:
|
||||||
|
final_writer.add_page(page)
|
||||||
|
|
||||||
|
# Then extracted content
|
||||||
|
for page in writer.pages:
|
||||||
|
final_writer.add_page(page)
|
||||||
|
|
||||||
|
# Save
|
||||||
|
with open(OUTPUT_PDF, "wb") as f:
|
||||||
|
final_writer.write(f)
|
||||||
|
|
||||||
|
|
||||||
|
# --------- WRITE OUTPUT -----------
|
||||||
|
with open(OUTPUT_PDF, "wb") as f:
|
||||||
|
final_writer.write(f)
|
||||||
|
|
||||||
|
print(f"[INFO] Combined PDF written to {OUTPUT_PDF} with TOC.")
|
||||||
|
|||||||
Reference in New Issue
Block a user