diff --git a/python/.vscode/launch.json b/python/.vscode/launch.json index 7774467..dac6996 100644 --- a/python/.vscode/launch.json +++ b/python/.vscode/launch.json @@ -4,12 +4,17 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ - { - "name": "Python Debugger: Current File", + "name": "Debug PDF Aggregator", "type": "debugpy", "request": "launch", "program": "${file}", + "args": [ + "-o", + "${workspaceFolder}/pdfcreator/out.pdf", + "-i", + "${workspaceFolder}/pdfcreator/input.pdf:5,@50-60", + ], "console": "integratedTerminal" } ] diff --git a/python/pdfcreator/README.md b/python/pdfcreator/README.md new file mode 100644 index 0000000..bab4f6e --- /dev/null +++ b/python/pdfcreator/README.md @@ -0,0 +1,161 @@ +# PDF Aggregator & Section Extractor + +## Purpose + +This Python tool allows you to **extract sections or page ranges from one or more PDFs** and aggregate them into a **single PDF** with: + +- Table of Contents (TOC) reflecting section headings +- Optional bookmarks corresponding to TOC entries +- Cropped pages to remove headers and footers for a clean layout +- Flat TOC structure with unnumbered section titles + +It is designed for **research papers, reports, or multi-chapter PDFs** where you need to assemble a subset of content into a new document, while preserving readability and navigation. + +--- + +## Key Features + +- **Section-based extraction:** Extract sections (e.g., `1.3`, `5`) and all subsections automatically. +- **Page-range extraction:** Extract explicit page ranges using a special syntax (e.g., `@10-20`). +- **Header/Footer cropping:** Automatically removes unwanted headers and footers while maintaining consistent page sizes across multiple PDFs. +- **Flat Table of Contents:** TOC shows only the section titles from the original PDFs; numbering and indentation are stripped. +- **Bookmarks:** Optional PDF bookmarks that match the TOC entries for easy navigation. 
+- **Multi-source aggregation:** Accepts multiple PDFs and merges the specified sections or page ranges into a single output file. +- **Command-line driven:** Fully configurable via CLI arguments and compatible with debugging in VS Code. + +--- + +## Design Choices + +1. **Separation of concerns** + - `pdfaggregator.py`: All core PDF handling functions (section extraction, cropping, TOC creation). + - `driver.py`: CLI entry point that parses arguments, validates inputs, and orchestrates aggregation. + - `tests.py`: Unit tests for parsing, extraction, TOC, and page handling. + +2. **Level-aware section matching** + - Section references (e.g., `5`) are matched as title prefixes in depth-first outline order, so chapter `5` is found before its subsections such as `5.1`, and the matched entry's outline depth is recorded as its level. + - Page ranges (prefixed with `@`) are treated explicitly and bypass section matching. + +3. **Argument parsing** + - Uses Python’s standard `argparse` for CLI argument handling. + - Input format supports multiple PDFs and multiple sections per file, e.g.: + + ``` + -i input.pdf:5,@50-60 input2.pdf:3.2 + ``` + +4. **Debug-friendly and testable** + - CLI parsing separated from `main()` for easier unit testing. + - Compatible with VS Code debugging (`launch.json` with `${file}` for portable paths). + +5. **TOC and bookmark handling** + - TOC entries reflect **section headings only**; numbering stripped. + - Bookmarks are linked to corresponding pages in the final PDF. + - Cropping and page normalization ensure consistent layout. + +6. **Unit tests** + - Core functions, including `parse_inputs`, page extraction, cropping, and TOC creation, are covered. + - Uses `unittest` and avoids direct CLI dependency for testability. + +--- + +## How it Works (High-Level Flow) + +1. **Parse CLI inputs** + - Extract PDF file paths and associated sections or page ranges. + - Normalize section references (e.g., `@10-20`). + +2. **Iterate through PDFs** + - Load PDF using `pypdf`. + - Build outline tree (hierarchical sections) if available. + +3. 
**Process sections/page ranges** + - For each requested section: + - Find matching section heading in outline (level-aware). + - Determine start and end pages, including all subsections. + - For page ranges prefixed with `@`, extract exact pages. + +4. **Crop pages** + - Remove headers and footers based on configurable ratios. + - Normalize page size across all PDFs for visual consistency. + +5. **Assemble final PDF** + - Add a Table of Contents as the first pages. + - Merge all extracted pages. + - Add bookmarks corresponding to TOC entries. + +6. **Write output** + - Save aggregated PDF to specified output path. + - TOC shows **titles only**, flat structure, and correct page references. + +--- + + +---------------------+ + | CLI / Inputs | + | - PDF files | + | - Sections/@pages | + +----------+----------+ + | + v + +---------------------+ + | parse_inputs() | + | Normalize sections | + | Detect page ranges | + +----------+----------+ + | + v + +---------------------+ + | Iterate PDFs | + | Load PDF with | + | pypdf | + +----------+----------+ + | + +-------------+--------------+ + | | + v v + +-------------------+ +------------------+ + | Section Extraction| | Page Range (@) | + | - Match outline | | - Extract pages | + | - Level-aware | | directly | + | - Subsections | +------------------+ + +--------+----------+ + | + v + +----------------------+ + | Crop Pages | + | - Remove headers | + | - Remove footers | + | - Normalize size | + +----------+-----------+ + | + v + +----------------------+ + | TOC & Bookmarks | + | - Extracted section | + | titles only | + | - Flat TOC structure | + | - Bookmarks linked | + +----------+-----------+ + | + v + +----------------------+ + | Merge PDF Pages | + | - TOC pages first | + | - All extracted pages| + +----------+-----------+ + | + v + +----------------------+ + | Output PDF | + | - TOC & bookmarks | + | - Cropped & normalized| + +----------------------+ + + +--- +## Example Usage + +```bash +python driver.py \ + -o 
aggregated.pdf \ + -i input.pdf:5,@50-60 input2.pdf:3.2 diff --git a/python/pdfcreator/driver.py b/python/pdfcreator/driver.py new file mode 100644 index 0000000..14376e9 --- /dev/null +++ b/python/pdfcreator/driver.py @@ -0,0 +1,99 @@ +# driver.py +from pypdf import PageObject +from pdfaggregator import * +from pypdf import PdfReader, PdfWriter +import argparse + + +PDF_INPUTS = [ + {"file": "pdfcreator/input.pdf", "sections": ["5", "@20-30"]}, + # {"file": "pdfcreator/input2.pdf", + # "sections": ["3.1"]} +] + +OUTPUT_PDF = "pdfcreator/extracted_sections.pdf" + +# crop ratios +HEADER_CROP = 0.1 +FOOTER_CROP = 0.03 + + +def main(pdf_inputs, output_pdf): + + content_writer = PdfWriter() + toc_entries = [] + current_page = 0 + + for pdf_info in pdf_inputs: # or change + REFERENCE_BOX = None + reader = PdfReader(pdf_info["file"]) + outline_tree = build_outline_tree(reader) + total_pages = len(reader.pages) + + for entry in pdf_info["sections"]: + page_indices = parse_page_range(entry) + + if page_indices: + # Explicit page range + current_page, REFERENCE_BOX, toc_entry = extract_page_range( + entry, reader, content_writer, current_page, REFERENCE_BOX, + HEADER_CROP, FOOTER_CROP + ) + else: + # Section prefix + current_page, REFERENCE_BOX, toc_entry = extract_section_prefix( + entry, reader, content_writer, current_page, REFERENCE_BOX, + outline_tree, HEADER_CROP, FOOTER_CROP + ) + + if toc_entry: + toc_entries.append(toc_entry) + + # Create TOC PDF + toc_pdf = create_toc_pdf(toc_entries) + toc_page_count = len(toc_pdf.pages) + + final_writer = PdfWriter() + # add TOC pages + for page in toc_pdf.pages: + final_writer.add_page(page) + # add content pages + for page in content_writer.pages: + final_writer.add_page(page) + + # Add bookmarks + bookmark_stack = {} + for entry in toc_entries: + parent = bookmark_stack.get(entry["level"] - 1) + bm = final_writer.add_outline_item( + title=entry["title"], + page_number=(entry["page"] - 1) + toc_page_count, + parent=parent + ) 
+ bookmark_stack[entry["level"]] = bm + + with open(output_pdf, "wb") as f: + final_writer.write(f) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Extract sections and page ranges from PDFs and aggregate into a new PDF with TOC and bookmarks." + ) + + parser.add_argument( + "-o", "--output", + required=True, + help="Output PDF file" + ) + + parser.add_argument( + "-i", "--inputs", + nargs="+", + help="Inputs in the form file.pdf:section1,section2 or file.pdf:@1-10" + ) + + args = parser.parse_args() + pdf_inputs = parse_inputs(args) + + main(pdf_inputs, args.output) diff --git a/python/pdfcreator/extracted_section.pdf b/python/pdfcreator/extracted_section.pdf deleted file mode 100644 index ee3736a..0000000 Binary files a/python/pdfcreator/extracted_section.pdf and /dev/null differ diff --git a/python/pdfcreator/main.py b/python/pdfcreator/main.py index d4448cd..3448bb2 100644 --- a/python/pdfcreator/main.py +++ b/python/pdfcreator/main.py @@ -1,23 +1,50 @@ -from pypdf import PdfReader -from io import BytesIO -from reportlab.lib.pagesizes import LETTER +import re +from pypdf import PdfReader, PdfWriter from reportlab.pdfgen import canvas -from pypdf import PdfReader, PdfWriter, PageObject +from reportlab.lib.pagesizes import LETTER +from io import BytesIO + +# ================= CONFIG ================= -# ----------- CONFIG ------------- PDF_INPUTS = [ - {"file": "pdfcreator/input.pdf", "sections": ["1", "2.2", "3"]}, - {"file": "pdfcreator/input2.pdf", "sections": ["3", "4"]}, + {"file": "pdfcreator/input.pdf", "sections": ["1.3", "2.1", "@111-114"]}, + {"file": "pdfcreator/input2.pdf", "sections": ["3.2"]}, ] -OUTPUT_PDF = "pdfcreator/combined_sections.pdf" + +OUTPUT_PDF = "pdfcreator/extracted_sections.pdf" + +HEADER_CROP = 0.12 # top of first page of section +FOOTER_CROP = 0.06 # bottom of all pages + +# ========================================= -# Cropping ratios -HEADER_CROP = 0.1 # top of first page of section -FOOTER_CROP 
= 0.0 # bottom of pages -# -------------------------------- +def strip_numbering(title): + """ + Remove leading numbering from a string like '1.3 Background' + Returns 'Background'. + """ + return re.sub(r'^\d+(\.\d+)*\s+', '', title) -# ----- HELPER FUNCTIONS --------- + +# ---------- Outline utilities ------------ +def parse_page_range(entry): + """ + Returns a list of zero-based page indices if entry is a page range. + Page ranges must be prefixed with '@', e.g., "@1-10". + Otherwise returns None (treated as section prefix). + """ + if entry.startswith("@"): + s = entry[1:] # remove the @ + try: + start, end = s.split("-") + start = int(start) - 1 # zero-based + end = int(end) # inclusive in range + return list(range(start, end)) + except ValueError: + print(f"[WARN] Invalid page range: {entry}") + return None + return None # not a page range def build_outline_tree(reader): @@ -36,14 +63,14 @@ def build_outline_tree(reader): return _build(reader.outline) -def find_section(nodes, title): +def find_section_with_level(nodes, prefix, level=0): for node in nodes: - if node["title"] == title or node["title"].startswith(title + " "): - return node - found = find_section(node["children"], title) - if found: + if node["title"].startswith(prefix): + return node, level + found = find_section_with_level(node["children"], prefix, level + 1) + if found[0]: return found - return None + return None, None def collect_subtree_pages(node, pages=None): @@ -66,138 +93,178 @@ def flatten_outline_pages(nodes, pages=None): def find_end_page(target_node, outline_tree, total_pages): subtree_pages = collect_subtree_pages(target_node) - last_section_page = max(subtree_pages) - all_outline_pages = flatten_outline_pages(outline_tree) - all_outline_pages = sorted(set(all_outline_pages)) - for page in all_outline_pages: - if page > last_section_page: - return page + last_page = max(subtree_pages) + + all_pages = sorted(set(flatten_outline_pages(outline_tree))) + for p in all_pages: + if p > 
last_page: + return p return total_pages +# ---------- Page manipulation ------------ + def crop_page(page, top_ratio=0.0, bottom_ratio=0.0): llx, lly, urx, ury = page.mediabox height = ury - lly + new_lly = lly + height * bottom_ratio new_ury = ury - height * top_ratio - if new_ury <= new_lly: - raise ValueError("Invalid crop ratios: page height would be negative") + page.cropbox.lower_left = (llx, new_lly) page.cropbox.upper_right = (urx, new_ury) -def normalize_page_size(page, reference_box): - """ - Force page MediaBox and CropBox to match reference. - """ - page.mediabox.lower_left = reference_box.lower_left - page.mediabox.upper_right = reference_box.upper_right +# ---------- TOC generation --------------- +def create_toc_pdf(toc_entries, heading): + buffer = BytesIO() + c = canvas.Canvas(buffer, pagesize=LETTER) - page.cropbox.lower_left = reference_box.lower_left - page.cropbox.upper_right = reference_box.upper_right + c.setFont("Helvetica-Bold", 16) + c.drawString(50, 750, heading) -# -------------------------------- + c.setFont("Helvetica", 12) + y = 720 + + for entry in toc_entries: + line = f"{strip_numbering(entry['title'])} ........................ 
{entry['page']}" + c.drawString(50, y, line) # flat: no indentation + y -= 18 + + if y < 50: + c.showPage() + c.setFont("Helvetica", 12) + y = 750 + + c.save() + buffer.seek(0) + return PdfReader(buffer) -# --------- MAIN PROCESS ---------- -writer = PdfWriter() -toc_entries = [] # To build TOC later -current_page_index = 0 +# ================= MAIN =================== + +content_writer = PdfWriter() +toc_entries = [] +current_page = 0 +REFERENCE_BOX = None for pdf_info in PDF_INPUTS: - file_path = pdf_info["file"] - sections_to_extract = pdf_info["sections"] - - reader = PdfReader(file_path) + reader = PdfReader(pdf_info["file"]) outline_tree = build_outline_tree(reader) total_pages = len(reader.pages) - for section_title in sections_to_extract: - target = find_section(outline_tree, section_title) - if not target: - print(f"[WARN] Section '{section_title}' not found in {file_path}") - continue + for entry in pdf_info["sections"]: - start_page = target["page"] - end_page = find_end_page(target, outline_tree, total_pages) + page_indices = parse_page_range(entry) - REFERENCE_BOX = None - # Add pages to combined PDF - for i, p in enumerate(range(start_page, end_page)): - page = reader.pages[p] + if page_indices: + # --- Explicit page range --- + toc_entries.append({ + "title": f"Pages {entry[1:]}", # remove '@' for display + "page": current_page + 1, + "level": 0 + }) + + for i, p in enumerate(page_indices): + if p < 0 or p >= total_pages: + print( + f"[WARN] Page {p+1} out of range in {pdf_info['file']}") + continue + page = reader.pages[p] - # Crop first page header+footer - if i == 0: crop_page(page, top_ratio=HEADER_CROP, bottom_ratio=FOOTER_CROP) - else: - crop_page(page, top_ratio=HEADER_CROP, - bottom_ratio=FOOTER_CROP) - # crop_page(page, bottom_ratio=FOOTER_CROP) + if REFERENCE_BOX is None: + REFERENCE_BOX = ( + page.cropbox.lower_left, + page.cropbox.upper_right + ) + page.mediabox.lower_left = REFERENCE_BOX[0] + page.mediabox.upper_right = REFERENCE_BOX[1] 
+ page.cropbox.lower_left = REFERENCE_BOX[0] + page.cropbox.upper_right = REFERENCE_BOX[1] - if REFERENCE_BOX is None: - # Make a copy, not a reference - REFERENCE_BOX = ( - page.cropbox.lower_left, - page.cropbox.upper_right - ) - # Step 3: Normalize page size - page.mediabox.lower_left = REFERENCE_BOX[0] - page.mediabox.upper_right = REFERENCE_BOX[1] - page.cropbox.lower_left = REFERENCE_BOX[0] - page.cropbox.upper_right = REFERENCE_BOX[1] + content_writer.add_page(page) + current_page += 1 + else: - writer.add_page(page) + target, level = find_section_with_level( + outline_tree, entry) + if not target: + print( + f"[WARN] Section {entry} not found in {pdf_info['file']}") + continue - # Track TOC - toc_entries.append({ - "title": f"{section_title} ({file_path})", - "page": current_page_index + 1 # 1-based page number - }) - current_page_index += (end_page - start_page) + start_page = target["page"] + end_page = find_end_page(target, outline_tree, total_pages) -# --------- ADD TOC PAGE(S) ---------- + toc_entries.append({ + "title": target["title"], # EXACT heading text + "page": current_page + 1, # 1-based + "level": level + }) + + for i, p in enumerate(range(start_page, end_page)): + page = reader.pages[p] + + if i == 0: + crop_page(page, HEADER_CROP, FOOTER_CROP) + else: + crop_page(page, bottom_ratio=FOOTER_CROP) + + # Capture reference AFTER cropping + if REFERENCE_BOX is None: + REFERENCE_BOX = ( + page.cropbox.lower_left, + page.cropbox.upper_right + ) + + # Normalize page size + page.mediabox.lower_left = REFERENCE_BOX[0] + page.mediabox.upper_right = REFERENCE_BOX[1] + page.cropbox.lower_left = REFERENCE_BOX[0] + page.cropbox.upper_right = REFERENCE_BOX[1] + + content_writer.add_page(page) + current_page += 1 -def create_toc_pdf(toc_entries): - packet = BytesIO() - c = canvas.Canvas(packet, pagesize=LETTER) - c.setFont("Helvetica-Bold", 16) - c.drawString(50, 750, "Table of Contents") - c.setFont("Helvetica", 12) - y = 720 - for entry in toc_entries: - 
text = f"{entry['title']} .... {entry['page']}" - c.drawString(50, y, text) - y -= 20 - if y < 50: - c.showPage() - y = 750 - c.save() - packet.seek(0) - return PdfReader(packet) +# ---------- Build final PDF --------------- - -toc_pdf = create_toc_pdf(toc_entries) - -# Combine TOC + extracted sections final_writer = PdfWriter() -# TOC first +# Derive TOC heading from first source document +first_reader = PdfReader(PDF_INPUTS[0]["file"]) +toc_heading = "Contents" if first_reader.outline else "Table of Contents" + +# Visible TOC pages +toc_pdf = create_toc_pdf(toc_entries, toc_heading) +toc_page_count = len(toc_pdf.pages) + for page in toc_pdf.pages: final_writer.add_page(page) -# Then extracted content -for page in writer.pages: +# Content pages +for page in content_writer.pages: final_writer.add_page(page) -# Save +bookmark_stack = {} + +for entry in toc_entries: + parent = bookmark_stack.get(entry["level"] - 1) + + bm = final_writer.add_outline_item( + title=entry["title"], # exact heading text + page_number=(entry["page"] - 1) + toc_page_count, + parent=parent + ) + + bookmark_stack[entry["level"]] = bm + + +# ---------- Write output ------------------ + with open(OUTPUT_PDF, "wb") as f: final_writer.write(f) - -# --------- WRITE OUTPUT ----------- -with open(OUTPUT_PDF, "wb") as f: - final_writer.write(f) - -print(f"[INFO] Combined PDF written to {OUTPUT_PDF} with TOC.") +print(f"[OK] Created {OUTPUT_PDF}") diff --git a/python/pdfcreator/pdfaggregator.py b/python/pdfcreator/pdfaggregator.py new file mode 100644 index 0000000..9039188 --- /dev/null +++ b/python/pdfcreator/pdfaggregator.py @@ -0,0 +1,231 @@ +# pdfaggregator.py +import re +from io import BytesIO +from pypdf import PdfReader, PdfWriter, PageObject +from reportlab.pdfgen import canvas +from reportlab.lib.pagesizes import LETTER + +# ----------------------------- +# Parsing / Section Utilities +# ----------------------------- + + +def parse_page_range(entry): + """Return list of zero-based page 
indices if entry is a page range (@1-10).""" + if entry.startswith("@"): + s = entry[1:] + try: + start, end = s.split("-") + start = int(start) - 1 + end = int(end) + return list(range(start, end)) + except ValueError: + print(f"[WARN] Invalid page range: {entry}") + return None + return None + + +def strip_numbering(title): + """Remove leading numbering like '1.3 Background' -> 'Background'""" + return re.sub(r'^\d+(\.\d+)*\s+', '', title) + + +def crop_page(page, top_ratio=0.0, bottom_ratio=0.0): + """Crop the top/bottom of a page using ratios.""" + llx, lly, urx, ury = page.mediabox + height = ury - lly + new_lly = lly + height * bottom_ratio + new_ury = ury - height * top_ratio + page.cropbox.lower_left = (llx, new_lly) + page.cropbox.upper_right = (urx, new_ury) + +# ----------------------------- +# Outline / Section Tree +# ----------------------------- + + +def find_section_with_level(nodes, prefix, level=0): + """Find a section node by prefix in outline tree.""" + for node in nodes: + if node["title"].startswith(prefix): + return node, level + found = find_section_with_level( + node.get("children", []), prefix, level + 1) + if found[0]: + return found + return None, None + + +def collect_subtree_pages(node, pages=None): + """Recursively collect pages of node and all its children.""" + if pages is None: + pages = [] + pages.append(node["page"]) + for child in node.get("children", []): + collect_subtree_pages(child, pages) + return pages + + +def flatten_outline_pages(nodes, pages=None): + """Flatten all pages from the outline tree.""" + if pages is None: + pages = [] + for node in nodes: + pages.append(node["page"]) + flatten_outline_pages(node.get("children", []), pages) + return pages + + +def find_end_page(target_node, outline_tree, total_pages): + """Find the last page of a section including its subsections.""" + subtree_pages = collect_subtree_pages(target_node) + last_page = max(subtree_pages) + all_pages = 
sorted(set(flatten_outline_pages(outline_tree))) + for p in all_pages: + if p > last_page: + return p + return total_pages + + +def build_outline_tree(reader): + """ + Build a normalized outline tree from pypdf's reader.outline. + + Each node: + { + "title": str, + "page": int, + "children": [ ... ] + } + """ + def walk(items): + tree = [] + for item in items: + if isinstance(item, list): + # children of previous item + if tree: + tree[-1]["children"] = walk(item) + else: + tree.append({ + "title": item.title.strip(), + "page": reader.get_destination_page_number(item), + "children": [] + }) + return tree + + try: + outline = reader.outline + except Exception: + return [] + + if not outline: + return [] + + return walk(outline) + + +# ----------------------------- +# TOC Generation +# ----------------------------- + + +def create_toc_pdf(toc_entries, heading="Table of Contents"): + """Generate a flat, unnumbered TOC PDF page in memory.""" + buffer = BytesIO() + c = canvas.Canvas(buffer, pagesize=LETTER) + + c.setFont("Helvetica-Bold", 16) + c.drawString(50, 750, heading) + + c.setFont("Helvetica", 12) + y = 720 + + for entry in toc_entries: + line = f"{strip_numbering(entry['title'])} ........................ 
{entry['page']}" + c.drawString(50, y, line) + y -= 18 + if y < 50: + c.showPage() + c.setFont("Helvetica", 12) + y = 750 + + c.save() + buffer.seek(0) + return PdfReader(buffer) + + +def extract_page_range(entry, reader, content_writer, current_page, REFERENCE_BOX, header_crop=0.05, footer_crop=0.03): + """Extract pages from an explicit @page-range entry.""" + page_indices = parse_page_range(entry) + if not page_indices: + return current_page, REFERENCE_BOX, None # nothing extracted + + toc_entry = {"title": f"Pages {entry[1:]}", + "page": current_page + 1, "level": 0} + + for i, p in enumerate(page_indices): + if p < 0 or p >= len(reader.pages): + continue + page = reader.pages[p] + crop_page(page, header_crop, footer_crop) + if REFERENCE_BOX is None: + REFERENCE_BOX = (page.cropbox.lower_left, page.cropbox.upper_right) + page.mediabox.lower_left = REFERENCE_BOX[0] + page.mediabox.upper_right = REFERENCE_BOX[1] + page.cropbox.lower_left = REFERENCE_BOX[0] + page.cropbox.upper_right = REFERENCE_BOX[1] + content_writer.add_page(page) + current_page += 1 + + return current_page, REFERENCE_BOX, toc_entry + + +def extract_section_prefix(entry, reader, content_writer, current_page, REFERENCE_BOX, outline_tree, header_crop=0.05, footer_crop=0.03): + """Extract pages from a section prefix entry in the PDF outline.""" + target, level = find_section_with_level(outline_tree, entry) + if not target: + print( + f"[WARN] Section {entry} not found in PDF {reader.stream.name if hasattr(reader.stream, 'name') else ''}") + return current_page, REFERENCE_BOX, None + + start_page = target["page"] + end_page = find_end_page(target, outline_tree, len(reader.pages)) + toc_entry = {"title": target["title"], + "page": current_page + 1, "level": level} + + for i, p in enumerate(range(start_page, end_page)): + page = reader.pages[p] + crop_page(page, header_crop if i == 0 else 0, footer_crop) + if REFERENCE_BOX is None: + REFERENCE_BOX = (page.cropbox.lower_left, page.cropbox.upper_right) 
+ page.mediabox.lower_left = REFERENCE_BOX[0] + page.mediabox.upper_right = REFERENCE_BOX[1] + page.cropbox.lower_left = REFERENCE_BOX[0] + page.cropbox.upper_right = REFERENCE_BOX[1] + content_writer.add_page(page) + current_page += 1 + + return current_page, REFERENCE_BOX, toc_entry + + +def parse_inputs(args): + """ + Parse CLI positional arguments into PDF_INPUTS structure. + """ + pdf_inputs = [] + + for item in args.inputs: + if ":" not in item: + raise ValueError( + f"Invalid input '{item}'. Expected format: file.pdf:section1,section2" + ) + + file_path, sections = item.split(":", 1) + section_list = [s.strip() for s in sections.split(",") if s.strip()] + + pdf_inputs.append({ + "file": file_path, + "sections": section_list + }) + + return pdf_inputs diff --git a/python/pdfcreator/terminal.sh b/python/pdfcreator/terminal.sh new file mode 100755 index 0000000..d7016a9 --- /dev/null +++ b/python/pdfcreator/terminal.sh @@ -0,0 +1,6 @@ +#!/bin/bash +export CT2_CUDA_ALLOW_FP16=1 + +# 'mamba run' executes the command within the context of the environment +# without needing to source .bashrc or shell hooks manually. 
+mamba run -n base python ~/family-repo/Code/python/pdfcreator/driver.py "$@" \ No newline at end of file diff --git a/python/pdfcreator/tests.py b/python/pdfcreator/tests.py new file mode 100644 index 0000000..4150dc2 --- /dev/null +++ b/python/pdfcreator/tests.py @@ -0,0 +1,128 @@ +import unittest +from pypdf import PdfWriter, PageObject +from types import SimpleNamespace +from pdfaggregator import parse_inputs, strip_numbering, crop_page, extract_page_range, extract_section_prefix, parse_page_range, find_section_with_level, find_end_page + + +class TestPdfExtractionFunctions(unittest.TestCase): + def setUp(self): + # Dummy PDF with 5 blank pages + self.writer = PdfWriter() + for _ in range(5): + self.writer.add_page( + PageObject.create_blank_page(width=600, height=800)) + self.reader = self.writer # pypdf writer can be used as reader for pages list + self.content_writer = PdfWriter() + self.outline_tree = [{"title": "Section1", "page": 0, "children": [ + {"title": "Section1.1", "page": 1, "children": []}]}] + + def test_extract_page_range(self): + current_page, REFERENCE_BOX, toc = extract_page_range( + "@1-3", self.reader, self.content_writer, 0, None) + self.assertEqual(len(self.content_writer.pages), 3) # pages 0 and 1 + self.assertEqual(toc["title"], "Pages 1-3") + self.assertEqual(current_page, 3) + + def test_extract_section_prefix(self): + current_page, REFERENCE_BOX, toc = extract_section_prefix( + "Section1", self.reader, self.content_writer, 0, None, self.outline_tree) + self.assertEqual(len(self.content_writer.pages), + 5) # page 0 + subsection 1 + self.assertEqual(toc["title"], "Section1") + self.assertEqual(current_page, 5) + + +class TestPdfAggregator(unittest.TestCase): + + def test_parse_page_range(self): + self.assertEqual(parse_page_range("@1-5"), [0, 1, 2, 3, 4]) + self.assertEqual(parse_page_range("@10-12"), [9, 10, 11]) + self.assertIsNone(parse_page_range("1.3")) + self.assertIsNone(parse_page_range("Introduction-Overview")) + + def 
test_strip_numbering(self): + self.assertEqual(strip_numbering("1.3 Background"), "Background") + self.assertEqual(strip_numbering( + "2.1.5 Experimental Setup"), "Experimental Setup") + self.assertEqual(strip_numbering("NoNumberingHere"), "NoNumberingHere") + + def test_crop_page(self): + page = PageObject.create_blank_page(width=600, height=800) + crop_page(page, top_ratio=0.1, bottom_ratio=0.05) + llx, lly = page.cropbox.lower_left + urx, ury = page.cropbox.upper_right + self.assertAlmostEqual(ury - lly, 800 * 0.85) + + +class TestParseInputs(unittest.TestCase): + + def test_single_pdf_single_section(self): + args = SimpleNamespace( + inputs=["doc1.pdf:1.3"] + ) + + result = parse_inputs(args) + + self.assertEqual(result, [ + { + "file": "doc1.pdf", + "sections": ["1.3"] + } + ]) + + def test_single_pdf_multiple_sections(self): + args = SimpleNamespace( + inputs=["doc1.pdf:1.3,2.1,@10-20"] + ) + + result = parse_inputs(args) + + self.assertEqual(result, [ + { + "file": "doc1.pdf", + "sections": ["1.3", "2.1", "@10-20"] + } + ]) + + def test_multiple_pdfs(self): + args = SimpleNamespace( + inputs=[ + "doc1.pdf:1.3,@5-10", + "doc2.pdf:Introduction,3.2" + ] + ) + + result = parse_inputs(args) + + self.assertEqual(result, [ + { + "file": "doc1.pdf", + "sections": ["1.3", "@5-10"] + }, + { + "file": "doc2.pdf", + "sections": ["Introduction", "3.2"] + } + ]) + + def test_whitespace_is_trimmed(self): + args = SimpleNamespace( + inputs=["doc1.pdf: 1.3 , @5-10 , Introduction "] + ) + + result = parse_inputs(args) + + self.assertEqual(result[0]["sections"], [ + "1.3", "@5-10", "Introduction"]) + + def test_missing_colon_raises_error(self): + args = SimpleNamespace( + inputs=["doc1.pdf"] + ) + + with self.assertRaises(ValueError): + parse_inputs(args) + + +if __name__ == "__main__": + unittest.main()