"""Unit tests for the ``pdfaggregator`` helper functions."""

import unittest
from types import SimpleNamespace

from pypdf import PageObject, PdfWriter

# NOTE: find_end_page and find_section_with_level are imported but not yet
# exercised by any test in this module; the imports are kept as-is.
from pdfaggregator import (
    crop_page,
    extract_page_range,
    extract_section_prefix,
    find_end_page,
    find_section_with_level,
    parse_inputs,
    parse_page_range,
    strip_numbering,
)


class TestPdfExtractionFunctions(unittest.TestCase):
    """Exercise the extraction helpers against an in-memory five-page PDF."""

    def setUp(self):
        """Build a blank five-page source, an empty output writer and an outline."""
        # Dummy PDF with 5 blank pages
        self.writer = PdfWriter()
        for _ in range(5):
            self.writer.add_page(
                PageObject.create_blank_page(width=600, height=800))

        # pypdf writer can be used as reader for pages list
        self.reader = self.writer
        self.content_writer = PdfWriter()

        # One top-level section starting on page 0 with a single child on page 1.
        self.outline_tree = [
            {
                "title": "Section1",
                "page": 0,
                "children": [
                    {"title": "Section1.1", "page": 1, "children": []},
                ],
            },
        ]

    def test_extract_page_range(self):
        """``@1-3`` copies three pages and reports a matching TOC entry."""
        # The second element of the returned triple is not checked here
        # (presumably the reference box -- confirm against pdfaggregator).
        current_page, _, toc = extract_page_range(
            "@1-3", self.reader, self.content_writer, 0, None)

        # Three pages are expected in the output for the range "@1-3".
        self.assertEqual(len(self.content_writer.pages), 3)
        self.assertEqual(toc["title"], "Pages 1-3")
        self.assertEqual(current_page, 3)

    def test_extract_section_prefix(self):
        """Extracting ``Section1`` copies all five pages of the dummy PDF."""
        current_page, _, toc = extract_section_prefix(
            "Section1", self.reader, self.content_writer, 0, None,
            self.outline_tree)

        # All 5 pages are expected: Section1 is the only top-level outline
        # entry, so it presumably runs to the end of the document -- confirm
        # against find_end_page.
        self.assertEqual(len(self.content_writer.pages), 5)
        self.assertEqual(toc["title"], "Section1")
        self.assertEqual(current_page, 5)


class TestPdfAggregator(unittest.TestCase):
    """Tests for the small parsing and page-manipulation helpers."""

    def test_parse_page_range(self):
        """``@a-b`` yields zero-based indices; other specs yield ``None``."""
        self.assertEqual(parse_page_range("@1-5"), [0, 1, 2, 3, 4])
        self.assertEqual(parse_page_range("@10-12"), [9, 10, 11])
        self.assertIsNone(parse_page_range("1.3"))
        self.assertIsNone(parse_page_range("Introduction-Overview"))

    def test_strip_numbering(self):
        """A leading dotted section number is removed from a title."""
        self.assertEqual(strip_numbering("1.3 Background"), "Background")
        self.assertEqual(
            strip_numbering("2.1.5 Experimental Setup"), "Experimental Setup")
        self.assertEqual(strip_numbering("NoNumberingHere"), "NoNumberingHere")

    def test_crop_page(self):
        """Cropping 10% off the top and 5% off the bottom leaves 85% height."""
        page = PageObject.create_blank_page(width=600, height=800)
        crop_page(page, top_ratio=0.1, bottom_ratio=0.05)

        # Only the vertical extent is checked; x coordinates are discarded.
        _, lly = page.cropbox.lower_left
        _, ury = page.cropbox.upper_right
        self.assertAlmostEqual(ury - lly, 800 * 0.85)


class TestParseInputs(unittest.TestCase):
    """Tests for ``parse_inputs`` on ``file:section,section,...`` specs."""

    def test_single_pdf_single_section(self):
        """One file with one section produces a single entry."""
        args = SimpleNamespace(inputs=["doc1.pdf:1.3"])
        result = parse_inputs(args)
        self.assertEqual(result, [
            {"file": "doc1.pdf", "sections": ["1.3"]},
        ])

    def test_single_pdf_multiple_sections(self):
        """Comma-separated sections are split in order."""
        args = SimpleNamespace(inputs=["doc1.pdf:1.3,2.1,@10-20"])
        result = parse_inputs(args)
        self.assertEqual(result, [
            {"file": "doc1.pdf", "sections": ["1.3", "2.1", "@10-20"]},
        ])

    def test_multiple_pdfs(self):
        """Each input string yields its own entry, in input order."""
        args = SimpleNamespace(
            inputs=[
                "doc1.pdf:1.3,@5-10",
                "doc2.pdf:Introduction,3.2",
            ]
        )
        result = parse_inputs(args)
        self.assertEqual(result, [
            {"file": "doc1.pdf", "sections": ["1.3", "@5-10"]},
            {"file": "doc2.pdf", "sections": ["Introduction", "3.2"]},
        ])

    def test_whitespace_is_trimmed(self):
        """Whitespace around individual section specs is stripped."""
        args = SimpleNamespace(
            inputs=["doc1.pdf: 1.3 , @5-10 , Introduction "])
        result = parse_inputs(args)
        self.assertEqual(
            result[0]["sections"], ["1.3", "@5-10", "Introduction"])

    def test_missing_colon_raises_error(self):
        """An input without a ``:`` separator raises ``ValueError``."""
        args = SimpleNamespace(inputs=["doc1.pdf"])
        with self.assertRaises(ValueError):
            parse_inputs(args)


if __name__ == "__main__":
    unittest.main()