# 129 lines, 4.1 KiB, Python
import unittest
|
|
from pypdf import PdfWriter, PageObject
|
|
from types import SimpleNamespace
|
|
from pdfaggregator import parse_inputs, strip_numbering, crop_page, extract_page_range, extract_section_prefix, parse_page_range, find_section_with_level, find_end_page
|
|
|
|
|
|
class TestPdfExtractionFunctions(unittest.TestCase):
    """Tests for extract_page_range and extract_section_prefix.

    Each test runs against an in-memory document of five blank pages and
    copies pages into a fresh, empty PdfWriter.
    """

    def setUp(self):
        """Create the five-page source, the empty destination and an outline."""
        # Dummy PDF with 5 blank pages
        self.writer = PdfWriter()
        for _ in range(5):
            self.writer.add_page(
                PageObject.create_blank_page(width=600, height=800))
        # pypdf writer can be used as reader for pages list
        self.reader = self.writer
        self.content_writer = PdfWriter()
        # One top-level section on page 0 with a single child on page 1.
        self.outline_tree = [{"title": "Section1", "page": 0, "children": [
            {"title": "Section1.1", "page": 1, "children": []}]}]

    def test_extract_page_range(self):
        """A "@1-3" spec copies three pages and advances the page counter."""
        # The second returned value is not checked here, hence the
        # underscore-prefixed name.
        current_page, _reference_box, toc = extract_page_range(
            "@1-3", self.reader, self.content_writer, 0, None)
        # "@1-3" selects three pages (source indices 0, 1 and 2)
        self.assertEqual(len(self.content_writer.pages), 3)
        self.assertEqual(toc["title"], "Pages 1-3")
        self.assertEqual(current_page, 3)

    def test_extract_section_prefix(self):
        """Extracting "Section1" copies the whole five-page document."""
        current_page, _reference_box, toc = extract_section_prefix(
            "Section1", self.reader, self.content_writer, 0, None,
            self.outline_tree)
        # All 5 pages are expected. NOTE(review): presumably because the
        # outline has no later sibling, so the section runs to the end of
        # the document -- confirm against find_end_page.
        self.assertEqual(len(self.content_writer.pages), 5)
        self.assertEqual(toc["title"], "Section1")
        self.assertEqual(current_page, 5)
|
|
|
|
|
|
class TestPdfAggregator(unittest.TestCase):
    """Tests for the small pdfaggregator helpers that need no source PDF."""

    def test_parse_page_range(self):
        """"@a-b" specs become 0-based page index lists; other text gives None."""
        self.assertEqual(parse_page_range("@1-5"), [0, 1, 2, 3, 4])
        self.assertEqual(parse_page_range("@10-12"), [9, 10, 11])
        # Specs without the "@" prefix are not page ranges.
        self.assertIsNone(parse_page_range("1.3"))
        self.assertIsNone(parse_page_range("Introduction-Overview"))

    def test_strip_numbering(self):
        """Leading dotted section numbers are removed; plain titles are kept."""
        self.assertEqual(strip_numbering("1.3 Background"), "Background")
        self.assertEqual(strip_numbering(
            "2.1.5 Experimental Setup"), "Experimental Setup")
        self.assertEqual(strip_numbering("NoNumberingHere"), "NoNumberingHere")

    def test_crop_page(self):
        """Cropping 10% off the top and 5% off the bottom leaves 85% height."""
        page = PageObject.create_blank_page(width=600, height=800)
        crop_page(page, top_ratio=0.1, bottom_ratio=0.05)
        # Only the vertical extent is asserted, so the x coordinates are
        # bound to throwaway names.
        _llx, lly = page.cropbox.lower_left
        _urx, ury = page.cropbox.upper_right
        # float() keeps the subtraction in plain floats whatever numeric
        # type the crop box coordinates come back as.
        self.assertAlmostEqual(float(ury) - float(lly), 800 * 0.85)
|
|
|
|
|
|
class TestParseInputs(unittest.TestCase):
    """Tests for parse_inputs, which reads "file:section,section" specs.

    The argument object is a SimpleNamespace carrying only the ``inputs``
    attribute, which is the only attribute these tests supply.
    """

    def test_single_pdf_single_section(self):
        """One file with one section yields a single entry."""
        args = SimpleNamespace(
            inputs=["doc1.pdf:1.3"]
        )

        result = parse_inputs(args)

        self.assertEqual(result, [
            {
                "file": "doc1.pdf",
                "sections": ["1.3"]
            }
        ])

    def test_single_pdf_multiple_sections(self):
        """Comma-separated sections are split and kept in order."""
        args = SimpleNamespace(
            inputs=["doc1.pdf:1.3,2.1,@10-20"]
        )

        result = parse_inputs(args)

        self.assertEqual(result, [
            {
                "file": "doc1.pdf",
                "sections": ["1.3", "2.1", "@10-20"]
            }
        ])

    def test_multiple_pdfs(self):
        """Each input string produces its own entry, in input order."""
        args = SimpleNamespace(
            inputs=[
                "doc1.pdf:1.3,@5-10",
                "doc2.pdf:Introduction,3.2"
            ]
        )

        result = parse_inputs(args)

        self.assertEqual(result, [
            {
                "file": "doc1.pdf",
                "sections": ["1.3", "@5-10"]
            },
            {
                "file": "doc2.pdf",
                "sections": ["Introduction", "3.2"]
            }
        ])

    def test_whitespace_is_trimmed(self):
        """Whitespace around each section name is stripped."""
        args = SimpleNamespace(
            inputs=["doc1.pdf: 1.3 , @5-10 , Introduction "]
        )

        result = parse_inputs(args)

        self.assertEqual(result[0]["sections"], [
            "1.3", "@5-10", "Introduction"])

    def test_missing_colon_raises_error(self):
        """An input without the "file:sections" colon raises ValueError."""
        args = SimpleNamespace(
            inputs=["doc1.pdf"]
        )

        with self.assertRaises(ValueError):
            parse_inputs(args)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run every TestCase in this module when the file is executed directly.
    unittest.main()
|