Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Oct 30, 2024
2 parents 43aa4f2 + c652c7e commit a0e0917
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 8 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ requires-python = ">=3.8"
dependencies = [
"cached-path",
"smart_open",
"pypdf",
"pypdf>=5.1.0",
"pymupdf",
"pypdfium2",
"cryptography",
Expand Down
Binary file added tests/gnarly_pdfs/failing_pdf_pg9.pdf
Binary file not shown.
Binary file added tests/gnarly_pdfs/not_parsing.pdf
Binary file not shown.
Binary file added tests/gnarly_pdfs/not_parsing2.pdf
Binary file not shown.
14 changes: 7 additions & 7 deletions tests/test_anchor.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def testLargePromptHint1(self):

print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 1000)
self.assertLessEqual(len(anchor_text), 1000)

def testLargePromptHint2(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint2.pdf")
Expand All @@ -83,7 +83,7 @@ def testLargePromptHint2(self):

print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 4000)
self.assertLessEqual(len(anchor_text), 4000)

def testLargePromptHint3(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "large_prompt_hint3.pdf")
Expand All @@ -92,7 +92,7 @@ def testLargePromptHint3(self):

print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 4000)
self.assertLessEqual(len(anchor_text), 4000)

def testNewsPaperPromptHint(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "newspaper.pdf")
Expand All @@ -101,7 +101,7 @@ def testNewsPaperPromptHint(self):

print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 4000)
self.assertLessEqual(len(anchor_text), 4000)

def testTobaccoPaperMissingParagraphs(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")
Expand All @@ -110,7 +110,7 @@ def testTobaccoPaperMissingParagraphs(self):

print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 4000)
self.assertLessEqual(len(anchor_text), 4000)

def testAnchorOtherLengths(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "tobacco_missed_tokens_pg1.pdf")
Expand All @@ -119,13 +119,13 @@ def testAnchorOtherLengths(self):

print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 2000)
self.assertLessEqual(len(anchor_text), 2000)

anchor_text = get_anchor_text(local_pdf_path, 1, pdf_engine="pdfreport", target_length=6000)

print(anchor_text)
print(len(anchor_text))
self.assertLess(len(anchor_text), 6000)
self.assertLessEqual(len(anchor_text), 6000)

def testFailingAnchor(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "failing_anchor_pg4.pdf")
Expand Down
36 changes: 36 additions & 0 deletions tests/test_birrpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,42 @@ def get_s3_bytes_side_effect(s3_client, s3_path, start_index=None, end_index=Non


class TestBuildPageQuery(unittest.TestCase):
def testNotParsing(self):
    """Smoke-test ``build_page_query`` on pages 1 through 8 of ``not_parsing.pdf``.

    Nothing is asserted: the test passes as long as no call raises.
    Each resulting query is printed for manual inspection.
    """
    pdf_name = "not_parsing.pdf"
    tests_dir = os.path.dirname(__file__)
    pdf_path = os.path.join(tests_dir, "gnarly_pdfs", pdf_name)

    # Page numbers are passed 1-based, covering 1..8 inclusive.
    for page_num in range(1, 9):
        print(build_page_query(pdf_path, pdf_name, page_num, 1024, 6000))

def testNotParsing2(self):
    """Smoke-test ``build_page_query`` on pages 1 through 9 of ``not_parsing2.pdf``.

    Nothing is asserted: the test passes as long as no call raises.
    Each resulting query is printed for manual inspection.
    """
    pdf_name = "not_parsing2.pdf"
    tests_dir = os.path.dirname(__file__)
    pdf_path = os.path.join(tests_dir, "gnarly_pdfs", pdf_name)

    # Page numbers are passed 1-based, covering 1..9 inclusive.
    for page_num in range(1, 10):
        print(build_page_query(pdf_path, pdf_name, page_num, 1024, 6000))

def testNotParsingHugeMemoryUsage(self):
    """Smoke-test ``build_page_query`` on page 9 of ``failing_pdf_pg9.pdf``.

    Nothing is asserted: the test passes as long as the call returns
    without raising. NOTE(review): going by the test name this is
    presumably a regression case for excessive memory use while parsing
    this page -- confirm against the originating issue.
    """
    pdf_name = "failing_pdf_pg9.pdf"
    tests_dir = os.path.dirname(__file__)
    pdf_path = os.path.join(tests_dir, "gnarly_pdfs", pdf_name)

    # Progress marker emitted before the call, so a hang or crash can be
    # located in the captured output.
    print("Starting to parse bad pdf")

    result = build_page_query(pdf_path, pdf_name, 9, 1024, 6000)

    print(result)


def testRotation(self):
# First, generate and save the non-rotated image
query = build_page_query(os.path.join(
Expand Down

0 comments on commit a0e0917

Please sign in to comment.