Skip to content

Commit

Permalink
More fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
jakep-allenai committed Nov 18, 2024
1 parent 8793fc7 commit 9e2e09b
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 4 deletions.
2 changes: 0 additions & 2 deletions pdelfin/beakerpipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,9 +811,7 @@ async def main():
asyncio.run(main())

# TODO
# - Fix loading of the model checkpoints; it's so flaky now, maybe use datasets
# - Add logging of failed pages and have the stats function read them
# - Fall back to a different method if < 2% of pages have failed; make that configurable
# - SGLang: commit a fix for the context length issue
# - pypdf fix for the 'v' error
# - Get a solid benchmark on the stream vs non stream approach
Expand Down
16 changes: 15 additions & 1 deletion pdelfin/prompts/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,24 @@ class PageResponse:
natural_text: Optional[str]

def __post_init__(self):
    """Validate field values and types after the instance is initialised.

    Raises:
        ValueError: If ``rotation_correction`` is not one of 0, 90, 180, 270.
        TypeError: If any field is not of its declared type. A ``bool`` is
            rejected for ``rotation_correction`` even though ``bool`` is a
            subclass of ``int``.
    """
    # Validate rotation_correction is one of the allowed values
    if self.rotation_correction not in {0, 90, 180, 270}:
        raise ValueError("rotation_correction must be one of [0, 90, 180, 270].")

    # Type checks
    if not isinstance(self.primary_language, (str, type(None))):
        raise TypeError("primary_language must be of type Optional[str].")
    if not isinstance(self.is_rotation_valid, bool):
        raise TypeError("is_rotation_valid must be of type bool.")
    # bool subclasses int and False == 0, so without the explicit bool test
    # False would pass both the membership check above and isinstance(int).
    if isinstance(self.rotation_correction, bool) or not isinstance(
        self.rotation_correction, int
    ):
        raise TypeError("rotation_correction must be of type int.")
    if not isinstance(self.is_table, bool):
        raise TypeError("is_table must be of type bool.")
    if not isinstance(self.is_diagram, bool):
        raise TypeError("is_diagram must be of type bool.")
    if not isinstance(self.natural_text, (str, type(None))):
        raise TypeError("natural_text must be of type Optional[str].")

def openai_response_format_schema() -> dict:
return {
"type": "json_schema",
Expand Down
2 changes: 1 addition & 1 deletion pdelfin/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
_MINOR = "1"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "30"
_PATCH = "32"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
Expand Down
3 changes: 3 additions & 0 deletions scripts/beaker/Dockerfile-inference
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ WORKDIR sglang/python
RUN git checkout eff468dd5a3d24646560eb044276585f7a11ac3c
RUN /root/.local/bin/uv pip install --system --no-cache -e .[all]

# TODO: Remove this pinned git install of pypdf once pypdf > 5.10 comes out
RUN /root/.local/bin/uv pip install --system --no-cache git+https://github.com/py-pdf/pypdf.git@c6e43374ab002d76811ec85333fdc2c82c268251

WORKDIR /root
COPY pdelfin pdelfin

Expand Down

0 comments on commit 9e2e09b

Please sign in to comment.