generated from allenai/python-package-template
-
Notifications
You must be signed in to change notification settings - Fork 233
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ae7efd3
commit 505e08c
Showing
1 changed file
with
98 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,102 @@ | ||
# In this script, we assume that you have used the olmocr.bench.convert script to convert | ||
# a number of PDFs, each perhaps number of repeat times. | ||
import os | ||
import argparse | ||
from difflib import SequenceMatcher | ||
from collections import Counter | ||
|
||
# Now, we automatically search for some rules amongst these. Such that the chosen | ||
# diff is correct amongst most of the "clean" model, and perhaps wrong most often | ||
import syntok.segmenter as segmenter | ||
import syntok.tokenizer as tokenizer | ||
|
||
def parse_sentences(text: str) -> list[str]:
    """
    Split *text* into sentences with syntok and return them as strings.

    Each sentence is rebuilt by joining its non-whitespace token values
    with single spaces, so the original inter-token spacing is NOT
    preserved — callers compare sentences, not exact bytes.
    """
    result: list[str] = []
    for paragraph in segmenter.process(text):
        for sentence in paragraph:
            # Drop tokens that are pure whitespace, then rebuild the sentence.
            words = [tok.value for tok in sentence if tok.value.strip()]
            result.append(" ".join(words))
    return result
def merge_median_document(base: str, candidates: list[str]) -> str:
    """
    Build a consensus document by majority vote over candidate sentences.

    For each sentence of *base*, the most similar sentence (by
    ``SequenceMatcher`` ratio, threshold > 0.7 — the same threshold the
    sibling ``compare_votes_for_file`` uses) is collected from every
    candidate document. The base sentence itself also gets one vote, so
    on a tie the base text wins (``Counter.most_common`` is
    insertion-ordered for equal counts). The winning variants are joined
    with single spaces into the returned document.

    NOTE(review): the original block was a comments-only stub, which is a
    SyntaxError in Python; this implements the behavior the stub's
    comments describe.

    :param base: reference document text.
    :param candidates: alternative conversions of the same document.
    :return: merged document text; empty string if *base* has no sentences.
    """
    candidate_sentence_lists = [parse_sentences(c) for c in candidates]
    merged_sentences: list[str] = []

    for b_sentence in parse_sentences(base):
        # The base sentence votes for itself so ties keep the base text.
        votes = [b_sentence]
        for c_sentences in candidate_sentence_lists:
            best_ratio = 0.0
            best_candidate = None
            for c_sentence in c_sentences:
                ratio = SequenceMatcher(None, b_sentence, c_sentence).ratio()
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_candidate = c_sentence
            # Only sufficiently-similar matches count as votes.
            if best_candidate is not None and best_ratio > 0.7:
                votes.append(best_candidate)
        merged_sentences.append(Counter(votes).most_common(1)[0][0])

    return " ".join(merged_sentences)
def compare_votes_for_file(base_text: str, candidate_texts: list[str]) -> None:
    """
    Report, per base sentence, how the candidate texts disagree with it.

    For every sentence in *base_text*, the closest sentence (highest
    ``SequenceMatcher`` ratio) is selected from each candidate text;
    selections scoring 0.7 or below are discarded. Selected sentences that
    are not identical to the base sentence are printed together with how
    many candidates voted for each distinct variant.
    """
    # Parse every candidate document once up front.
    candidate_sentence_lists = [parse_sentences(text) for text in candidate_texts]

    for base_sentence in parse_sentences(base_text):
        votes: list[str] = []
        for sentences in candidate_sentence_lists:
            top_score = 0.0
            best_match = None
            # Pick this candidate's most similar sentence.
            for candidate in sentences:
                score = SequenceMatcher(None, base_sentence, candidate).ratio()
                if score > top_score:
                    top_score = score
                    best_match = candidate
            # One vote per candidate document, similarity-gated.
            if best_match is not None and top_score > 0.7:
                votes.append(best_match)

        # Only variants that actually differ from the base are reported.
        disagreements = Counter(v for v in votes if v != base_sentence)
        if disagreements:
            print("Base Sentence:")
            print(base_sentence)
            print("Variants:")
            for variant, count in disagreements.items():
                print(f"{count}x: {variant}")
            print("-" * 40)
def main():
    """
    Compare base .md conversions against candidate .md conversions.

    Reads every ``.md`` file in ``--compare`` once, then for each ``.md``
    file in ``--base`` prints the per-sentence vote differences via
    ``compare_votes_for_file``. Defaults point at the ``chatgpt`` and
    ``olmocr`` folders next to this script.

    Fix: directory listings are now ``sorted()`` — ``os.listdir`` returns
    entries in arbitrary order, which made the output order
    non-deterministic across runs.
    """
    parser = argparse.ArgumentParser(
        description="Compares sentences from base and candidate texts, printing differences."
    )
    parser.add_argument(
        "--base",
        default=os.path.join(os.path.dirname(__file__), "chatgpt"),
        help="Path to the folder containing base .md files."
    )
    parser.add_argument(
        "--compare",
        default=os.path.join(os.path.dirname(__file__), "olmocr"),
        help="Path to the folder containing candidate .md files."
    )
    args = parser.parse_args()

    base_path = args.base
    compare_path = args.compare

    # sorted() gives a stable, reproducible processing/output order.
    base_files = sorted(f for f in os.listdir(base_path) if f.endswith(".md"))
    compare_files = sorted(f for f in os.listdir(compare_path) if f.endswith(".md"))

    # Read all candidate texts once; they are reused for every base file.
    candidate_texts = []
    for cf in compare_files:
        with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
            candidate_texts.append(f.read())

    # Process each base file and print out the vote differences.
    for bf in base_files:
        with open(os.path.join(base_path, bf), "r", encoding="utf-8") as f:
            base_text = f.read()

        print(f"Results for base file: {bf}")
        compare_votes_for_file(base_text, candidate_texts)
        print("=" * 80)
# Script entry point: run the comparison only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()