automine draft

allenai · Feb 27, 2025 · 505e08c · 505e08c
1 parent ae7efd3
commit 505e08c
Showing 1 changed file with 98 additions and 10 deletions.
diff --git a/olmocr/bench/miners/automine.py b/olmocr/bench/miners/automine.py
@@ -1,14 +1,102 @@
-# In this script, we assume that you have used the olmocr.bench.convert script to convert
-# a number of PDFs, each perhaps number of repeat times.
+import os
+import argparse
+from difflib import SequenceMatcher
+from collections import Counter
 
-# Now, we automatically search for some rules amongst these. Such that the chosen
-# diff is correct amongst most of the "clean" model, and perhaps wrong most often
+import syntok.segmenter as segmenter
+import syntok.tokenizer as tokenizer
 
+def parse_sentences(text: str) -> list[str]:
+    """
+    Splits a text into a list of sentence strings using syntok.
+    """
+    sentences = []
+    for paragraph in segmenter.process(text):
+        for sentence in paragraph:
+            # Collect token values, stripping out empty strings
+            token_values = [token.value for token in sentence if token.value.strip()]
+            # Join them with a space
+            sentence_str = " ".join(token_values)
+            sentences.append(sentence_str)
+    return sentences
 
-def merge_median_document(base: str, candidates: list[str]) -> str:
-    # Split base into sentences using syntok
-    # Find matching sentences using from fuzzysearch import find_near_matches from amongst candidates
-    # For each sentence, we build a list of candidates, with their counts
+def compare_votes_for_file(base_text: str, candidate_texts: list[str]) -> None:
+    """
+    For each sentence in the base text, finds the best matching sentence from
+    each candidate text (using a similarity threshold). If any candidate sentences
+    differ from the base sentence, prints the base sentence along with each unique
+    variant and the number of times it was chosen.
+    """
+    base_sentences = parse_sentences(base_text)
+    # Parse all candidate texts into lists of sentences
+    candidate_sentences_list = [parse_sentences(ct) for ct in candidate_texts]
 
-    # Return a new document where for each sentence, we replace each sentence using the one which
-    # appears most often inside of candidates
+    for b_sentence in base_sentences:
+        votes = []
+        for c_sentences in candidate_sentences_list:
+            best_ratio = 0.0
+            best_candidate = None
+
+            # Find the candidate sentence with the highest similarity to b_sentence
+            for c_sentence in c_sentences:
+                ratio = SequenceMatcher(None, b_sentence, c_sentence).ratio()
+                if ratio > best_ratio:
+                    best_ratio = ratio
+                    best_candidate = c_sentence
+
+            # Append the candidate if it passes the similarity threshold (e.g., 0.7)
+            if best_ratio > 0.7 and best_candidate is not None:
+                votes.append(best_candidate)
+
+        # Only consider variants that differ from the base sentence
+        variant_votes = [vote for vote in votes if vote != b_sentence]
+        if variant_votes:
+            print("Base Sentence:")
+            print(b_sentence)
+            print("Variants:")
+            counts = Counter(variant_votes)
+            for variant, count in counts.items():
+                print(f"{count}x: {variant}")
+            print("-" * 40)
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Compares sentences from base and candidate texts, printing differences."
+    )
+    parser.add_argument(
+        "--base",
+        default=os.path.join(os.path.dirname(__file__), "chatgpt"),
+        help="Path to the folder containing base .md files."
+    )
+    parser.add_argument(
+        "--compare",
+        default=os.path.join(os.path.dirname(__file__), "olmocr"),
+        help="Path to the folder containing candidate .md files."
+    )
+    args = parser.parse_args()
+
+    base_path = args.base
+    compare_path = args.compare
+
+    # Collect all .md files from the base and compare folders
+    base_files = [f for f in os.listdir(base_path) if f.endswith(".md")]
+    compare_files = [f for f in os.listdir(compare_path) if f.endswith(".md")]
+
+    # Read all candidate texts at once
+    candidate_texts = []
+    for cf in compare_files:
+        with open(os.path.join(compare_path, cf), "r", encoding="utf-8") as f:
+            candidate_texts.append(f.read())
+
+    # Process each base file and print out the vote differences
+    for bf in base_files:
+        base_file_path = os.path.join(base_path, bf)
+        with open(base_file_path, "r", encoding="utf-8") as f:
+            base_text = f.read()
+
+        print(f"Results for base file: {bf}")
+        compare_votes_for_file(base_text, candidate_texts)
+        print("=" * 80)
+
+if __name__ == "__main__":
+    main()