Skip to content

Commit 2ac59c8

Browse files
authored
Add metrics for ScoreFilter benchmarks (#1385)
* Add metrics for `ScoreFilter` benchmarks

  Signed-off-by: Sarah Yurick <[email protected]>

* greptile suggestion

  Signed-off-by: Sarah Yurick <[email protected]>

* use exact_value

  Signed-off-by: Sarah Yurick <[email protected]>

* add more things

  Signed-off-by: Sarah Yurick <[email protected]>

---------

Signed-off-by: Sarah Yurick <[email protected]>
1 parent 2f8f3f8 commit 2ac59c8

File tree

2 files changed

+37
-4
lines changed

2 files changed

+37
-4
lines changed

benchmarking/nightly-benchmark.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,10 +248,25 @@ entries:
248248
--yaml-config={curator_repo_dir}/nemo_curator/config/text/heuristic_filter_english_pipeline.yaml
249249
--overrides="stages.0._target_=nemo_curator.stages.text.io.reader.ParquetReader"
250250
timeout_s: 20000
251+
sink_data:
252+
- name: slack
253+
additional_metrics:
254+
- num_kept_documents
255+
- throughput_docs_per_sec
251256
ray:
252257
num_cpus: 64
253258
num_gpus: 0
254259
enable_object_spilling: false
260+
requirements:
261+
# ensure the total number of documents processed is correct
262+
- metric: num_documents_processed
263+
exact_value: 2119489
264+
# account for stochastic filters
265+
- metric: num_kept_documents
266+
min_value: 2090470
267+
max_value: 2090490
268+
- metric: throughput_docs_per_sec
269+
min_value: 19000
255270

256271
- name: score_filter_xenna
257272
enabled: true
@@ -263,10 +278,25 @@ entries:
263278
--yaml-config={curator_repo_dir}/nemo_curator/config/text/heuristic_filter_english_pipeline.yaml
264279
--overrides="stages.0._target_=nemo_curator.stages.text.io.reader.ParquetReader"
265280
timeout_s: 20000
281+
sink_data:
282+
- name: slack
283+
additional_metrics:
284+
- num_kept_documents
285+
- throughput_docs_per_sec
266286
ray:
267287
num_cpus: 64
268288
num_gpus: 0
269289
enable_object_spilling: false
290+
requirements:
291+
# ensure the total number of documents processed is correct
292+
- metric: num_documents_processed
293+
exact_value: 2119489
294+
# account for stochastic filters
295+
- metric: num_kept_documents
296+
min_value: 2090470
297+
max_value: 2090490
298+
- metric: throughput_docs_per_sec
299+
min_value: 8500
270300

271301
- name: image_curation
272302
enabled: true

benchmarking/scripts/score_filter_benchmark.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,12 +96,13 @@ def run_score_filter_classification_benchmark( # noqa: PLR0913
9696
output_tasks = pipeline.run(executor)
9797
run_time_taken = time.perf_counter() - run_start_time
9898

99-
# task._metadata is a dictionary of metadata for the task, but will not be used here.
100-
# Instead simply use the num_items property of the task to get the number of documents processed.
101-
num_documents_processed = sum(task.num_items for task in output_tasks)
99+
# _stage_perf[0] is the file partitioning stage, so _stage_perf[1] is the file reading stage
100+
num_documents_processed = sum(task._stage_perf[1].num_items_processed for task in output_tasks)
101+
num_kept_documents = sum(task._stage_perf[-1].num_items_processed for task in output_tasks)
102102

103103
logger.success(f"Benchmark completed in {run_time_taken:.2f}s")
104-
logger.success(f"Processed {num_documents_processed} documents")
104+
logger.success(f"Processed {num_documents_processed} rows (documents)")
105+
logger.success(f"Kept {num_kept_documents} out of {num_documents_processed} rows (documents)")
105106
success = True
106107

107108
except Exception as e: # noqa: BLE001
@@ -111,6 +112,7 @@ def run_score_filter_classification_benchmark( # noqa: PLR0913
111112
output_tasks = []
112113
run_time_taken = time.perf_counter() - run_start_time
113114
num_documents_processed = 0
115+
num_kept_documents = 0
114116
success = False
115117

116118
return {
@@ -124,6 +126,7 @@ def run_score_filter_classification_benchmark( # noqa: PLR0913
124126
"is_success": success,
125127
"time_taken_s": run_time_taken,
126128
"num_documents_processed": num_documents_processed,
129+
"num_kept_documents": num_kept_documents,
127130
"num_output_tasks": len(output_tasks),
128131
"throughput_docs_per_sec": num_documents_processed / run_time_taken if run_time_taken > 0 else 0,
129132
},

0 commit comments

Comments (0)