Skip to content

Commit 2ac59c8

Browse files
authored
Add metrics for ScoreFilter benchmarks (#1385)
* Add metrics for `ScoreFilter` benchmarks

  Signed-off-by: Sarah Yurick <[email protected]>

* greptile suggestion

  Signed-off-by: Sarah Yurick <[email protected]>

* use exact_value

  Signed-off-by: Sarah Yurick <[email protected]>

* add more things

  Signed-off-by: Sarah Yurick <[email protected]>

---------

Signed-off-by: Sarah Yurick <[email protected]>
1 parent 2f8f3f8 commit 2ac59c8

File tree

2 files changed

+37
-4
lines changed

2 files changed

+37
-4
lines changed

benchmarking/nightly-benchmark.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,10 +248,25 @@ entries:
248248
--yaml-config={curator_repo_dir}/nemo_curator/config/text/heuristic_filter_english_pipeline.yaml
249249
--overrides="stages.0._target_=nemo_curator.stages.text.io.reader.ParquetReader"
250250
timeout_s: 20000
251+
sink_data:
252+
- name: slack
253+
additional_metrics:
254+
- num_kept_documents
255+
- throughput_docs_per_sec
251256
ray:
252257
num_cpus: 64
253258
num_gpus: 0
254259
enable_object_spilling: false
260+
requirements:
261+
# ensure the total number of documents processed is correct
262+
- metric: num_documents_processed
263+
exact_value: 2119489
264+
# account for stochastic filters
265+
- metric: num_kept_documents
266+
min_value: 2090470
267+
max_value: 2090490
268+
- metric: throughput_docs_per_sec
269+
min_value: 19000
255270

256271
- name: score_filter_xenna
257272
enabled: true
@@ -263,10 +278,25 @@ entries:
263278
--yaml-config={curator_repo_dir}/nemo_curator/config/text/heuristic_filter_english_pipeline.yaml
264279
--overrides="stages.0._target_=nemo_curator.stages.text.io.reader.ParquetReader"
265280
timeout_s: 20000
281+
sink_data:
282+
- name: slack
283+
additional_metrics:
284+
- num_kept_documents
285+
- throughput_docs_per_sec
266286
ray:
267287
num_cpus: 64
268288
num_gpus: 0
269289
enable_object_spilling: false
290+
requirements:
291+
# ensure the total number of documents processed is correct
292+
- metric: num_documents_processed
293+
exact_value: 2119489
294+
# account for stochastic filters
295+
- metric: num_kept_documents
296+
min_value: 2090470
297+
max_value: 2090490
298+
- metric: throughput_docs_per_sec
299+
min_value: 8500
270300

271301
- name: image_curation
272302
enabled: true

benchmarking/scripts/score_filter_benchmark.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,12 +96,13 @@ def run_score_filter_classification_benchmark( # noqa: PLR0913
9696
output_tasks = pipeline.run(executor)
9797
run_time_taken = time.perf_counter() - run_start_time
9898

99-
# task._metadata is a dictionary of metadata for the task, but will not be used here.
100-
# Instead simply use the num_items property of the task to get the number of documents processed.
101-
num_documents_processed = sum(task.num_items for task in output_tasks)
99+
# _stage_perf[0] is the file partitioning stage, so _stage_perf[1] is the file reading stage
100+
num_documents_processed = sum(task._stage_perf[1].num_items_processed for task in output_tasks)
101+
num_kept_documents = sum(task._stage_perf[-1].num_items_processed for task in output_tasks)
102102

103103
logger.success(f"Benchmark completed in {run_time_taken:.2f}s")
104-
logger.success(f"Processed {num_documents_processed} documents")
104+
logger.success(f"Processed {num_documents_processed} rows (documents)")
105+
logger.success(f"Kept {num_kept_documents} out of {num_documents_processed} rows (documents)")
105106
success = True
106107

107108
except Exception as e: # noqa: BLE001
@@ -111,6 +112,7 @@ def run_score_filter_classification_benchmark( # noqa: PLR0913
111112
output_tasks = []
112113
run_time_taken = time.perf_counter() - run_start_time
113114
num_documents_processed = 0
115+
num_kept_documents = 0
114116
success = False
115117

116118
return {
@@ -124,6 +126,7 @@ def run_score_filter_classification_benchmark( # noqa: PLR0913
124126
"is_success": success,
125127
"time_taken_s": run_time_taken,
126128
"num_documents_processed": num_documents_processed,
129+
"num_kept_documents": num_kept_documents,
127130
"num_output_tasks": len(output_tasks),
128131
"throughput_docs_per_sec": num_documents_processed / run_time_taken if run_time_taken > 0 else 0,
129132
},

0 commit comments

Comments (0)