20 changes: 18 additions & 2 deletions docs/evaluations/inference-evaluations/cli-reference.mdx
@@ -84,6 +84,22 @@ Each evaluator's precision threshold should be a positive number.

If adaptive stopping is enabled for all evaluators, then the evaluation will stop once all evaluators have met their targets or all datapoints have been evaluated.

#### `--cutoffs EVALUATOR=CUTOFF[,...]`

- **Example:** `--cutoffs exact_match=0.95,llm_judge=0.8`
- **Required:** no (default: none)

Sets a per-evaluator threshold that determines whether the evaluation run passes.

This is useful when the evaluations are run as an automated test.
If an evaluator's average score fails to meet its cutoff, the evaluations binary returns a nonzero status code.

- For evaluators with `optimize = "max"`, runs fail when `mean < cutoff`.
- For evaluators with `optimize = "min"`, runs fail when `mean > cutoff`.

The `cutoff` field in evaluator config is deprecated. Prefer using this CLI `--cutoffs` flag instead.
If both evaluator-config `cutoff` and CLI `--cutoffs` are provided for the same evaluator, the CLI value is used.
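The pass/fail rule above can be sketched in shell. This is illustrative only: the `mean` and `cutoff` values are hard-coded placeholders, not output from a real evaluations run.

```shell
# Illustrative sketch of the --cutoffs pass/fail rule for an
# evaluator with optimize = "max" (values are placeholders).
mean=0.92
cutoff=0.95

# For optimize = "max", the run fails when mean < cutoff.
if awk "BEGIN { exit !($mean < $cutoff) }"; then
  result="FAIL: mean $mean is below cutoff $cutoff"
else
  result="PASS"
fi
echo "$result"
```

For `optimize = "min"`, the comparison is flipped: the run fails when the mean exceeds the cutoff.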

#### `--config-file PATH`

- **Example:** `--config-file /path/to/tensorzero.toml`
@@ -188,12 +204,12 @@ The variant name should be present in your TensorZero configuration file.

The evaluations process exits with a status code of `0` if the evaluation was successful, and a status code of `1` if the evaluation failed.

If you configure a `cutoff` for any of your evaluators, the evaluation will fail if the average score for any evaluator is below its cutoff.
If you pass `--cutoffs`, the evaluation will fail if any evaluator violates its cutoff threshold.

<Tip>

The exit status code is helpful for integrating TensorZero Evaluations into your CI/CD pipeline.

You can define sanity checks for your variants with `cutoff` to detect performance regressions early before shipping to production.
You can define sanity checks for your variants with `--cutoffs` to detect performance regressions early before shipping to production.

</Tip>
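A CI step can gate directly on that exit code. In this sketch, `false` is a stand-in for a failing evaluations run (e.g. `docker compose run --rm evaluations ... --cutoffs exact_match=0.95`); substitute your real invocation.

```shell
# Sketch: gate a CI step on the evaluations exit code.
# `false` stands in for a failing evaluations run such as:
#   docker compose run --rm evaluations --evaluation-name my_eval --cutoffs exact_match=0.95
if false; then
  status="all cutoffs met"
else
  status="cutoff violated; failing the build"
fi
echo "$status"
```

In a real pipeline you would let the nonzero exit status propagate (or `exit 1` explicitly) so the CI job fails.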
22 changes: 4 additions & 18 deletions docs/evaluations/inference-evaluations/configuration-reference.mdx
@@ -88,14 +88,12 @@ type = "llm_judge"

<Accordion title='type: "exact_match"' defaultOpen="true">

###### `cutoff`
###### `cutoff` (deprecated)

- **Type:** float
- **Required:** no

Sets a user defined threshold at which the test is passing.
This can be useful for applications where the evaluations are run as an automated test.
If the average value of this evaluator is below the cutoff, the evaluations binary will return a nonzero status code.
Use the CLI [`--cutoffs`](/evaluations/inference-evaluations/cli-reference) flag to set evaluation pass/fail thresholds.

</Accordion>

@@ -181,24 +179,12 @@ optimize = "max"
# ...
```

###### `cutoff`
###### `cutoff` (deprecated)
- **Type:** float
- **Required:** no

Sets a user defined threshold at which the test is passing.
This may be useful for applications where the evaluations are run as an automated test.
If the average value of this evaluator is below the cutoff (when `optimize` is `max`) or above the cutoff (when `optimize` is `min`), the evaluations binary will return a nonzero status code.

```toml
// tensorzero.toml
[evaluations.email-guardrails.evaluators.check-signature]
# ...
type = "llm_judge"
optimize = "max" # Example: Maximize score
cutoff = 0.8 # Example: Consider passing if average score is >= 0.8
# ...
```
Use the CLI [`--cutoffs`](/evaluations/inference-evaluations/cli-reference) flag to set evaluation pass/fail thresholds.

###### `description`

9 changes: 4 additions & 5 deletions docs/evaluations/inference-evaluations/tutorial.mdx
@@ -138,7 +138,6 @@ Let's do that:
type = "llm_judge"
output_type = "boolean" # LLM judge should generate a boolean (or float)
optimize = "max" # higher is better
cutoff = 0.95 # if the variant scores <95% = bad

[evaluations.haiku_eval.evaluators.valid_haiku.variants.gpt_4o_mini_judge]
type = "chat_completion"
@@ -167,7 +166,6 @@ But unlike regular functions, only one variant can be active at a time during ev
type = "llm_judge"
output_type = "boolean"
optimize = "max"
cutoff = 0.95

[evaluations.haiku_eval.evaluators.valid_haiku.variants.gpt_4o_mini_judge]
type = "chat_completion"
@@ -194,7 +192,6 @@ Let's define another evaluator that counts the number of metaphors in our haiku.
type = "llm_judge"
output_type = "float" # LLM judge should generate a boolean (or float)
optimize = "max"
cutoff = 1 # <1 metaphor per haiku = bad
```

We can also use different variant types for evaluators.
@@ -250,7 +247,8 @@ You can run evaluations using the TensorZero Evaluations CLI tool or the TensorZ
<Tip>

The TensorZero Evaluations CLI tool can be helpful for CI/CD.
It'll exit with code 0 if all evaluations succeed (average score vs. `cutoff`), or code 1 otherwise.
It'll exit with code 0 if all evaluations succeed, or code 1 otherwise.
Use `--cutoffs` to enforce pass/fail thresholds from the CLI.

</Tip>

@@ -269,7 +267,8 @@ docker compose run --rm evaluations \
--evaluation-name haiku_eval \
--dataset-name haiku_dataset \
--variant-name gpt_4o \
--concurrency 5
--concurrency 5 \
--cutoffs valid_haiku=0.95,metaphor_count=1
```

<Accordion title="Docker Compose">