From dd3a916eef0b08c2c1d8e694297987569c4348b6 Mon Sep 17 00:00:00 2001
From: Tyrin Todd <tyrin@modaic.dev>
Date: Tue, 10 Feb 2026 11:57:19 -0800
Subject: [PATCH] (no commit message)

---
 README.md    | 312 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 config.json  |  69 ++++++++++++
 program.json |  40 +++++++
 3 files changed, 420 insertions(+), 1 deletion(-)
 create mode 100644 config.json
 create mode 100644 program.json
diff --git a/README.md b/README.md
index 134b8db..77c405b 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,312 @@
-# emotion
+# Bench
 
+Modaic internal SDK for benchmarking judges and training confidence probes.
+
+## Installation
+
+```bash
+cd cli
+uv sync
+```
+
+## CLI Commands
+
+All commands are run from the `cli` directory via `uv run mo <command>`.
+
+### `create`
+
+Create benchmark datasets for training confidence probes. This command runs a judge on examples, extracts embeddings via Modal, and pushes the resulting dataset to HuggingFace Hub.
+
+**Subcommands:**
+
+- `create ppe` - Create dataset from PPE (human-preference + correctness) benchmarks
+- `create judge_bench` - Create dataset from the JudgeBench benchmark
+
+**Usage:**
+
+```bash
+# Interactive mode (recommended) - prompts for configuration
+uv run mo create ppe
+uv run mo create judge_bench
+
+# With config file
+uv run mo create ppe --config config.yaml
+uv run mo create judge_bench --config config.yaml
+```
+
+**Options:**
+
+| Option     | Short | Description                |
+| ---------- | ----- | -------------------------- |
+| `--config` | `-c`  | Path to config file (YAML) |
+
+**Config File Example:**
+
+```yaml
+judge: tyrin/ppe-judge-gepa
+output: tytodd/my-probe-dataset
+n_train: 500
+n_test: 100
+embedding_layer: -1  # -1 for middle layer
+```
+
+**What it does:**
+
+1. Loads examples from the benchmark dataset
+2. Runs the specified judge on each example to get predictions
+3. Extracts embeddings from the judge's LLM via Modal (GPU)
+4. Creates a HuggingFace dataset with columns: `question`, `response_a`, `response_b`, `label`, `predicted`, `messages`, `embeddings`
+5. Pushes to HuggingFace Hub
+
+---
+
+### `train`
+
+Train a confidence probe on an embeddings dataset created with `create`.
+
+**Usage:**
+
+```bash
+# Interactive mode (recommended) - prompts for all configuration
+uv run mo train
+
+# With config file
+uv run mo train --config config.yaml
+
+# With CLI arguments
+uv run mo train --dataset tytodd/my-embeddings --epochs 10 --lr 0.0001
+```
+
+**Options:**
+
+| Option           | Short | Description                                                                       | Default           |
+| ---------------- | ----- | --------------------------------------------------------------------------------- | ----------------- |
+| `--config`       | `-c`  | Path to config file (YAML)                                                        | -                 |
+| `--dataset`      | `-d`  | Dataset path (HuggingFace Hub or local); must be a dataset created with `create`  | -                 |
+| `--model-path`   | `-m`  | Output path for trained model                                                     | `{dataset}_probe` |
+| `--batch-size`   |       | Batch size                                                                        | 4                 |
+| `--epochs`       |       | Number of training epochs                                                         | 10                |
+| `--lr`           |       | Learning rate                                                                     | 0.0001            |
+| `--weight-decay` |       | Weight decay                                                                      | 0.01              |
+| `--test-size`    |       | Validation split ratio (if no test split)                                         | 0.2               |
+| `--seed`         |       | Random seed                                                                       | 42                |
+| `--project`      |       | W&B project name                                                                  | model_path        |
+| `--hub-path`     |       | HuggingFace Hub path to push model                                                | -                 |
+
+**Config File Example:**
+
+```yaml
+dataset_path: tytodd/my-probe-dataset
+model_path: ./best_probe
+hub_path: tytodd/my-probe  # Optional: push to HF Hub
+batch_size: 4
+epochs: 10
+learning_rate: 0.0001
+weight_decay: 0.01
+test_size: 0.2
+seed: 42
+```
+
+**What it does:**
+
+1. Loads an embeddings dataset (from HuggingFace Hub or local)
+2. Creates binary labels: 1 if `predicted == label`, 0 otherwise
+3. Trains a linear probe using MSE loss (Brier score optimization)
+4. Logs metrics to Weights & Biases (Brier, ECE, MCE, Kuiper, AUROC)
+5. Saves the best model based on validation Brier score
+6. Optionally pushes to HuggingFace Hub
+
+---
+
+### `eval`
+
+Evaluate a trained confidence probe on a dataset. Computes calibration and discrimination metrics.
+
+**Usage:**
+
+```bash
+# Interactive mode (recommended) - prompts for probe and dataset
+uv run mo eval
+
+# With CLI arguments
+uv run mo eval --probe tytodd/my-probe --dataset tytodd/my-embeddings
+
+# Evaluate on train split instead of test
+uv run mo eval --probe tytodd/my-probe --dataset tytodd/my-embeddings --split train
+```
+
+**Options:**
+
+| Option                       | Short | Description                              | Default      |
+| ---------------------------- | ----- | ---------------------------------------- | ------------ |
+| `--probe`                    | `-p`  | Probe path (HuggingFace Hub or local)    | -            |
+| `--dataset`                  | `-d`  | Dataset path (HuggingFace Hub or local)  | -            |
+| `--split`                    | `-s`  | Dataset split to evaluate on             | test         |
+| `--batch-size`               | `-b`  | Batch size for evaluation                | 64           |
+| `--normalize/--no-normalize` | `-n`  | Normalize embeddings with StandardScaler | probe config |
+
+**Metrics computed:**
+
+| Metric      | Description                                       |
+| ----------- | ------------------------------------------------- |
+| Brier Score | Mean squared error between predictions and labels |
+| Accuracy    | Classification accuracy at 0.5 threshold          |
+| F1 Score    | Harmonic mean of precision and recall             |
+| ECE         | Expected Calibration Error (10 bins)              |
+| MCE         | Maximum Calibration Error                         |
+| Kuiper      | Kuiper statistic for calibration                  |
+| AUROC       | Area Under the ROC Curve (discrimination)         |
+
+**What it does:**
+
+1. Loads a pretrained probe from HuggingFace Hub or local path
+2. Loads a dataset created with `create`
+3. Creates binary labels: 1 if `predicted == label`, 0 otherwise
+4. Runs inference and computes calibration/discrimination metrics
+5. Displays results in a formatted table
+
+---
+
+### `compile`
+
+Compile (optimize) a judge using GEPA over a dataset. GEPA iteratively improves the judge's prompt based on training examples.
+
+**Subcommands:**
+
+- `compile` (base) - Compile with custom dataset and parameter mapping
+- `compile ppe` - Compile specifically for PPE datasets (human-preference + correctness)
+
+**Usage:**
+
+```bash
+# Interactive mode
+uv run mo compile
+uv run mo compile ppe
+
+# With config file
+uv run mo compile --config config.yaml
+uv run mo compile ppe --config config.yaml
+```
+
+**Options:**
+
+| Option     | Short | Description                |
+| ---------- | ----- | -------------------------- |
+| `--config` | `-c`  | Path to config file (YAML) |
+
+**Config File Example:**
+
+```yaml
+judge: tyrin/ppe-judge
+dataset: tytodd/ppe-human-preference
+inputs: # selects which input columns of the dataset to use (not necessary if using a compile subcommand like ppe or judge_bench)
+  - name: question
+  - name: response_a
+    column: response_A  # Map param name to dataset column
+  - name: response_b
+    column: response_B
+label_column: label
+n_train: 100
+n_val: 50
+base_model: gpt-4o-mini
+reflection_model: gpt-4o
+output: tyrin/ppe-judge-gepa
+seed: 42
+```
+
+**What it does:**
+
+1. Loads a judge from Modaic Hub
+2. Loads training/validation examples from a HuggingFace dataset
+3. Maps judge parameters to dataset columns
+4. Runs GEPA optimization to improve the judge's prompt
+5. Pushes the optimized judge to Modaic Hub
+
+---
+
+### `embed`
+
+Regenerate embeddings for an existing dataset using a different model or layer. Useful for experimenting with different embedding configurations without re-running the judge.
+
+**Usage:**
+
+```bash
+# Interactive mode
+uv run mo embed
+
+# With CLI arguments
+uv run mo embed --dataset tytodd/my-dataset --hf-model Qwen/Qwen3-VL-32B-Instruct --layer -1
+```
+
+**Options:**
+
+| Option       | Short | Description                              |
+| ------------ | ----- | ---------------------------------------- |
+| `--dataset`  | `-d`  | Dataset path (HuggingFace Hub or local)  |
+| `--hf-model` | `-m`  | HuggingFace model path for embeddings    |
+| `--layer`    | `-l`  | Hidden layer index (-1 for middle layer) |
+
+**What it does:**
+
+1. Loads an existing dataset (must have a `messages` column)
+2. Regenerates embeddings using the specified model/layer via Modal
+3. Replaces the `embeddings` column in the dataset
+4. Prompts to push the updated dataset to HuggingFace Hub
+
+**Example workflow:**
+
+```bash
+# Original dataset was created with layer 32
+# Now try middle layer instead
+uv run mo embed \
+  --dataset tytodd/my-embeddings \
+  --hf-model Qwen/Qwen3-VL-32B-Instruct \
+  --layer -1
+```
+
+---
+
+## Recommended Embedding Layers
+
+When extracting embeddings, use these recommended layer indices for best probe performance:
+
+| Model         | HuggingFace Path                    | Recommended Layer |
+| ------------- | ----------------------------------- | ----------------- |
+| GPT-OSS 20B   | `openai/gpt-oss-20b`                | 8                 |
+| Qwen3-VL 32B  | `Qwen/Qwen3-VL-32B-Instruct`        | 16                |
+| Llama 3.3 70B | `meta-llama/Llama-3.3-70B-Instruct` | 32                |
+
+Use `-1` for the middle layer if experimenting with an unlisted model.
+
+---
+
+## Typical Workflow
+
+```bash
+# 1. Create a probe dataset from a benchmark
+uv run mo create ppe
+
+# 2. Train a confidence probe
+uv run mo train --dataset tytodd/ppe-qwen3-embeddings
+
+# 3. Evaluate the probe on a test set
+uv run mo eval --probe tytodd/my-probe --dataset tytodd/ppe-qwen3-embeddings
+
+# 4. (Optional) Compile/optimize a judge with GEPA
+uv run mo compile ppe
+
+# 5. (Optional) Re-embed with different layer
+uv run mo embed --dataset tytodd/my-dataset --layer 32
+```
+
+## Environment Variables
+
+Create a `.env` file with:
+
+```bash
+OPENAI_API_KEY=...
+WANDB_API_KEY=...
+HF_TOKEN=...
+MODAIC_TOKEN=...
+TOGETHER_API_KEY=...
+```
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..6c37c8c
--- /dev/null
+++ b/config.json
@@ -0,0 +1,69 @@
+{
+  "model": null,
+  "signature": {
+    "description": "Classify the emotions expressed in the given text. Multiple emotions can be present simultaneously.\n\nFirst, reason through the text carefully, identifying any emotional cues, tone,\nand context. Consider both explicit and implicit emotional expressions.\n\nThen output the detected emotions as a list from these options:\nadmiration, amusement, anger, annoyance, approval, caring, confusion, curiosity,\ndesire, disappointment, disapproval, disgust, embarrassment, excitement, fear,\ngratitude, grief, joy, love, nervousness, optimism, pride, realization, relief,\nremorse, sadness, surprise, neutral\n\nIf no clear emotion is detected, output [\"neutral\"].",
+    "properties": {
+      "text": {
+        "__dspy_field_type": "input",
+        "desc": "The text to classify for emotions",
+        "prefix": "Text:",
+        "title": "Text",
+        "type": "string"
+      },
+      "reasoning": {
+        "__dspy_field_type": "output",
+        "desc": "Step-by-step analysis of emotional cues in the text",
+        "prefix": "Reasoning:",
+        "title": "Reasoning",
+        "type": "string"
+      },
+      "label": {
+        "__dspy_field_type": "output",
+        "desc": "List of detected emotions",
+        "items": {
+          "enum": [
+            "admiration",
+            "amusement",
+            "anger",
+            "annoyance",
+            "approval",
+            "caring",
+            "confusion",
+            "curiosity",
+            "desire",
+            "disappointment",
+            "disapproval",
+            "disgust",
+            "embarrassment",
+            "excitement",
+            "fear",
+            "gratitude",
+            "grief",
+            "joy",
+            "love",
+            "nervousness",
+            "optimism",
+            "pride",
+            "realization",
+            "relief",
+            "remorse",
+            "sadness",
+            "surprise",
+            "neutral"
+          ],
+          "type": "string"
+        },
+        "prefix": "Label:",
+        "title": "Label",
+        "type": "array"
+      }
+    },
+    "required": [
+      "text",
+      "reasoning",
+      "label"
+    ],
+    "title": "EmotionSig",
+    "type": "object"
+  }
+}
\ No newline at end of file
diff --git a/program.json b/program.json
new file mode 100644
index 0000000..03b51c9
--- /dev/null
+++ b/program.json
@@ -0,0 +1,40 @@
+{
+  "traces": [],
+  "train": [],
+  "demos": [],
+  "signature": {
+    "instructions": "Classify the emotions expressed in the given text. Multiple emotions can be present simultaneously.\n\nFirst, reason through the text carefully, identifying any emotional cues, tone,\nand context. Consider both explicit and implicit emotional expressions.\n\nThen output the detected emotions as a list from these options:\nadmiration, amusement, anger, annoyance, approval, caring, confusion, curiosity,\ndesire, disappointment, disapproval, disgust, embarrassment, excitement, fear,\ngratitude, grief, joy, love, nervousness, optimism, pride, realization, relief,\nremorse, sadness, surprise, neutral\n\nIf no clear emotion is detected, output [\"neutral\"].",
+    "fields": [
+      {
+        "prefix": "Text:",
+        "description": "The text to classify for emotions"
+      },
+      {
+        "prefix": "Reasoning:",
+        "description": "Step-by-step analysis of emotional cues in the text"
+      },
+      {
+        "prefix": "Label:",
+        "description": "List of detected emotions"
+      }
+    ]
+  },
+  "lm": {
+    "model": "together_ai/Qwen/Qwen3-VL-32B-Instruct",
+    "model_type": "chat",
+    "cache": true,
+    "num_retries": 3,
+    "finetuning_model": null,
+    "launch_kwargs": {},
+    "train_kwargs": {},
+    "temperature": null,
+    "max_tokens": null
+  },
+  "metadata": {
+    "dependency_versions": {
+      "python": "3.11",
+      "dspy": "3.1.2",
+      "cloudpickle": "3.1"
+    }
+  }
+}
\ No newline at end of file