diff --git a/README.md b/README.md
index 22dc50c..77c405b 100644
--- a/README.md
+++ b/README.md
@@ -5,12 +5,13 @@ Modaic internal SDK for benchmarking judges and training confidence probes.
 ## Installation
 
 ```bash
+cd cli
 uv sync
 ```
 
 ## CLI Commands
 
-All commands can be run via `uv run bench <command>` or using the shorthand `uv run <command>`.
+All commands are run from the `cli` directory via `uv run mo <command>`.
 
 ### `create`
 
@@ -25,15 +26,12 @@ Create benchmark datasets for training confidence probes. This command runs a ju
 
 ```bash
 # Interactive mode (recommended) - prompts for configuration
-uv run create ppe
-uv run create judge_bench
+uv run mo create ppe
+uv run mo create judge_bench
 
 # With config file
-uv run create ppe --config config.yaml
-uv run create judge_bench --config config.yaml
-
-# Full command form
-uv run bench create ppe --config config.yaml
+uv run mo create ppe --config config.yaml
+uv run mo create judge_bench --config config.yaml
 ```
 
 **Options:**
@@ -70,16 +68,13 @@ Train a confidence probe on an embeddings dataset created with `create`.
 
 ```bash
 # Interactive mode (recommended) - prompts for all configuration
-uv run train
+uv run mo train
 
 # With config file
-uv run train --config config.yaml
+uv run mo train --config config.yaml
 
 # With CLI arguments
-uv run train --dataset tytodd/my-embeddings --epochs 10 --lr 0.0001
-
-# Full command form
-uv run bench train --config config.yaml
+uv run mo train --dataset tytodd/my-embeddings --epochs 10 --lr 0.0001
 ```
 
 **Options:**
@@ -131,16 +126,13 @@ Evaluate a trained confidence probe on a dataset. Computes calibration and discr
 
 ```bash
 # Interactive mode (recommended) - prompts for probe and dataset
-uv run eval
+uv run mo eval
 
 # With CLI arguments
-uv run eval --probe tytodd/my-probe --dataset tytodd/my-embeddings
+uv run mo eval --probe tytodd/my-probe --dataset tytodd/my-embeddings
 
-# Evaluate on train split instead of test
-uv run eval --probe tytodd/my-probe --dataset tytodd/my-embeddings --split train
-
-# Full command form
-uv run bench eval --probe tytodd/my-probe --dataset tytodd/my-embeddings
+# Evaluate on the train split instead of the test split
+uv run mo eval --probe tytodd/my-probe --dataset tytodd/my-embeddings --split train
 ```
 
 **Options:**
@@ -188,15 +180,12 @@ Compile (optimize) a judge using GEPA over a dataset. GEPA iteratively improves
 
 ```bash
 # Interactive mode
-uv run compile
-uv run compile ppe
+uv run mo compile
+uv run mo compile ppe
 
 # With config file
-uv run compile --config config.yaml
-uv run compile ppe --config config.yaml
-
-# Full command form
-uv run bench compile --config config.yaml
+uv run mo compile --config config.yaml
+uv run mo compile ppe --config config.yaml
 ```
 
 **Options:**
@@ -235,7 +224,7 @@ seed: 42
 
 ---
 
-### `reembed`
+### `embed`
 
 Regenerate embeddings for an existing dataset using a different model or layer. Useful for experimenting with different embedding configurations without re-running the judge.
 
@@ -243,13 +232,10 @@ Regenerate embeddings for an existing dataset using a different model or layer.
 
 ```bash
 # Interactive mode
-uv run reembed
+uv run mo embed
 
 # With CLI arguments
-uv run reembed --dataset tytodd/my-dataset --hf-model Qwen/Qwen3-VL-32B-Instruct --layer -1
-
-# Full command form
-uv run bench reembed --dataset tytodd/my-dataset
+uv run mo embed --dataset tytodd/my-dataset --hf-model Qwen/Qwen3-VL-32B-Instruct --layer -1
 ```
 
 **Options:**
@@ -272,7 +258,7 @@ uv run bench reembed --dataset tytodd/my-dataset
 ```bash
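-# Original dataset was created with layer 32
-# Now try middle layer instead
-uv run reembed \
+# The original dataset was created with layer 32;
+# now try the middle layer (--layer -1) instead
+uv run mo embed \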
   --dataset tytodd/my-embeddings \
   --hf-model Qwen/Qwen3-VL-32B-Instruct \
   --layer -1
@@ -298,19 +284,19 @@ Use `-1` for the middle layer if experimenting with an unlisted model.
 
 ```bash
 # 1. Create a probe dataset from a benchmark
-uv run create ppe
+uv run mo create ppe
 
 # 2. Train a confidence probe
-uv run train --dataset tytodd/ppe-qwen3-embeddings
+uv run mo train --dataset tytodd/ppe-qwen3-embeddings
 
 # 3. Evaluate the probe on a test set
-uv run eval --probe tytodd/my-probe --dataset tytodd/ppe-qwen3-embeddings
+uv run mo eval --probe tytodd/my-probe --dataset tytodd/ppe-qwen3-embeddings
 
 # 4. (Optional) Compile/optimize a judge with GEPA
-uv run compile ppe
+uv run mo compile ppe
 
-# 5. (Optional) Re-embed with different layer
-uv run reembed --dataset tytodd/my-dataset --layer 32
+# 5. (Optional) Re-embed with a different layer
+uv run mo embed --dataset tytodd/my-dataset --layer 32
 ```
 
 ## Environment Variables
@@ -321,5 +307,6 @@ Create a `.env` file with:
 OPENAI_API_KEY=...
 WANDB_API_KEY=...
 HF_TOKEN=...
-MODAIC_API_KEY=...
+MODAIC_TOKEN=...
+TOGETHER_API_KEY=...
 ```
diff --git a/config.json b/config.json
index 63fe6fc..0e72b46 100644
--- a/config.json
+++ b/config.json
@@ -34,10 +34,6 @@
       "label": {
         "__dspy_field_type": "output",
         "desc": "Which response is better: 'A>B' or 'B>A'",
-        "enum": [
-          "A>B",
-          "B>A"
-        ],
         "prefix": "Label:",
         "title": "Label",
         "type": "string"
diff --git a/probe.json b/probe.json
new file mode 100644
index 0000000..6de82a1
--- /dev/null
+++ b/probe.json
@@ -0,0 +1 @@
+{"probe_version":"v1","embedding_dim":5120,"model_path":"Qwen/Qwen3-VL-32B-Instruct","dropout":0.0,"layer_index":16,"num_layers":65,"probe_type":"linear"}
\ No newline at end of file
diff --git a/probe.safetensors b/probe.safetensors
new file mode 100644
index 0000000..c739465
Binary files /dev/null and b/probe.safetensors differ
diff --git a/program.json b/program.json
index 2846878..c76d76c 100644
--- a/program.json
+++ b/program.json
@@ -41,7 +41,7 @@
   "metadata": {
     "dependency_versions": {
       "python": "3.11",
-      "dspy": "3.1.0",
+      "dspy": "3.1.2",
       "cloudpickle": "3.1"
     }
   }