Compare commits
1 Commit
| Author | SHA1 | Date | |
|---|---|---|---|
| | f12ab167ea | | |
67
README.md
67
README.md
@@ -5,12 +5,13 @@ Modaic internal SDK for benchmarking judges and training confidence probes.
|
|||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
cd cli
|
||||||
uv sync
|
uv sync
|
||||||
```
|
```
|
||||||
|
|
||||||
## CLI Commands
|
## CLI Commands
|
||||||
|
|
||||||
All commands can be run via `uv run bench <command>` or using the shorthand `uv run <command>`.
|
All commands are run from the `cli` directory via `uv run mo <command>`.
|
||||||
|
|
||||||
### `create`
|
### `create`
|
||||||
|
|
||||||
@@ -25,15 +26,12 @@ Create benchmark datasets for training confidence probes. This command runs a ju
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Interactive mode (recommended) - prompts for configuration
|
# Interactive mode (recommended) - prompts for configuration
|
||||||
uv run create ppe
|
uv run mo create ppe
|
||||||
uv run create judge_bench
|
uv run mo create judge_bench
|
||||||
|
|
||||||
# With config file
|
# With config file
|
||||||
uv run create ppe --config config.yaml
|
uv run mo create ppe --config config.yaml
|
||||||
uv run create judge_bench --config config.yaml
|
uv run mo create judge_bench --config config.yaml
|
||||||
|
|
||||||
# Full command form
|
|
||||||
uv run bench create ppe --config config.yaml
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Options:**
|
**Options:**
|
||||||
@@ -70,16 +68,13 @@ Train a confidence probe on an embeddings dataset created with `create`.
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Interactive mode (recommended) - prompts for all configuration
|
# Interactive mode (recommended) - prompts for all configuration
|
||||||
uv run train
|
uv run mo train
|
||||||
|
|
||||||
# With config file
|
# With config file
|
||||||
uv run train --config config.yaml
|
uv run mo train --config config.yaml
|
||||||
|
|
||||||
# With CLI arguments
|
# With CLI arguments
|
||||||
uv run train --dataset tytodd/my-embeddings --epochs 10 --lr 0.0001
|
uv run mo train --dataset tytodd/my-embeddings --epochs 10 --lr 0.0001
|
||||||
|
|
||||||
# Full command form
|
|
||||||
uv run bench train --config config.yaml
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Options:**
|
**Options:**
|
||||||
@@ -131,16 +126,13 @@ Evaluate a trained confidence probe on a dataset. Computes calibration and discr
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Interactive mode (recommended) - prompts for probe and dataset
|
# Interactive mode (recommended) - prompts for probe and dataset
|
||||||
uv run eval
|
uv run mo eval
|
||||||
|
|
||||||
# With CLI arguments
|
# With CLI arguments
|
||||||
uv run eval --probe tytodd/my-probe --dataset tytodd/my-embeddings
|
uv run mo eval --probe tytodd/my-probe --dataset tytodd/my-embeddings
|
||||||
|
|
||||||
# Evaluate on train split instead of test
|
# Evaluate on train split instead of test
|
||||||
uv run eval --probe tytodd/my-probe --dataset tytodd/my-embeddings --split train
|
uv run mo eval --probe tytodd/my-probe --dataset tytodd/my-embeddings --split train
|
||||||
|
|
||||||
# Full command form
|
|
||||||
uv run bench eval --probe tytodd/my-probe --dataset tytodd/my-embeddings
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Options:**
|
**Options:**
|
||||||
@@ -188,15 +180,12 @@ Compile (optimize) a judge using GEPA over a dataset. GEPA iteratively improves
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Interactive mode
|
# Interactive mode
|
||||||
uv run compile
|
uv run mo compile
|
||||||
uv run compile ppe
|
uv run mo compile ppe
|
||||||
|
|
||||||
# With config file
|
# With config file
|
||||||
uv run compile --config config.yaml
|
uv run mo compile --config config.yaml
|
||||||
uv run compile ppe --config config.yaml
|
uv run mo compile ppe --config config.yaml
|
||||||
|
|
||||||
# Full command form
|
|
||||||
uv run bench compile --config config.yaml
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Options:**
|
**Options:**
|
||||||
@@ -235,7 +224,7 @@ seed: 42
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### `reembed`
|
### `embed`
|
||||||
|
|
||||||
Regenerate embeddings for an existing dataset using a different model or layer. Useful for experimenting with different embedding configurations without re-running the judge.
|
Regenerate embeddings for an existing dataset using a different model or layer. Useful for experimenting with different embedding configurations without re-running the judge.
|
||||||
|
|
||||||
@@ -243,13 +232,10 @@ Regenerate embeddings for an existing dataset using a different model or layer.
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Interactive mode
|
# Interactive mode
|
||||||
uv run reembed
|
uv run mo embed
|
||||||
|
|
||||||
# With CLI arguments
|
# With CLI arguments
|
||||||
uv run reembed --dataset tytodd/my-dataset --hf-model Qwen/Qwen3-VL-32B-Instruct --layer -1
|
uv run mo embed --dataset tytodd/my-dataset --hf-model Qwen/Qwen3-VL-32B-Instruct --layer -1
|
||||||
|
|
||||||
# Full command form
|
|
||||||
uv run bench reembed --dataset tytodd/my-dataset
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Options:**
|
**Options:**
|
||||||
@@ -272,7 +258,7 @@ uv run bench reembed --dataset tytodd/my-dataset
|
|||||||
```bash
|
```bash
|
||||||
# Original dataset was created with layer 32
|
# Original dataset was created with layer 32
|
||||||
# Now try middle layer instead
|
# Now try middle layer instead
|
||||||
uv run reembed \
|
uv run mo embed \
|
||||||
--dataset tytodd/my-embeddings \
|
--dataset tytodd/my-embeddings \
|
||||||
--hf-model Qwen/Qwen3-VL-32B-Instruct \
|
--hf-model Qwen/Qwen3-VL-32B-Instruct \
|
||||||
--layer -1
|
--layer -1
|
||||||
@@ -298,19 +284,19 @@ Use `-1` for the middle layer if experimenting with an unlisted model.
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# 1. Create a probe dataset from a benchmark
|
# 1. Create a probe dataset from a benchmark
|
||||||
uv run create ppe
|
uv run mo create ppe
|
||||||
|
|
||||||
# 2. Train a confidence probe
|
# 2. Train a confidence probe
|
||||||
uv run train --dataset tytodd/ppe-qwen3-embeddings
|
uv run mo train --dataset tytodd/ppe-qwen3-embeddings
|
||||||
|
|
||||||
# 3. Evaluate the probe on a test set
|
# 3. Evaluate the probe on a test set
|
||||||
uv run eval --probe tytodd/my-probe --dataset tytodd/ppe-qwen3-embeddings
|
uv run mo eval --probe tytodd/my-probe --dataset tytodd/ppe-qwen3-embeddings
|
||||||
|
|
||||||
# 4. (Optional) Compile/optimize a judge with GEPA
|
# 4. (Optional) Compile/optimize a judge with GEPA
|
||||||
uv run compile ppe
|
uv run mo compile ppe
|
||||||
|
|
||||||
# 5. (Optional) Re-embed with different layer
|
# 5. (Optional) Re-embed with different layer
|
||||||
uv run reembed --dataset tytodd/my-dataset --layer 32
|
uv run mo embed --dataset tytodd/my-dataset --layer 32
|
||||||
```
|
```
|
||||||
|
|
||||||
## Environment Variables
|
## Environment Variables
|
||||||
@@ -321,5 +307,6 @@ Create a `.env` file with:
|
|||||||
OPENAI_API_KEY=...
|
OPENAI_API_KEY=...
|
||||||
WANDB_API_KEY=...
|
WANDB_API_KEY=...
|
||||||
HF_TOKEN=...
|
HF_TOKEN=...
|
||||||
MODAIC_API_KEY=...
|
MODAIC_TOKEN=...
|
||||||
|
TOGETHER_API_KEY=...
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -34,10 +34,6 @@
|
|||||||
"label": {
|
"label": {
|
||||||
"__dspy_field_type": "output",
|
"__dspy_field_type": "output",
|
||||||
"desc": "Which response is better: 'A>B' or 'B>A'",
|
"desc": "Which response is better: 'A>B' or 'B>A'",
|
||||||
"enum": [
|
|
||||||
"A>B",
|
|
||||||
"B>A"
|
|
||||||
],
|
|
||||||
"prefix": "Label:",
|
"prefix": "Label:",
|
||||||
"title": "Label",
|
"title": "Label",
|
||||||
"type": "string"
|
"type": "string"
|
||||||
|
|||||||
1
probe.json
Normal file
1
probe.json
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"probe_version":"v1","embedding_dim":5120,"model_path":"Qwen/Qwen3-VL-32B-Instruct","dropout":0.0,"layer_index":16,"num_layers":65,"probe_type":"linear"}
|
||||||
BIN
probe.safetensors
Normal file
BIN
probe.safetensors
Normal file
Binary file not shown.
@@ -41,7 +41,7 @@
|
|||||||
"metadata": {
|
"metadata": {
|
||||||
"dependency_versions": {
|
"dependency_versions": {
|
||||||
"python": "3.11",
|
"python": "3.11",
|
||||||
"dspy": "3.1.0",
|
"dspy": "3.1.2",
|
||||||
"cloudpickle": "3.1"
|
"cloudpickle": "3.1"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user