Update README.md

This commit is contained in:
2026-01-11 00:44:31 +00:00
parent eef0c53100
commit 2f11676d42

View File

@@ -7,7 +7,7 @@ license: MIT
A packaged version of an open source red-teaming framework that uses the power of [DSPy](https://github.com/stanfordnlp/dspy) to red-team language models through automated attack generation and optimization. A packaged version of an open source red-teaming framework that uses the power of [DSPy](https://github.com/stanfordnlp/dspy) to red-team language models through automated attack generation and optimization.
## Quick Start ## Quick Start
Run this agent within a new project: Run this program within a new project:
```bash ```bash
uv init uv init
@@ -31,9 +31,9 @@ import json
import dspy import dspy
from tqdm import tqdm from tqdm import tqdm
from dspy.teleprompt import MIPROv2 from dspy.teleprompt import MIPROv2
from modaic import AutoAgent from modaic import AutoProgram
redteam_agent = AutoAgent.from_precompiled("farouk1/redteam", config_options={"num_layers": 3}) redteam = AutoProgram.from_precompiled("farouk1/redteam", config_options={"num_layers": 3})
def main(): def main():
with open("advbench_subset.json", "r") as f: with open("advbench_subset.json", "r") as f:
@@ -50,7 +50,7 @@ def main():
litellm.cache = None litellm.cache = None
for ex in tqdm(trainset, desc="Raw Input Score"): for ex in tqdm(trainset, desc="Raw Input Score"):
base_score += redteam_agent.attack_program.metric( base_score += redteam.attack_program.metric(
intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
) )
base_score /= len(trainset) base_score /= len(trainset)
@@ -58,11 +58,11 @@ def main():
print(f"Baseline Score: {base_score}") print(f"Baseline Score: {base_score}")
# evaluating architecture with no compilation # evaluating architecture with no compilation
attacker_prog = redteam_agent attacker_prog = redteam.attack_program
print(f"\n--- Evaluating Initial Architecture ---") print(f"\n--- Evaluating Initial Architecture ---")
redteam_agent.attack_program.eval_program(attacker_prog, trainset) redteam.attack_program.eval_program(attacker_prog, trainset)
optimizer = MIPROv2(metric=redteam_agent.attack_program.metric, auto=None) optimizer = MIPROv2(metric=redteam.attack_program.metric, auto=None)
best_prog = optimizer.compile( best_prog = optimizer.compile(
attacker_prog, attacker_prog,
trainset=trainset, trainset=trainset,
@@ -74,7 +74,7 @@ def main():
# evaluating architecture DSPy post-compilation # evaluating architecture DSPy post-compilation
print(f"\n--- Evaluating Optimized Architecture ---") print(f"\n--- Evaluating Optimized Architecture ---")
redteam_agent.attack_program.eval_program(best_prog, trainset) redteam.attack_program.eval_program(best_prog, trainset)
if __name__ == "__main__": if __name__ == "__main__":
main() main()
@@ -82,7 +82,7 @@ if __name__ == "__main__":
### Configuration ### Configuration
The red-team agent can be configured via the `config_options` parameter in `AutoAgent.from_precompiled`: The red-team program can be configured via the `config_options` parameter in `AutoProgram.from_precompiled`:
```python ```python
class RedTeamConfig(PrecompiledConfig): class RedTeamConfig(PrecompiledConfig):