Update README.md
This commit is contained in:
18
README.md
18
README.md
@@ -7,7 +7,7 @@ license: MIT
|
||||
A packaged version of an open-source red-teaming framework that uses the power of [DSPy](https://github.com/stanfordnlp/dspy) to red-team language models through automated attack generation and optimization.
|
||||
|
||||
## Quick Start
|
||||
Run this agent within a new project:
|
||||
Run this program within a new project:
|
||||
|
||||
```bash
|
||||
uv init
|
||||
@@ -31,9 +31,9 @@ import json
|
||||
import dspy
|
||||
from tqdm import tqdm
|
||||
from dspy.teleprompt import MIPROv2
|
||||
from modaic import AutoAgent
|
||||
from modaic import AutoProgram
|
||||
|
||||
redteam_agent = AutoAgent.from_precompiled("farouk1/redteam", config_options={"num_layers": 3})
|
||||
redteam = AutoProgram.from_precompiled("farouk1/redteam", config_options={"num_layers": 3})
|
||||
|
||||
def main():
|
||||
with open("advbench_subset.json", "r") as f:
|
||||
@@ -50,7 +50,7 @@ def main():
|
||||
|
||||
litellm.cache = None
|
||||
for ex in tqdm(trainset, desc="Raw Input Score"):
|
||||
base_score += redteam_agent.attack_program.metric(
|
||||
base_score += redteam.attack_program.metric(
|
||||
intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
|
||||
)
|
||||
base_score /= len(trainset)
|
||||
@@ -58,11 +58,11 @@ def main():
|
||||
print(f"Baseline Score: {base_score}")
|
||||
|
||||
# evaluating architecture with no compilation
|
||||
attacker_prog = redteam_agent
|
||||
attacker_prog = redteam.attack_program
|
||||
print(f"\n--- Evaluating Initial Architecture ---")
|
||||
redteam_agent.attack_program.eval_program(attacker_prog, trainset)
|
||||
redteam.attack_program.eval_program(attacker_prog, trainset)
|
||||
|
||||
optimizer = MIPROv2(metric=redteam_agent.attack_program.metric, auto=None)
|
||||
optimizer = MIPROv2(metric=redteam.attack_program.metric, auto=None)
|
||||
best_prog = optimizer.compile(
|
||||
attacker_prog,
|
||||
trainset=trainset,
|
||||
@@ -74,7 +74,7 @@ def main():
|
||||
|
||||
# evaluating architecture DSPy post-compilation
|
||||
print(f"\n--- Evaluating Optimized Architecture ---")
|
||||
redteam_agent.attack_program.eval_program(best_prog, trainset)
|
||||
redteam.attack_program.eval_program(best_prog, trainset)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -82,7 +82,7 @@ if __name__ == "__main__":
|
||||
|
||||
### Configuration
|
||||
|
||||
The red-team agent can be configured via the `config_options` parameter in `AutoAgent.from_precompiled`:
|
||||
The red-team program can be configured via the `config_options` parameter in `AutoProgram.from_precompiled`:
|
||||
|
||||
```python
|
||||
class RedTeamConfig(PrecompiledConfig):
|
||||
|
||||
Reference in New Issue
Block a user