Update README.md
This commit is contained in:
18
README.md
18
README.md
@@ -7,7 +7,7 @@ license: MIT
|
|||||||
A packaged version of an open source red-teaming framework that uses the power of [DSPy](https://github.com/stanfordnlp/dspy) to red-team language models through automated attack generation and optimization.
|
A packaged version of an open source red-teaming framework that uses the power of [DSPy](https://github.com/stanfordnlp/dspy) to red-team language models through automated attack generation and optimization.
|
||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
Run this agent within a new project:
|
Run this program within a new project:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
uv init
|
uv init
|
||||||
@@ -31,9 +31,9 @@ import json
|
|||||||
import dspy
|
import dspy
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from dspy.teleprompt import MIPROv2
|
from dspy.teleprompt import MIPROv2
|
||||||
from modaic import AutoAgent
|
from modaic import AutoProgram
|
||||||
|
|
||||||
redteam_agent = AutoAgent.from_precompiled("farouk1/redteam", config_options={"num_layers": 3})
|
redteam = AutoProgram.from_precompiled("farouk1/redteam", config_options={"num_layers": 3})
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
with open("advbench_subset.json", "r") as f:
|
with open("advbench_subset.json", "r") as f:
|
||||||
@@ -50,7 +50,7 @@ def main():
|
|||||||
|
|
||||||
litellm.cache = None
|
litellm.cache = None
|
||||||
for ex in tqdm(trainset, desc="Raw Input Score"):
|
for ex in tqdm(trainset, desc="Raw Input Score"):
|
||||||
base_score += redteam_agent.attack_program.metric(
|
base_score += redteam.attack_program.metric(
|
||||||
intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
|
intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
|
||||||
)
|
)
|
||||||
base_score /= len(trainset)
|
base_score /= len(trainset)
|
||||||
@@ -58,11 +58,11 @@ def main():
|
|||||||
print(f"Baseline Score: {base_score}")
|
print(f"Baseline Score: {base_score}")
|
||||||
|
|
||||||
# evaluating architecture with no compilation
|
# evaluating architecture with no compilation
|
||||||
attacker_prog = redteam_agent
|
attacker_prog = redteam.attack_program
|
||||||
print(f"\n--- Evaluating Initial Architecture ---")
|
print(f"\n--- Evaluating Initial Architecture ---")
|
||||||
redteam_agent.attack_program.eval_program(attacker_prog, trainset)
|
redteam.attack_program.eval_program(attacker_prog, trainset)
|
||||||
|
|
||||||
optimizer = MIPROv2(metric=redteam_agent.attack_program.metric, auto=None)
|
optimizer = MIPROv2(metric=redteam.attack_program.metric, auto=None)
|
||||||
best_prog = optimizer.compile(
|
best_prog = optimizer.compile(
|
||||||
attacker_prog,
|
attacker_prog,
|
||||||
trainset=trainset,
|
trainset=trainset,
|
||||||
@@ -74,7 +74,7 @@ def main():
|
|||||||
|
|
||||||
# evaluating architecture DSPy post-compilation
|
# evaluating architecture DSPy post-compilation
|
||||||
print(f"\n--- Evaluating Optimized Architecture ---")
|
print(f"\n--- Evaluating Optimized Architecture ---")
|
||||||
redteam_agent.attack_program.eval_program(best_prog, trainset)
|
redteam.attack_program.eval_program(best_prog, trainset)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
@@ -82,7 +82,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
### Configuration
|
### Configuration
|
||||||
|
|
||||||
The red-team agent can be configured via the `config_options` parameter in `AutoAgent.from_precompiled`:
|
The red-team program can be configured via the `config_options` parameter in `AutoProgram.from_precompiled`:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
class RedTeamConfig(PrecompiledConfig):
|
class RedTeamConfig(PrecompiledConfig):
|
||||||
|
|||||||
Reference in New Issue
Block a user