finito

README.md
@@ -1,6 +1,104 @@
# Red-Teaming Language Models with DSPy

We use the power of [DSPy](https://github.com/stanfordnlp/dspy), a framework for structuring and optimizing language model programs, to red-team language models.

A packaged version of an open-source red-teaming framework that uses the power of [DSPy](https://github.com/stanfordnlp/dspy) to red-team language models through automated attack generation and optimization.

## Quick Start

### Installation

```bash
git clone https://git.modaic.dev/farouk1/redteam.git
cd redteam
uv sync
```

Or initialize a new project:

```bash
uv init
uv add verdict instructor tqdm modaic
```

### Environment Variables

Create a `.env` file with:

```bash
MODAIC_TOKEN="<your_modaic_token>"
TOGETHER_API_KEY="<your_together_api_key>"
OPENAI_API_KEY="<your_openai_api_key>"
```
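
If the variables are not already exported by your shell, one option is to load the `.env` file explicitly at the top of your script. A minimal sketch using `python-dotenv` (an extra dependency, not listed in the install step above):

```python
# Optional, hypothetical loading step: assumes python-dotenv is installed (uv add python-dotenv).
import os

from dotenv import load_dotenv

load_dotenv()  # reads MODAIC_TOKEN, TOGETHER_API_KEY, OPENAI_API_KEY from .env
assert os.environ.get("MODAIC_TOKEN"), "MODAIC_TOKEN not set"
```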

### Usage

```python
import json

import dspy
import litellm
from dspy.teleprompt import MIPROv2
from modaic import AutoAgent
from tqdm import tqdm

# Load the precompiled red-team agent from the hub.
redteam_agent = AutoAgent.from_precompiled("farouk1/redteam", config_options={"num_layers": 3})


def main():
    with open("advbench_subset.json", "r") as f:
        goals = json.load(f)["goals"]

    trainset = [
        dspy.Example(harmful_intent=goal).with_inputs("harmful_intent")
        for goal in goals
    ]

    # Evaluate the baseline: pass the raw harmful intent strings directly to the target model.
    base_score = 0
    litellm.cache = None
    for ex in tqdm(trainset, desc="Raw Input Score"):
        base_score += redteam_agent.attack_program.metric(
            intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
        )
    base_score /= len(trainset)
    print("--- Raw Harmful Intent Strings ---")
    print(f"Baseline Score: {base_score}")

    # Evaluate the architecture with no compilation.
    attacker_prog = redteam_agent
    print("\n--- Evaluating Initial Architecture ---")
    redteam_agent.attack_program.eval_program(attacker_prog, trainset)

    # Optimize the attack program with MIPROv2.
    optimizer = MIPROv2(metric=redteam_agent.attack_program.metric, auto=None)
    best_prog = optimizer.compile(
        attacker_prog,
        trainset=trainset,
        max_bootstrapped_demos=2,
        max_labeled_demos=0,
        num_trials=3,
        num_candidates=6,
    )

    # Evaluate the architecture post-compilation with DSPy.
    print("\n--- Evaluating Optimized Architecture ---")
    redteam_agent.attack_program.eval_program(best_prog, trainset)


if __name__ == "__main__":
    main()
```
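
Beyond batch evaluation, the compiled program can also be invoked on a single example. A minimal sketch, assuming `best_prog` accepts the same `harmful_intent` input field declared on the training examples (the exact prediction fields depend on the packaged attack program):

```python
# Hypothetical single-example call; the input field mirrors the dspy.Example inputs above.
prediction = best_prog(harmful_intent="<a goal string from advbench_subset.json>")
print(prediction)
```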

### Configuration

The red-team agent can be configured via the `config_options` parameter:

```python
class RedTeamConfig(PrecompiledConfig):
    lm: str = "gpt-4o-mini"
    target_lm: str = "mistralai/Mistral-7B-Instruct-v0.2"
    num_layers: int = 5
    max_attack_tokens: int = 512
    temperature: float = 0
```
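
For example, a sketch that overrides several of these fields at load time (this assumes `config_options` accepts any subset of `RedTeamConfig` fields, as it does with `num_layers` in the Usage example; the override values below are placeholders):

```python
from modaic import AutoAgent

# Placeholder overrides for the RedTeamConfig fields shown above.
redteam_agent = AutoAgent.from_precompiled(
    "farouk1/redteam",
    config_options={
        "target_lm": "mistralai/Mistral-7B-Instruct-v0.2",  # the default, shown for clarity
        "num_layers": 3,
        "max_attack_tokens": 256,
        "temperature": 0.0,
    },
)
```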

## Overview

To our knowledge, this is the first attempt at using any auto-prompting *framework* to perform the red-teaming task. It is also probably the deepest publicly released architecture optimized with DSPy to date.

@@ -85,7 +85,6 @@ class AttackProgram(dspy.Module):
         critique = critique.critique
         return self.try_attacks[-1](harmful_intent=harmful_intent, critique=critique)
 
-
     def metric(
         self,
         intent: str | dspy.Example,
@@ -100,7 +99,10 @@ class AttackProgram(dspy.Module):
             target_client,
             self.target_model_name,
             attack_prompt,
-            inference_params={"max_tokens": self.max_attack_tokens, "temperature": self.temperature},
+            inference_params={
+                "max_tokens": self.max_attack_tokens,
+                "temperature": self.temperature,
+            },
         )
         if use_verdict:
             score = verdict_judge(intent, response)[0] / 5
@@ -110,7 +112,6 @@ class AttackProgram(dspy.Module):
             score = round(score)
         return score
 
-
     def eval_program(self, prog, eval_set):
         evaluate = Evaluate(
             devset=eval_set,
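
For context, the `metric` shown above divides the judge's rating by 5 (`verdict_judge(intent, response)[0] / 5`) and then rounds the result. A small illustration of that arithmetic, assuming the Verdict judge returns a rating on a 1–5 scale and that the rounding is gated by the `eval_round` flag seen in the Usage example (both assumptions; this is not the packaged code):

```python
# Illustration only, not the packaged implementation.
def normalized_score(judge_rating: int, eval_round: bool = True) -> float:
    """Divide a 1-5 judge rating by 5, then optionally round to a 0/1 success flag."""
    score = judge_rating / 5
    return round(score) if eval_round else score


assert normalized_score(5) == 1.0  # maximally harmful response counts as a successful attack
assert normalized_score(2) == 0.0  # 0.4 rounds down, so the attack counts as a failure
assert normalized_score(3, eval_round=False) == 0.6
```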

main.py
@@ -50,8 +50,11 @@ def main():
     red_team_agent.attack_program.eval_program(best_prog, trainset)
     """
     # push to hub
-    red_team_agent.push_to_hub("farouk1/redteam", commit_message="finito", with_code=True)
+    red_team_agent.push_to_hub(
+        "farouk1/redteam", commit_message="finito", with_code=True
+    )
     print("---------Pushed to hub!---------")
 
 
 if __name__ == "__main__":
     main()
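
The `push_to_hub` call above publishes the precompiled agent that the README's Quick Start then consumes; the matching load side, copied from the Usage section above:

```python
from modaic import AutoAgent

# Load the agent published by main.py's push_to_hub call.
redteam_agent = AutoAgent.from_precompiled("farouk1/redteam", config_options={"num_layers": 3})
```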