finito
This commit is contained in:
100
README.md
100
README.md
@@ -1,6 +1,104 @@
|
|||||||
# Red-Teaming Language Models with DSPy
|
# Red-Teaming Language Models with DSPy
|
||||||
|
|
||||||
We use the power of [DSPy](https://github.com/stanfordnlp/dspy), a framework for structuring and optimizing language model programs, to red-team language models.
|
A packaged version of an open-source red-teaming framework that uses the power of [DSPy](https://github.com/stanfordnlp/dspy) to red-team language models through automated attack generation and optimization.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://git.modaic.dev/farouk1/redteam.git
|
||||||
|
cd redteam
|
||||||
|
uv sync
|
||||||
|
```
|
||||||
|
|
||||||
|
Or initialize a new project:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv init
|
||||||
|
uv add verdict instructor tqdm modaic
|
||||||
|
```
|
||||||
|
|
||||||
|
### Environment Variables
|
||||||
|
|
||||||
|
Create a `.env` file with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
MODAIC_TOKEN="<your_modaic_token>"
|
||||||
|
TOGETHER_API_KEY="<your_together_api_key>"
|
||||||
|
OPENAI_API_KEY="<your_openai_api_key>"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
import json
|
||||||
|
import dspy
|
||||||
|
from tqdm import tqdm
|
||||||
|
from dspy.teleprompt import MIPROv2
|
||||||
|
from modaic import AutoAgent
|
||||||
|
|
||||||
|
redteam_agent = AutoAgent.from_precompiled("farouk1/redteam", config_options={"num_layers": 3})
|
||||||
|
|
||||||
|
def main():
|
||||||
|
with open("advbench_subset.json", "r") as f:
|
||||||
|
goals = json.load(f)["goals"]
|
||||||
|
|
||||||
|
trainset = [
|
||||||
|
dspy.Example(harmful_intent=goal).with_inputs("harmful_intent")
|
||||||
|
for goal in goals
|
||||||
|
]
|
||||||
|
|
||||||
|
# evaluate baseline: directly passing in harmful intent strings
|
||||||
|
base_score = 0
|
||||||
|
import litellm
|
||||||
|
|
||||||
|
litellm.cache = None
|
||||||
|
for ex in tqdm(trainset, desc="Raw Input Score"):
|
||||||
|
base_score += redteam_agent.attack_program.metric(
|
||||||
|
intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
|
||||||
|
)
|
||||||
|
base_score /= len(trainset)
|
||||||
|
print(f"--- Raw Harmful Intent Strings ---")
|
||||||
|
print(f"Baseline Score: {base_score}")
|
||||||
|
|
||||||
|
# evaluating architecture with no compilation
|
||||||
|
attacker_prog = redteam_agent
|
||||||
|
print(f"\n--- Evaluating Initial Architecture ---")
|
||||||
|
redteam_agent.attack_program.eval_program(attacker_prog, trainset)
|
||||||
|
|
||||||
|
optimizer = MIPROv2(metric=redteam_agent.attack_program.metric, auto=None)
|
||||||
|
best_prog = optimizer.compile(
|
||||||
|
attacker_prog,
|
||||||
|
trainset=trainset,
|
||||||
|
max_bootstrapped_demos=2,
|
||||||
|
max_labeled_demos=0,
|
||||||
|
num_trials=3,
|
||||||
|
num_candidates=6,
|
||||||
|
)
|
||||||
|
|
||||||
|
# evaluating architecture DSPy post-compilation
|
||||||
|
print(f"\n--- Evaluating Optimized Architecture ---")
|
||||||
|
redteam_agent.attack_program.eval_program(best_prog, trainset)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Configuration
|
||||||
|
|
||||||
|
The red-team agent can be configured via the `config_options` parameter:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class RedTeamConfig(PrecompiledConfig):
|
||||||
|
lm: str = "gpt-4o-mini"
|
||||||
|
target_lm: str = "mistralai/Mistral-7B-Instruct-v0.2"
|
||||||
|
num_layers: int = 5
|
||||||
|
max_attack_tokens: int = 512
|
||||||
|
temperature: float = 0
|
||||||
|
```
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
To our knowledge, this is the first attempt at using any auto-prompting *framework* to perform the red-teaming task. This is also probably the deepest publicly available architecture optimized with DSPy to date.
|
To our knowledge, this is the first attempt at using any auto-prompting *framework* to perform the red-teaming task. This is also probably the deepest publicly available architecture optimized with DSPy to date.
|
||||||
|
|
||||||
|
|||||||
@@ -85,7 +85,6 @@ class AttackProgram(dspy.Module):
|
|||||||
critique = critique.critique
|
critique = critique.critique
|
||||||
return self.try_attacks[-1](harmful_intent=harmful_intent, critique=critique)
|
return self.try_attacks[-1](harmful_intent=harmful_intent, critique=critique)
|
||||||
|
|
||||||
|
|
||||||
def metric(
|
def metric(
|
||||||
self,
|
self,
|
||||||
intent: str | dspy.Example,
|
intent: str | dspy.Example,
|
||||||
@@ -100,7 +99,10 @@ class AttackProgram(dspy.Module):
|
|||||||
target_client,
|
target_client,
|
||||||
self.target_model_name,
|
self.target_model_name,
|
||||||
attack_prompt,
|
attack_prompt,
|
||||||
inference_params={"max_tokens": self.max_attack_tokens, "temperature": self.temperature},
|
inference_params={
|
||||||
|
"max_tokens": self.max_attack_tokens,
|
||||||
|
"temperature": self.temperature,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
if use_verdict:
|
if use_verdict:
|
||||||
score = verdict_judge(intent, response)[0] / 5
|
score = verdict_judge(intent, response)[0] / 5
|
||||||
@@ -110,7 +112,6 @@ class AttackProgram(dspy.Module):
|
|||||||
score = round(score)
|
score = round(score)
|
||||||
return score
|
return score
|
||||||
|
|
||||||
|
|
||||||
def eval_program(self, prog, eval_set):
|
def eval_program(self, prog, eval_set):
|
||||||
evaluate = Evaluate(
|
evaluate = Evaluate(
|
||||||
devset=eval_set,
|
devset=eval_set,
|
||||||
|
|||||||
5
main.py
5
main.py
@@ -50,8 +50,11 @@ def main():
|
|||||||
red_team_agent.attack_program.eval_program(best_prog, trainset)
|
red_team_agent.attack_program.eval_program(best_prog, trainset)
|
||||||
"""
|
"""
|
||||||
# push to hub
|
# push to hub
|
||||||
red_team_agent.push_to_hub("farouk1/redteam", commit_message="finito", with_code=True)
|
red_team_agent.push_to_hub(
|
||||||
|
"farouk1/redteam", commit_message="finito", with_code=True
|
||||||
|
)
|
||||||
print("---------Pushed to hub!---------")
|
print("---------Pushed to hub!---------")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
Reference in New Issue
Block a user