diff --git a/README.md b/README.md index 0453138..af3e379 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,104 @@ # Red-Teaming Language Models with DSPy -We use the the power of [DSPy](https://github.com/stanfordnlp/dspy), a framework for structuring and optimizing language model programs, to red-team language models. +A packaged version of an open source red-teaming framework that uses the power of [DSPy](https://github.com/stanfordnlp/dspy) to red-team language models through automated attack generation and optimization. + +## Quick Start + +### Installation + +```bash +git clone https://git.modaic.dev/farouk1/redteam.git +cd redteam +uv sync +``` + +Or initialize a new project: + +```bash +uv init +uv add verdict instructor tqdm modaic +``` + +### Environment Variables + +Create a `.env` file with: + +```bash +MODAIC_TOKEN="" +TOGETHER_API_KEY="" +OPENAI_API_KEY="" +``` + +### Usage + +```python +import json +import dspy +from tqdm import tqdm +from dspy.teleprompt import MIPROv2 +from modaic import AutoAgent + +redteam_agent = AutoAgent.from_precompiled("farouk1/redteam", config_options={"num_layers": 3}) + +def main(): + with open("advbench_subset.json", "r") as f: + goals = json.load(f)["goals"] + + trainset = [ + dspy.Example(harmful_intent=goal).with_inputs("harmful_intent") + for goal in goals + ] + + # evaluate baseline: directly passing in harmful intent strings + base_score = 0 + import litellm + + litellm.cache = None + for ex in tqdm(trainset, desc="Raw Input Score"): + base_score += redteam_agent.attack_program.metric( + intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True + ) + base_score /= len(trainset) + print(f"--- Raw Harmful Intent Strings ---") + print(f"Baseline Score: {base_score}") + + # evaluating architecture with no compilation + attacker_prog = redteam_agent + print(f"\n--- Evaluating Initial Architecture ---") + redteam_agent.attack_program.eval_program(attacker_prog, trainset) + + optimizer = 
MIPROv2(metric=redteam_agent.attack_program.metric, auto=None) + best_prog = optimizer.compile( + attacker_prog, + trainset=trainset, + max_bootstrapped_demos=2, + max_labeled_demos=0, + num_trials=3, + num_candidates=6, + ) + + # evaluating architecture after DSPy compilation + print(f"\n--- Evaluating Optimized Architecture ---") + redteam_agent.attack_program.eval_program(best_prog, trainset) + +if __name__ == "__main__": + main() +``` + +### Configuration + +The red-team agent can be configured via the `config_options` parameter: + +```python +class RedTeamConfig(PrecompiledConfig): + lm: str = "gpt-4o-mini" + target_lm: str = "mistralai/Mistral-7B-Instruct-v0.2" + num_layers: int = 5 + max_attack_tokens: int = 512 + temperature: float = 0 +``` + +## Overview To our knowledge, this is the first attempt at using any auto-prompting *framework* to perform the red-teaming task. This is also probably the deepest architecture in public optimized with DSPy to date. diff --git a/agent/redteam.py b/agent/redteam.py index 446072d..373ef73 100644 --- a/agent/redteam.py +++ b/agent/redteam.py @@ -85,7 +85,6 @@ class AttackProgram(dspy.Module): critique = critique.critique return self.try_attacks[-1](harmful_intent=harmful_intent, critique=critique) - def metric( self, intent: str | dspy.Example, @@ -100,7 +99,10 @@ class AttackProgram(dspy.Module): target_client, self.target_model_name, attack_prompt, - inference_params={"max_tokens": self.max_attack_tokens, "temperature": self.temperature}, + inference_params={ + "max_tokens": self.max_attack_tokens, + "temperature": self.temperature, + }, ) if use_verdict: score = verdict_judge(intent, response)[0] / 5 @@ -110,7 +112,6 @@ class AttackProgram(dspy.Module): score = round(score) return score - def eval_program(self, prog, eval_set): evaluate = Evaluate( devset=eval_set, diff --git a/main.py b/main.py index 4d0b502..dbc39c6 100644 --- a/main.py +++ b/main.py @@ -50,8 +50,11 @@ def main():
red_team_agent.attack_program.eval_program(best_prog, trainset) """ # push to hub - red_team_agent.push_to_hub("farouk1/redteam", commit_message="finito", with_code=True) + red_team_agent.push_to_hub( + "farouk1/redteam", commit_message="finito", with_code=True + ) print("---------Pushed to hub!---------") - + + if __name__ == "__main__": main()