2025-10-21 14:13:38 -04:00
parent a413e05023
commit 7b1535f925
3 changed files with 108 additions and 6 deletions

README.md

@@ -1,6 +1,104 @@
# Red-Teaming Language Models with DSPy
We use the power of [DSPy](https://github.com/stanfordnlp/dspy), a framework for structuring and optimizing language model programs, to red-team language models.
A packaged version of an open-source red-teaming framework that uses the power of [DSPy](https://github.com/stanfordnlp/dspy) to red-team language models through automated attack generation and optimization.
## Quick Start
### Installation
```bash
git clone https://git.modaic.dev/farouk1/redteam.git
cd redteam
uv sync
```
Or initialize a new project:
```bash
uv init
uv add verdict instructor tqdm modaic
```
### Environment Variables
Create a `.env` file with:
```bash
MODAIC_TOKEN="<your_modaic_token>"
TOGETHER_API_KEY="<your_together_api_key>"
OPENAI_API_KEY="<your_openai_api_key>"
```
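If you run the usage script below as a plain Python file, these keys need to be exported into the process environment first. A minimal sketch using python-dotenv (an extra dependency, not listed above; any loader that populates `os.environ` works):

```python
# Minimal sketch: load API keys from .env before constructing the agent.
# Assumes python-dotenv is installed (e.g., `uv add python-dotenv`).
from dotenv import load_dotenv

load_dotenv()  # reads MODAIC_TOKEN, TOGETHER_API_KEY, OPENAI_API_KEY into os.environ
```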
### Usage
```python
import json

import dspy
import litellm
from dspy.teleprompt import MIPROv2
from modaic import AutoAgent
from tqdm import tqdm

# Load the precompiled red-team agent from the Modaic hub.
redteam_agent = AutoAgent.from_precompiled(
    "farouk1/redteam", config_options={"num_layers": 3}
)


def main():
    with open("advbench_subset.json", "r") as f:
        goals = json.load(f)["goals"]

    trainset = [
        dspy.Example(harmful_intent=goal).with_inputs("harmful_intent")
        for goal in goals
    ]

    # Baseline: pass the raw harmful intent strings directly to the target model.
    litellm.cache = None  # disable caching so every call hits the model
    base_score = 0
    for ex in tqdm(trainset, desc="Raw Input Score"):
        base_score += redteam_agent.attack_program.metric(
            intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
        )
    base_score /= len(trainset)
    print("--- Raw Harmful Intent Strings ---")
    print(f"Baseline Score: {base_score}")

    # Evaluate the architecture with no compilation.
    attacker_prog = redteam_agent
    print("\n--- Evaluating Initial Architecture ---")
    redteam_agent.attack_program.eval_program(attacker_prog, trainset)

    # Optimize the attack program with MIPROv2.
    optimizer = MIPROv2(metric=redteam_agent.attack_program.metric, auto=None)
    best_prog = optimizer.compile(
        attacker_prog,
        trainset=trainset,
        max_bootstrapped_demos=2,
        max_labeled_demos=0,
        num_trials=3,
        num_candidates=6,
    )

    # Evaluate the architecture after DSPy compilation.
    print("\n--- Evaluating Optimized Architecture ---")
    redteam_agent.attack_program.eval_program(best_prog, trainset)


if __name__ == "__main__":
    main()
```
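The script scores three settings with the same metric: the raw harmful-intent strings as a baseline, the uncompiled attack program, and the program after MIPROv2 compilation, so the gain from optimization is directly visible.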
### Configuration
The red-team agent can be configured via the `config_options` parameter:
```python
class RedTeamConfig(PrecompiledConfig):
    lm: str = "gpt-4o-mini"
    target_lm: str = "mistralai/Mistral-7B-Instruct-v0.2"
    num_layers: int = 5
    max_attack_tokens: int = 512
    temperature: float = 0
```
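For example, a sketch of overriding the defaults when loading the agent; the field names come from `RedTeamConfig` above, while the specific values here are illustrative:

```python
from modaic import AutoAgent

# Any RedTeamConfig field can be overridden via config_options;
# unspecified fields keep the defaults shown above.
redteam_agent = AutoAgent.from_precompiled(
    "farouk1/redteam",
    config_options={
        "num_layers": 3,  # assumed to set the depth of the layered attack program
        "target_lm": "mistralai/Mistral-7B-Instruct-v0.2",
        "max_attack_tokens": 256,
    },
)
```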
## Overview
To our knowledge, this is the first attempt at using an auto-prompting *framework* to perform the red-teaming task. It is also likely the deepest publicly released architecture optimized with DSPy to date.


@@ -85,7 +85,6 @@ class AttackProgram(dspy.Module):
            critique = critique.critique
        return self.try_attacks[-1](harmful_intent=harmful_intent, critique=critique)

    def metric(
        self,
        intent: str | dspy.Example,
@@ -100,7 +99,10 @@ class AttackProgram(dspy.Module):
            target_client,
            self.target_model_name,
            attack_prompt,
            inference_params={"max_tokens": self.max_attack_tokens, "temperature": self.temperature},
            inference_params={
                "max_tokens": self.max_attack_tokens,
                "temperature": self.temperature,
            },
        )
        if use_verdict:
            score = verdict_judge(intent, response)[0] / 5
@@ -110,7 +112,6 @@ class AttackProgram(dspy.Module):
            score = round(score)
        return score

    def eval_program(self, prog, eval_set):
        evaluate = Evaluate(
            devset=eval_set,


@@ -50,8 +50,11 @@ def main():
    red_team_agent.attack_program.eval_program(best_prog, trainset)
    """

    # push to hub
    red_team_agent.push_to_hub("farouk1/redteam", commit_message="finito", with_code=True)
    red_team_agent.push_to_hub(
        "farouk1/redteam", commit_message="finito", with_code=True
    )
    print("---------Pushed to hub!---------")


if __name__ == "__main__":
    main()