finito

2025-10-21 14:13:38 -04:00
parent a413e05023
commit 7b1535f925
3 changed files with 108 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -1,6 +1,104 @@
 # Red-Teaming Language Models with DSPy
-We use the the power of [DSPy](https://github.com/stanfordnlp/dspy), a framework for structuring and optimizing language model programs, to red-team language models.
+A packaged version of an open-source red-teaming framework that uses [DSPy](https://github.com/stanfordnlp/dspy), a framework for structuring and optimizing language model programs, to red-team language models through automated attack generation and optimization.
 ## Quick Start
 ### Installation
 ```bash
 git clone https://git.modaic.dev/farouk1/redteam.git
 cd redteam
 uv sync
 ```
 Or initialize a new project and add the dependencies:
 ```bash
 uv init
 uv add verdict instructor tqdm modaic
 ```
 ### Environment Variables
 Create a `.env` file with:
 ```bash
 MODAIC_TOKEN="<your_modaic_token>"
 TOGETHER_API_KEY="<your_together_api_key>"
 OPENAI_API_KEY="<your_openai_api_key>"
 ```
 ### Usage
 ```python
 import json
 import dspy
 from tqdm import tqdm
 from dspy.teleprompt import MIPROv2
 from modaic import AutoAgent
 redteam_agent = AutoAgent.from_precompiled("farouk1/redteam", config_options={"num_layers": 3})
 def main():
    with open("advbench_subset.json", "r") as f:
        goals = json.load(f)["goals"]
    trainset = [
        dspy.Example(harmful_intent=goal).with_inputs("harmful_intent")
        for goal in goals
    ]
    # evaluate baseline: directly passing in harmful intent strings
    base_score = 0
    import litellm
    litellm.cache = None
    for ex in tqdm(trainset, desc="Raw Input Score"):
        base_score += redteam_agent.attack_program.metric(
            intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
        )
    base_score /= len(trainset)
    print(f"--- Raw Harmful Intent Strings ---")
    print(f"Baseline Score: {base_score}")
    # evaluating architecture with no compilation
    attacker_prog = redteam_agent
    print(f"\n--- Evaluating Initial Architecture ---")
    redteam_agent.attack_program.eval_program(attacker_prog, trainset)
    optimizer = MIPROv2(metric=redteam_agent.attack_program.metric, auto=None)
    best_prog = optimizer.compile(
        attacker_prog,
        trainset=trainset,
        max_bootstrapped_demos=2,
        max_labeled_demos=0,
        num_trials=3,
        num_candidates=6,
    )
    # evaluating architecture DSPy post-compilation
    print(f"\n--- Evaluating Optimized Architecture ---")
    redteam_agent.attack_program.eval_program(best_prog, trainset)
 if __name__ == "__main__":
    main()
 ```
 ### Configuration
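 The red-team agent is configured through the `config_options` argument of `AutoAgent.from_precompiled` (as in the usage example above). The configuration fields and their defaults are: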
 ```python
 class RedTeamConfig(PrecompiledConfig):
    lm: str = "gpt-4o-mini"
    target_lm: str = "mistralai/Mistral-7B-Instruct-v0.2"
    num_layers: int = 5
    max_attack_tokens: int = 512
    temperature: float = 0
 ```
 ## Overview
 To our knowledge, this is the first attempt at using any auto-prompting *framework* to perform the red-teaming task. It is also probably the deepest publicly available architecture optimized with DSPy to date.
--- a/agent/redteam.py
+++ b/agent/redteam.py
@@ -85,7 +85,6 @@ class AttackProgram(dspy.Module):
            critique = critique.critique
        return self.try_attacks[-1](harmful_intent=harmful_intent, critique=critique)
    def metric(
        self,
        intent: str | dspy.Example,
@@ -100,7 +99,10 @@ class AttackProgram(dspy.Module):
            target_client,
            self.target_model_name,
            attack_prompt,
-            inference_params={"max_tokens": self.max_attack_tokens, "temperature": self.temperature},
+            inference_params={
                "max_tokens": self.max_attack_tokens,
                "temperature": self.temperature,
            },
        )
        if use_verdict:
            score = verdict_judge(intent, response)[0] / 5
@@ -110,7 +112,6 @@ class AttackProgram(dspy.Module):
            score = round(score)
        return score
    def eval_program(self, prog, eval_set):
        evaluate = Evaluate(
            devset=eval_set,
--- a/main.py
+++ b/main.py
@@ -50,8 +50,11 @@ def main():
    red_team_agent.attack_program.eval_program(best_prog, trainset)
    """
    # push to hub
-    red_team_agent.push_to_hub("farouk1/redteam", commit_message="finito", with_code=True)
+    red_team_agent.push_to_hub(
        "farouk1/redteam", commit_message="finito", with_code=True
    )
    print("---------Pushed to hub!---------")
 if __name__ == "__main__":
    main()