This commit is contained in:
2025-10-21 04:14:06 -04:00
parent 5e016057b3
commit a37a3d63fb
4 changed files with 39 additions and 43 deletions

View File

@@ -12,18 +12,15 @@ We accomplish this using a *deep* language program with several layers of altern
*Figure 1: Overview of DSPy for red-teaming. The DSPy MIPRO optimizer, guided by an LLM as a judge, compiles our language program into an effective red-teamer against Vicuna.*
The following table demonstrates the effectiveness of the chosen architecture, as well as the benefit of DSPy compilation:
![Results](https://cdn.prod.website-files.com/66f89b6eb96e685709a53e09/678357036bff3a56f1161706_678356ec1f1cbdbead37e11d_Screenshot%25202025-01-12%2520at%252012.45.10%25E2%2580%25AFAM.png)
*Table 1: ASR with raw harmful inputs, un-optimized architecture, and architecture post DSPy compilation.*
With *no specific prompt engineering*, we are able to achieve an Attack Success Rate of 44%, 4x over the baseline. This is by no means the SOTA, but considering that we essentially spent no effort designing the architecture and prompts, and that we just used an off-the-shelf optimizer with almost no hyperparameter tuning (except to fit compute constraints), we think it is pretty exciting that we can achieve this result!

View File

@@ -1,4 +1,4 @@
from .redteam import metric, eval_program, AttackProgram
from .redteam import AttackProgram
from .index import RedTeamAgent, RedTeamConfig
__all__ = ["metric", "eval_program", "AttackProgram", "RedTeamAgent", "RedTeamConfig"]
__all__ = ["AttackProgram", "RedTeamAgent", "RedTeamConfig"]

View File

@@ -86,23 +86,21 @@ class AttackProgram(dspy.Module):
return self.try_attacks[-1](harmful_intent=harmful_intent, critique=critique)
def metric(
def metric(
self,
intent: str | dspy.Example,
attack_prompt: str | dspy.Example,
use_verdict=True,
trace=None,
eval_round=True,
target_model_name="mistralai/Mistral-7B-Instruct-v0.2",
max_tokens=512,
temperature=0,
):
):
if isinstance(intent, dspy.Example):
intent = intent.harmful_intent # Test without Verdict too
response = get_response(
target_client,
target_model_name,
self.target_model_name,
attack_prompt,
inference_params={"max_tokens": max_tokens, "temperature": temperature},
inference_params={"max_tokens": self.max_attack_tokens, "temperature": self.temperature},
)
if use_verdict:
score = verdict_judge(intent, response)[0] / 5
@@ -113,10 +111,10 @@ def metric(
return score
def eval_program(prog, eval_set):
def eval_program(self, prog, eval_set):
evaluate = Evaluate(
devset=eval_set,
metric=lambda x, y: metric(x, y),
metric=lambda x, y: self.metric(x, y),
num_threads=4,
display_progress=True,
display_table=0,

View File

@@ -1,7 +1,7 @@
import json
import dspy
from tqdm import tqdm
from agent import metric, eval_program, RedTeamAgent, RedTeamConfig
from agent import RedTeamAgent, RedTeamConfig
from dspy.teleprompt import MIPROv2
red_team_agent = RedTeamAgent(RedTeamConfig())
@@ -23,7 +23,7 @@ def main():
litellm.cache = None
for ex in tqdm(trainset, desc="Raw Input Score"):
base_score += metric(
base_score += red_team_agent.attack_program.metric(
intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
)
base_score /= len(trainset)
@@ -33,7 +33,7 @@ def main():
# evaluating architecture with no compilation
attacker_prog = red_team_agent
print(f"\n--- Evaluating Initial Architecture ---")
eval_program(attacker_prog, trainset)
red_team_agent.attack_program.eval_program(attacker_prog, trainset)
optimizer = MIPROv2(metric=metric, auto=None)
best_prog = optimizer.compile(
@@ -46,8 +46,9 @@ def main():
# evaluating architecture DSPy post-compilation
print(f"\n--- Evaluating Optimized Architecture ---")
eval_program(best_prog, trainset)
red_team_agent.attack_program.eval_program(best_prog, trainset)
"""
# push to hub
red_team_agent.push_to_hub("farouk1/redteam", commit_message="finito", with_code=True)
print("---------Pushed to hub!---------")