finito
This commit is contained in:
@@ -12,18 +12,15 @@ We accomplish this using a *deep* language program with several layers of altern
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
*Figure 1: Overview of DSPy for red-teaming. The DSPy MIPRO optimizer, guided by an LLM-as-a-judge, compiles our language program into an effective red-teamer against Vicuna.*
|
*Figure 1: Overview of DSPy for red-teaming. The DSPy MIPRO optimizer, guided by an LLM-as-a-judge, compiles our language program into an effective red-teamer against Vicuna.*
|
||||||
|
|
||||||
The following table demonstrates the effectiveness of the chosen architecture, as well as the benefit of DSPy compilation:
|
The following table demonstrates the effectiveness of the chosen architecture, as well as the benefit of DSPy compilation:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
*Table 1: Attack Success Rate (ASR) with raw harmful inputs, the un-optimized architecture, and the architecture after DSPy compilation.*
|
*Table 1: Attack Success Rate (ASR) with raw harmful inputs, the un-optimized architecture, and the architecture after DSPy compilation.*
|
||||||
|
|
||||||
With *no specific prompt engineering*, we are able to achieve an Attack Success Rate of 44%, 4x over the baseline. This is by no means the SOTA, but considering that we spent essentially no effort designing the architecture and prompts, and that we used an off-the-shelf optimizer with almost no hyperparameter tuning (except to fit compute constraints), we think it is pretty exciting that we can achieve this result!
|
With *no specific prompt engineering*, we are able to achieve an Attack Success Rate of 44%, 4x over the baseline. This is by no means the SOTA, but considering that we spent essentially no effort designing the architecture and prompts, and that we used an off-the-shelf optimizer with almost no hyperparameter tuning (except to fit compute constraints), we think it is pretty exciting that we can achieve this result!
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from .redteam import metric, eval_program, AttackProgram
|
from .redteam import AttackProgram
|
||||||
from .index import RedTeamAgent, RedTeamConfig
|
from .index import RedTeamAgent, RedTeamConfig
|
||||||
|
|
||||||
__all__ = ["metric", "eval_program", "AttackProgram", "RedTeamAgent", "RedTeamConfig"]
|
__all__ = ["AttackProgram", "RedTeamAgent", "RedTeamConfig"]
|
||||||
|
|||||||
@@ -86,39 +86,37 @@ class AttackProgram(dspy.Module):
|
|||||||
return self.try_attacks[-1](harmful_intent=harmful_intent, critique=critique)
|
return self.try_attacks[-1](harmful_intent=harmful_intent, critique=critique)
|
||||||
|
|
||||||
|
|
||||||
def metric(
|
def metric(
|
||||||
intent: str | dspy.Example,
|
self,
|
||||||
attack_prompt: str | dspy.Example,
|
intent: str | dspy.Example,
|
||||||
use_verdict=True,
|
attack_prompt: str | dspy.Example,
|
||||||
trace=None,
|
use_verdict=True,
|
||||||
eval_round=True,
|
trace=None,
|
||||||
target_model_name="mistralai/Mistral-7B-Instruct-v0.2",
|
eval_round=True,
|
||||||
max_tokens=512,
|
):
|
||||||
temperature=0,
|
if isinstance(intent, dspy.Example):
|
||||||
):
|
intent = intent.harmful_intent # Test without Verdict too
|
||||||
if isinstance(intent, dspy.Example):
|
response = get_response(
|
||||||
intent = intent.harmful_intent # Test without Verdict too
|
target_client,
|
||||||
response = get_response(
|
self.target_model_name,
|
||||||
target_client,
|
attack_prompt,
|
||||||
target_model_name,
|
inference_params={"max_tokens": self.max_attack_tokens, "temperature": self.temperature},
|
||||||
attack_prompt,
|
)
|
||||||
inference_params={"max_tokens": max_tokens, "temperature": temperature},
|
if use_verdict:
|
||||||
)
|
score = verdict_judge(intent, response)[0] / 5
|
||||||
if use_verdict:
|
else:
|
||||||
score = verdict_judge(intent, response)[0] / 5
|
score = judge_prompt(instructor_client, intent, response)[0]
|
||||||
else:
|
if eval_round:
|
||||||
score = judge_prompt(instructor_client, intent, response)[0]
|
score = round(score)
|
||||||
if eval_round:
|
return score
|
||||||
score = round(score)
|
|
||||||
return score
|
|
||||||
|
|
||||||
|
|
||||||
def eval_program(prog, eval_set):
|
def eval_program(self, prog, eval_set):
|
||||||
evaluate = Evaluate(
|
evaluate = Evaluate(
|
||||||
devset=eval_set,
|
devset=eval_set,
|
||||||
metric=lambda x, y: metric(x, y),
|
metric=lambda x, y: self.metric(x, y),
|
||||||
num_threads=4,
|
num_threads=4,
|
||||||
display_progress=True,
|
display_progress=True,
|
||||||
display_table=0,
|
display_table=0,
|
||||||
)
|
)
|
||||||
evaluate(prog)
|
evaluate(prog)
|
||||||
|
|||||||
9
main.py
9
main.py
@@ -1,7 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
import dspy
|
import dspy
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
from agent import metric, eval_program, RedTeamAgent, RedTeamConfig
|
from agent import RedTeamAgent, RedTeamConfig
|
||||||
from dspy.teleprompt import MIPROv2
|
from dspy.teleprompt import MIPROv2
|
||||||
|
|
||||||
red_team_agent = RedTeamAgent(RedTeamConfig())
|
red_team_agent = RedTeamAgent(RedTeamConfig())
|
||||||
@@ -23,7 +23,7 @@ def main():
|
|||||||
|
|
||||||
litellm.cache = None
|
litellm.cache = None
|
||||||
for ex in tqdm(trainset, desc="Raw Input Score"):
|
for ex in tqdm(trainset, desc="Raw Input Score"):
|
||||||
base_score += metric(
|
base_score += red_team_agent.attack_program.metric(
|
||||||
intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
|
intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
|
||||||
)
|
)
|
||||||
base_score /= len(trainset)
|
base_score /= len(trainset)
|
||||||
@@ -33,7 +33,7 @@ def main():
|
|||||||
# evaluating architecture with no compilation
|
# evaluating architecture with no compilation
|
||||||
attacker_prog = red_team_agent
|
attacker_prog = red_team_agent
|
||||||
print(f"\n--- Evaluating Initial Architecture ---")
|
print(f"\n--- Evaluating Initial Architecture ---")
|
||||||
eval_program(attacker_prog, trainset)
|
red_team_agent.attack_program.eval_program(attacker_prog, trainset)
|
||||||
|
|
||||||
optimizer = MIPROv2(metric=metric, auto=None)
|
optimizer = MIPROv2(metric=metric, auto=None)
|
||||||
best_prog = optimizer.compile(
|
best_prog = optimizer.compile(
|
||||||
@@ -46,8 +46,9 @@ def main():
|
|||||||
|
|
||||||
# evaluating architecture DSPy post-compilation
|
# evaluating architecture DSPy post-compilation
|
||||||
print(f"\n--- Evaluating Optimized Architecture ---")
|
print(f"\n--- Evaluating Optimized Architecture ---")
|
||||||
eval_program(best_prog, trainset)
|
red_team_agent.attack_program.eval_program(best_prog, trainset)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# push to hub
|
# push to hub
|
||||||
red_team_agent.push_to_hub("farouk1/redteam", commit_message="finito", with_code=True)
|
red_team_agent.push_to_hub("farouk1/redteam", commit_message="finito", with_code=True)
|
||||||
print("---------Pushed to hub!---------")
|
print("---------Pushed to hub!---------")
|
||||||
|
|||||||
Reference in New Issue
Block a user