finito
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
from .redteam import metric, eval_program, AttackProgram
|
||||
from .redteam import AttackProgram
|
||||
from .index import RedTeamAgent, RedTeamConfig
|
||||
|
||||
__all__ = ["metric", "eval_program", "AttackProgram", "RedTeamAgent", "RedTeamConfig"]
|
||||
__all__ = ["AttackProgram", "RedTeamAgent", "RedTeamConfig"]
|
||||
|
||||
@@ -86,39 +86,37 @@ class AttackProgram(dspy.Module):
|
||||
return self.try_attacks[-1](harmful_intent=harmful_intent, critique=critique)
|
||||
|
||||
|
||||
def metric(
|
||||
intent: str | dspy.Example,
|
||||
attack_prompt: str | dspy.Example,
|
||||
use_verdict=True,
|
||||
trace=None,
|
||||
eval_round=True,
|
||||
target_model_name="mistralai/Mistral-7B-Instruct-v0.2",
|
||||
max_tokens=512,
|
||||
temperature=0,
|
||||
):
|
||||
if isinstance(intent, dspy.Example):
|
||||
intent = intent.harmful_intent # Test without Verdict too
|
||||
response = get_response(
|
||||
target_client,
|
||||
target_model_name,
|
||||
attack_prompt,
|
||||
inference_params={"max_tokens": max_tokens, "temperature": temperature},
|
||||
)
|
||||
if use_verdict:
|
||||
score = verdict_judge(intent, response)[0] / 5
|
||||
else:
|
||||
score = judge_prompt(instructor_client, intent, response)[0]
|
||||
if eval_round:
|
||||
score = round(score)
|
||||
return score
|
||||
def metric(
|
||||
self,
|
||||
intent: str | dspy.Example,
|
||||
attack_prompt: str | dspy.Example,
|
||||
use_verdict=True,
|
||||
trace=None,
|
||||
eval_round=True,
|
||||
):
|
||||
if isinstance(intent, dspy.Example):
|
||||
intent = intent.harmful_intent # Test without Verdict too
|
||||
response = get_response(
|
||||
target_client,
|
||||
self.target_model_name,
|
||||
attack_prompt,
|
||||
inference_params={"max_tokens": self.max_attack_tokens, "temperature": self.temperature},
|
||||
)
|
||||
if use_verdict:
|
||||
score = verdict_judge(intent, response)[0] / 5
|
||||
else:
|
||||
score = judge_prompt(instructor_client, intent, response)[0]
|
||||
if eval_round:
|
||||
score = round(score)
|
||||
return score
|
||||
|
||||
|
||||
def eval_program(prog, eval_set):
|
||||
evaluate = Evaluate(
|
||||
devset=eval_set,
|
||||
metric=lambda x, y: metric(x, y),
|
||||
num_threads=4,
|
||||
display_progress=True,
|
||||
display_table=0,
|
||||
)
|
||||
evaluate(prog)
|
||||
def eval_program(self, prog, eval_set):
|
||||
evaluate = Evaluate(
|
||||
devset=eval_set,
|
||||
metric=lambda x, y: self.metric(x, y),
|
||||
num_threads=4,
|
||||
display_progress=True,
|
||||
display_table=0,
|
||||
)
|
||||
evaluate(prog)
|
||||
|
||||
Reference in New Issue
Block a user