61 lines
1.7 KiB
Python
61 lines
1.7 KiB
Python
import json
|
|
import dspy
|
|
from tqdm import tqdm
|
|
from agent import RedTeamAgent, RedTeamConfig
|
|
from dspy.teleprompt import MIPROv2
|
|
|
|
# Module-level agent instance used by main(); it is built from a
# RedTeamConfig constructed with no arguments. Note that this runs at
# import time, not only when the file is executed as a script.
red_team_agent = RedTeamAgent(RedTeamConfig())
|
|
|
|
|
|
def main():
    """Call ``push_to_hub`` on the module-level ``red_team_agent``.

    The agent is pushed under the ``farouk1/redteam`` identifier with the
    commit message ``finito`` and ``with_code=True``; a confirmation line
    is printed once the call returns.

    The baseline-scoring / MIPROv2 compilation pipeline that used to run
    before the push is currently disabled. It is preserved below as ``#``
    comments instead of a bare triple-quoted string, because a string in
    first-statement position becomes this function's docstring and would
    show up in ``help(main)`` as if it were documentation.
    """
    # ------------------------------------------------------------------
    # DISABLED PIPELINE (kept verbatim for easy re-enabling)
    #
    # NOTE(review): if this is re-enabled, `base_score /= len(trainset)`
    # raises ZeroDivisionError when the "goals" list is empty -- confirm
    # the dataset is non-empty or guard the division.
    # ------------------------------------------------------------------
    # with open("advbench_subset.json", "r") as f:
    #     goals = json.load(f)["goals"]
    #
    # trainset = [
    #     dspy.Example(harmful_intent=goal).with_inputs("harmful_intent")
    #     for goal in goals
    # ]
    #
    # # evaluate baseline: directly passing in harmful intent strings
    # base_score = 0
    # import litellm
    #
    # litellm.cache = None
    # for ex in tqdm(trainset, desc="Raw Input Score"):
    #     base_score += red_team_agent.attack_program.metric(
    #         intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
    #     )
    # base_score /= len(trainset)
    # print(f"--- Raw Harmful Intent Strings ---")
    # print(f"Baseline Score: {base_score}")
    #
    # # evaluating architecture with no compilation
    # attacker_prog = red_team_agent
    # print(f"\n--- Evaluating Initial Architecture ---")
    # red_team_agent.attack_program.eval_program(attacker_prog, trainset)
    #
    # optimizer = MIPROv2(metric=red_team_agent.attack_program.metric, auto=None)
    # best_prog = optimizer.compile(
    #     attacker_prog,
    #     trainset=trainset,
    #     max_bootstrapped_demos=2,
    #     max_labeled_demos=0,
    #     num_trials=3,
    #     num_candidates=6,
    # )
    #
    # # evaluating architecture DSPy post-compilation
    # print(f"\n--- Evaluating Optimized Architecture ---")
    # red_team_agent.attack_program.eval_program(best_prog, trainset)
    # ------------------------------------------------------------------

    # push to hub
    red_team_agent.push_to_hub(
        "farouk1/redteam", commit_message="finito", with_code=True
    )
    print("---------Pushed to hub!---------")
|
|
|
|
|
|
# Only invoke main() when this file is executed directly as a script;
# importing the module does not call it.
if __name__ == "__main__":
    main()
|