Files
redteam/main.py
2025-10-21 04:29:53 -04:00

58 lines
1.7 KiB
Python

import json
import dspy
from tqdm import tqdm
from agent import RedTeamAgent, RedTeamConfig
from dspy.teleprompt import MIPROv2
# Module-level agent instance, built from a default-constructed
# RedTeamConfig. It is created at import time and is the object that
# main() pushes to the hub.
red_team_agent = RedTeamAgent(RedTeamConfig())
def main():
    """Publish the module-level ``red_team_agent`` to the hub.

    Calls ``red_team_agent.push_to_hub`` for the ``farouk1/redteam``
    repository (with ``with_code=True``) and then prints a confirmation line.

    The baseline-scoring and MIPROv2 optimization workflow that precedes the
    push is currently disabled. It is kept below as ``#`` comments instead of
    inside a triple-quoted string: a bare string that is the first statement
    of a function body is taken by Python as the function's docstring, so the
    disabled code was previously being exposed as ``main.__doc__``.
    """
    # ------------------------------------------------------------------
    # DISABLED: baseline evaluation + MIPROv2 optimization workflow.
    # NOTE(review): this code is not executed and has not been re-checked
    # against the installed DSPy version (e.g. which MIPROv2 arguments belong
    # to the constructor vs. compile()) -- confirm before re-enabling.
    # ------------------------------------------------------------------
    # with open("advbench_subset.json", "r") as f:
    #     goals = json.load(f)["goals"]
    # trainset = [
    #     dspy.Example(harmful_intent=goal).with_inputs("harmful_intent")
    #     for goal in goals
    # ]
    #
    # # evaluate baseline: directly passing in harmful intent strings
    # base_score = 0
    # import litellm
    # litellm.cache = None
    # for ex in tqdm(trainset, desc="Raw Input Score"):
    #     base_score += red_team_agent.attack_program.metric(
    #         intent=ex.harmful_intent,
    #         attack_prompt=ex.harmful_intent,
    #         eval_round=True,
    #     )
    # base_score /= len(trainset)
    # print(f"--- Raw Harmful Intent Strings ---")
    # print(f"Baseline Score: {base_score}")
    #
    # # evaluating architecture with no compilation
    # attacker_prog = red_team_agent
    # print(f"\n--- Evaluating Initial Architecture ---")
    # red_team_agent.attack_program.eval_program(attacker_prog, trainset)
    #
    # optimizer = MIPROv2(metric=red_team_agent.attack_program.metric, auto=None)
    # best_prog = optimizer.compile(
    #     attacker_prog,
    #     trainset=trainset,
    #     max_bootstrapped_demos=2,
    #     max_labeled_demos=0,
    #     num_trials=3,
    #     num_candidates=6,
    # )
    #
    # # evaluating architecture DSPy post-compilation
    # print(f"\n--- Evaluating Optimized Architecture ---")
    # red_team_agent.attack_program.eval_program(best_prog, trainset)

    # push to hub
    red_team_agent.push_to_hub(
        "farouk1/redteam",
        commit_message="finito",
        with_code=True,
    )
    print("---------Pushed to hub!---------")
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()