"""Publish the red-team agent to the hub.

The baseline scoring / MIPROv2 optimization / evaluation stages that used to
run before publishing are currently disabled; they are preserved as comments
inside ``main``.
"""

import json

import dspy
from tqdm import tqdm
from agent import RedTeamAgent, RedTeamConfig
from dspy.teleprompt import MIPROv2

# Built once at import time from a RedTeamConfig created with no arguments.
# Kept at module level so the name stays importable from this module.
red_team_agent = RedTeamAgent(RedTeamConfig())


def main():
    """Push the module-level ``red_team_agent`` to the hub.

    The earlier stages of this script (raw-intent baseline scoring, evaluation
    of the uncompiled program, MIPROv2 compilation and re-evaluation) are
    disabled.  They are kept below as ``#`` comments instead of a triple-quoted
    string, because a string placed as the first statement of a function
    becomes that function's docstring.
    """
    # --- disabled pipeline: uncomment to run before pushing ---------------
    # with open("advbench_subset.json", "r") as f:
    #     goals = json.load(f)["goals"]
    #
    # trainset = [
    #     dspy.Example(harmful_intent=goal).with_inputs("harmful_intent")
    #     for goal in goals
    # ]
    #
    # # evaluate baseline: directly passing in harmful intent strings
    # base_score = 0
    # import litellm
    #
    # litellm.cache = None
    # for ex in tqdm(trainset, desc="Raw Input Score"):
    #     base_score += red_team_agent.attack_program.metric(
    #         intent=ex.harmful_intent,
    #         attack_prompt=ex.harmful_intent,
    #         eval_round=True,
    #     )
    # base_score /= len(trainset)
    # print(f"--- Raw Harmful Intent Strings ---")
    # print(f"Baseline Score: {base_score}")
    #
    # # evaluating architecture with no compilation
    # attacker_prog = red_team_agent
    # print(f"\n--- Evaluating Initial Architecture ---")
    # red_team_agent.attack_program.eval_program(attacker_prog, trainset)
    #
    # optimizer = MIPROv2(metric=red_team_agent.attack_program.metric, auto=None)
    # best_prog = optimizer.compile(
    #     attacker_prog,
    #     trainset=trainset,
    #     max_bootstrapped_demos=2,
    #     max_labeled_demos=0,
    #     num_trials=3,
    #     num_candidates=6,
    # )
    #
    # # evaluating architecture DSPy post-compilation
    # print(f"\n--- Evaluating Optimized Architecture ---")
    # red_team_agent.attack_program.eval_program(best_prog, trainset)
    # -----------------------------------------------------------------------

    # push to hub
    # NOTE(review): with_code=True presumably uploads the agent's source along
    # with it -- confirm against push_to_hub's documentation.
    red_team_agent.push_to_hub(
        "farouk1/redteam", commit_message="finito", with_code=True
    )
    print("---------Pushed to hub!---------")


if __name__ == "__main__":
    main()