use new modaic nomenclature

This commit is contained in:
2025-12-28 12:08:41 -08:00
parent d37869e895
commit 7775134235
11 changed files with 54 additions and 224 deletions

49
main.py
View File

@@ -1,57 +1,16 @@
import json
import dspy
from tqdm import tqdm
from agent import RedTeamAgent, RedTeamConfig
from program import AttackProgram, AttackProgramConfig
from dspy.teleprompt import MIPROv2
# Module-level instances, each built once at import time from a config
# object constructed with no arguments.
# NOTE(review): red_team_agent looks like the pre-rename counterpart of
# attack_program — confirm whether both are still needed.
red_team_agent = RedTeamAgent(RedTeamConfig())
attack_program = AttackProgram(AttackProgramConfig())
def main():
    """Push the module-level ``attack_program`` to the hub.

    Calls ``attack_program.push_to_hub`` for the ``farouk1/redteam``
    repository with ``with_code=True``, then prints a confirmation banner.

    NOTE(review): ``with_code=True`` presumably uploads the program's source
    alongside it — confirm against the Modaic documentation.
    """
    # Only the attack_program push is kept: the red_team_agent push that
    # preceded it was left unterminated (no closing parenthesis) and targeted
    # the same repository.
    attack_program.push_to_hub(
        "farouk1/redteam",
        commit_message="use new modaic nomenclature",
        with_code=True,
    )
    print("---------Pushed to hub!---------")