finito

2025-10-21 03:48:46 -04:00
parent f5889ece32
commit b1b3e39cd9
11 changed files with 712 additions and 20 deletions
--- a/main.py
+++ b/main.py
@@ -0,0 +1,56 @@
+import json
+import dspy
+from tqdm import tqdm
+from agent import metric, eval_program, RedTeamAgent, RedTeamConfig
+from dspy.teleprompt import MIPROv2
+
+red_team_agent = RedTeamAgent(RedTeamConfig())  # module-level agent with a default-constructed config; used by main()
+
+
+def main():
+    """
+    with open("advbench_subset.json", "r") as f:
+        goals = json.load(f)["goals"]
+
+    trainset = [
+        dspy.Example(harmful_intent=goal).with_inputs("harmful_intent")
+        for goal in goals
+    ]
+
+    # evaluate baseline: directly passing in harmful intent strings
+    base_score = 0
+    import litellm
+
+    litellm.cache = None
+    for ex in tqdm(trainset, desc="Raw Input Score"):
+        base_score += metric(
+            intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
+        )
+    base_score /= len(trainset)
+    print(f"--- Raw Harmful Intent Strings ---")
+    print(f"Baseline Score: {base_score}")
+
+    # evaluating architecture with no compilation
+    attacker_prog = red_team_agent
+    print(f"\n--- Evaluating Initial Architecture ---")
+    eval_program(attacker_prog, trainset)
+
+    optimizer = MIPROv2(metric=metric, auto=None)
+    best_prog = optimizer.compile(
+        attacker_prog,
+        trainset=trainset,
+        max_bootstrapped_demos=2,
+        max_labeled_demos=0,
+        num_trials=3,
+    )
+
+    # evaluating architecture DSPy post-compilation
+    print(f"\n--- Evaluating Optimized Architecture ---")
+    eval_program(best_prog, trainset)
+    """
+    # push to hub
+    red_team_agent.push_to_hub("farouk1/redteam", commit_message="finito", with_code=True)
+    print("---------Pushed to hub!---------")
+    
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()