finito
This commit is contained in:
56
main.py
Normal file
56
main.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import json
|
||||
import dspy
|
||||
from tqdm import tqdm
|
||||
from agent import metric, eval_program, RedTeamAgent, RedTeamConfig
|
||||
from dspy.teleprompt import MIPROv2
|
||||
|
||||
# Module-level agent instance, built from a RedTeamConfig constructed with no
# arguments. main() below pushes this object to the hub.
red_team_agent = RedTeamAgent(RedTeamConfig())
|
||||
|
||||
|
||||
def _load_trainset(path="advbench_subset.json"):
    """Build the training set from the ``"goals"`` list in the JSON file at *path*.

    Each goal becomes a ``dspy.Example`` whose only field, ``harmful_intent``,
    is marked as the input.

    Raises:
        ValueError: if the file contains no goals (the baseline average below
            would otherwise divide by zero).
    """
    with open(path, "r", encoding="utf-8") as f:
        goals = json.load(f)["goals"]

    if not goals:
        raise ValueError(f"no goals found in {path!r}")

    return [
        dspy.Example(harmful_intent=goal).with_inputs("harmful_intent")
        for goal in goals
    ]


def _run_optimization():
    """Score the baseline, evaluate the agent, compile it with MIPROv2, re-evaluate.

    Returns:
        The program returned by ``MIPROv2.compile``.
    """
    # Imported here because only this opt-in path touches litellm.
    import litellm

    trainset = _load_trainset()

    # evaluate baseline: directly passing in harmful intent strings
    # NOTE(review): presumably the cache is cleared so scores are recomputed
    # rather than replayed from earlier runs -- confirm.
    litellm.cache = None
    base_score = 0
    for ex in tqdm(trainset, desc="Raw Input Score"):
        base_score += metric(
            intent=ex.harmful_intent,
            attack_prompt=ex.harmful_intent,
            eval_round=True,
        )
    base_score /= len(trainset)
    print("--- Raw Harmful Intent Strings ---")
    print(f"Baseline Score: {base_score}")

    # evaluating architecture with no compilation
    attacker_prog = red_team_agent
    print("\n--- Evaluating Initial Architecture ---")
    eval_program(attacker_prog, trainset)

    optimizer = MIPROv2(metric=metric, auto=None)
    best_prog = optimizer.compile(
        attacker_prog,
        trainset=trainset,
        max_bootstrapped_demos=2,
        max_labeled_demos=0,
        num_trials=3,
    )

    # evaluating architecture DSPy post-compilation
    print("\n--- Evaluating Optimized Architecture ---")
    eval_program(best_prog, trainset)

    return best_prog


def main(*, run_optimization: bool = False) -> None:
    """Push the module-level ``red_team_agent`` to the hub.

    Args:
        run_optimization: when true, first run the baseline scoring,
            evaluation and MIPROv2 compilation pipeline. It defaults to
            ``False``, which keeps the push-only behaviour this function had
            when the pipeline was disabled inside a string literal.
    """
    if run_optimization:
        _run_optimization()

    # push to hub
    # The object pushed is always the module-level agent, not the program
    # returned by the optimization run.
    red_team_agent.push_to_hub("farouk1/redteam", commit_message="finito", with_code=True)
    print("---------Pushed to hub!---------")
|
||||
|
||||
if __name__ == "__main__":
    # Run only when executed as a script, so importing this module does not
    # trigger a push.
    main()
|
||||
Reference in New Issue
Block a user