use new modaic nomenclature
This commit is contained in:
49
main.py
49
main.py
@@ -1,57 +1,16 @@
|
||||
import json
|
||||
import dspy
|
||||
from tqdm import tqdm
|
||||
from agent import RedTeamAgent, RedTeamConfig
|
||||
from program import AttackProgram, AttackProgramConfig
|
||||
from dspy.teleprompt import MIPROv2
|
||||
|
||||
red_team_agent = RedTeamAgent(RedTeamConfig())
|
||||
attack_program = AttackProgram(AttackProgramConfig())
|
||||
|
||||
|
||||
def main():
|
||||
"""
|
||||
with open("advbench_subset.json", "r") as f:
|
||||
goals = json.load(f)["goals"]
|
||||
|
||||
trainset = [
|
||||
dspy.Example(harmful_intent=goal).with_inputs("harmful_intent")
|
||||
for goal in goals
|
||||
]
|
||||
|
||||
# evaluate baseline: directly passing in harmful intent strings
|
||||
base_score = 0
|
||||
import litellm
|
||||
|
||||
litellm.cache = None
|
||||
for ex in tqdm(trainset, desc="Raw Input Score"):
|
||||
base_score += red_team_agent.attack_program.metric(
|
||||
intent=ex.harmful_intent, attack_prompt=ex.harmful_intent, eval_round=True
|
||||
)
|
||||
base_score /= len(trainset)
|
||||
print(f"--- Raw Harmful Intent Strings ---")
|
||||
print(f"Baseline Score: {base_score}")
|
||||
|
||||
# evaluating architecture with no compilation
|
||||
attacker_prog = red_team_agent
|
||||
print(f"\n--- Evaluating Initial Architecture ---")
|
||||
red_team_agent.attack_program.eval_program(attacker_prog, trainset)
|
||||
|
||||
optimizer = MIPROv2(metric=red_team_agent.attack_program.metric, auto=None)
|
||||
best_prog = optimizer.compile(
|
||||
attacker_prog,
|
||||
trainset=trainset,
|
||||
max_bootstrapped_demos=2,
|
||||
max_labeled_demos=0,
|
||||
num_trials=3,
|
||||
num_candidates=6,
|
||||
)
|
||||
|
||||
# evaluating architecture DSPy post-compilation
|
||||
print(f"\n--- Evaluating Optimized Architecture ---")
|
||||
red_team_agent.attack_program.eval_program(best_prog, trainset)
|
||||
"""
|
||||
# push to hub
|
||||
red_team_agent.push_to_hub(
|
||||
"farouk1/redteam", commit_message="finito", with_code=True
|
||||
attack_program.push_to_hub(
|
||||
"farouk1/redteam", commit_message="use new modaic nomenclature", with_code=True
|
||||
)
|
||||
print("---------Pushed to hub!---------")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user