This commit is contained in:
2025-10-21 03:48:46 -04:00
parent f5889ece32
commit b1b3e39cd9
11 changed files with 712 additions and 20 deletions

34
agent/index.py Normal file
View File

@@ -0,0 +1,34 @@
from .redteam import AttackProgram
from modaic import PrecompiledAgent, PrecompiledConfig
import dspy
class RedTeamConfig(PrecompiledConfig):
    """Settings consumed by ``RedTeamAgent`` when it builds its attack program.

    Every field has a default, so the config can be created without arguments.
    """

    # Model identifier handed to ``dspy.LM`` for the attacking language model.
    lm: str = "gpt-4o-mini"

    # Passed to ``AttackProgram`` as ``target_model_name``.
    target_lm: str = "mistralai/Mistral-7B-Instruct-v0.2"

    # Passed to ``AttackProgram`` as ``layers``.
    num_layers: int = 5

    # Used twice by ``RedTeamAgent``: as ``max_attack_tokens`` for
    # ``AttackProgram`` and as ``max_tokens`` for the attacker ``dspy.LM``.
    max_attack_tokens: int = 512

    # Passed to ``AttackProgram`` as ``temperature``.
    temperature: float = 0
class RedTeamAgent(PrecompiledAgent):
    """Precompiled agent that wraps an ``AttackProgram`` built from a config.

    The agent constructs the attack program from the values in
    ``RedTeamConfig``, attaches a ``dspy.LM`` to it, and exposes the program
    through ``forward``.
    """

    config: RedTeamConfig

    def __init__(self, config: RedTeamConfig, **kwargs):
        """Build the attack program and attach its attacker language model.

        Args:
            config: Source of the model names, layer count, token budget and
                temperature used below.
            **kwargs: Forwarded unchanged to ``PrecompiledAgent.__init__``.
        """
        super().__init__(config, **kwargs)

        program_options = {
            "layers": config.num_layers,
            "target_model_name": config.target_lm,
            "max_attack_tokens": config.max_attack_tokens,
            "temperature": config.temperature,
        }
        self.attack_program = AttackProgram(**program_options)

        # The attacker LM shares the attack token budget as its max_tokens.
        self.attack_program.set_lm(
            dspy.LM(model=config.lm, max_tokens=config.max_attack_tokens)
        )

        # Mirror selected config values as plain attributes on the agent.
        mirrored_fields = (
            "target_lm",
            "num_layers",
            "max_attack_tokens",
            "temperature",
        )
        for field_name in mirrored_fields:
            setattr(self, field_name, getattr(config, field_name))

    def forward(self, harmful_intent, critique=""):
        """Run the attack program and return its result unchanged.

        Args:
            harmful_intent: First positional argument given to the program.
            critique: Second positional argument given to the program;
                defaults to an empty string.

        Returns:
            Whatever ``self.attack_program`` returns for these arguments.
        """
        program = self.attack_program
        return program(harmful_intent, critique)