from .redteam import AttackProgram
from modaic import PrecompiledAgent, PrecompiledConfig
import dspy


class RedTeamConfig(PrecompiledConfig):
    """Configuration values consumed by ``RedTeamAgent``."""

    # Model identifier handed to ``dspy.LM`` for the attack program's LM.
    lm: str = "gpt-4o-mini"
    # Forwarded to ``AttackProgram`` as ``target_model_name``.
    target_lm: str = "mistralai/Mistral-7B-Instruct-v0.2"
    # Forwarded to ``AttackProgram`` as ``layers``.
    num_layers: int = 5
    # Used both as ``AttackProgram(max_attack_tokens=...)`` and as the
    # ``max_tokens`` of the attack LM.
    max_attack_tokens: int = 512
    # Forwarded to ``AttackProgram`` as ``temperature``.
    temperature: float = 0


class RedTeamAgent(PrecompiledAgent):
    """Precompiled agent that wraps an ``AttackProgram`` built from a config."""

    config: RedTeamConfig

    def __init__(self, config: RedTeamConfig, **kwargs):
        """Build the attack program and attach its language model.

        Args:
            config: Settings used to construct the ``AttackProgram`` and
                the ``dspy.LM`` assigned to it.
            **kwargs: Passed through unchanged to ``PrecompiledAgent``.
        """
        super().__init__(config, **kwargs)

        program = AttackProgram(
            layers=config.num_layers,
            target_model_name=config.target_lm,
            max_attack_tokens=config.max_attack_tokens,
            temperature=config.temperature,
        )
        self.attack_program = program

        # The attack LM is created from ``config.lm`` and capped at the same
        # token budget that was given to the attack program.
        program.set_lm(
            dspy.LM(model=config.lm, max_tokens=config.max_attack_tokens)
        )

        # Mirror selected config values as plain instance attributes.
        self.target_lm = config.target_lm
        self.num_layers = config.num_layers
        self.max_attack_tokens = config.max_attack_tokens
        self.temperature = config.temperature

    def forward(self, harmful_intent, critique=""):
        """Delegate to the wrapped attack program.

        Args:
            harmful_intent: First positional argument for the attack program.
            critique: Second positional argument for the attack program;
                defaults to an empty string.

        Returns:
            Whatever ``self.attack_program`` returns for these arguments.
        """
        result = self.attack_program(harmful_intent, critique)
        return result