35 lines
1.1 KiB
Python
35 lines
1.1 KiB
Python
from .redteam import AttackProgram
|
|
from modaic import PrecompiledAgent, PrecompiledConfig
|
|
import dspy
|
|
|
|
|
|
class RedTeamConfig(PrecompiledConfig):
    """Settings consumed by ``RedTeamAgent`` when it builds its attack program."""

    # Model identifier handed to ``dspy.LM`` for the attack program's language model.
    lm: str = "gpt-4o-mini"

    # Forwarded to ``AttackProgram`` as ``target_model_name``.
    target_lm: str = "mistralai/Mistral-7B-Instruct-v0.2"

    # Forwarded to ``AttackProgram`` as ``layers``.
    num_layers: int = 5

    # Forwarded to ``AttackProgram`` and also used as ``max_tokens`` for the
    # ``dspy.LM`` created by the agent.
    max_attack_tokens: int = 512

    # Forwarded to ``AttackProgram`` as ``temperature``.
    temperature: float = 0
|
|
|
|
|
|
class RedTeamAgent(PrecompiledAgent):
    """Precompiled agent that delegates its work to an ``AttackProgram``.

    The program is built from a ``RedTeamConfig`` and given its own
    ``dspy.LM``; the configuration values used to build it are also mirrored
    as attributes on the agent.
    """

    config: RedTeamConfig

    def __init__(self, config: RedTeamConfig, **kwargs):
        """Build the attack program and attach its language model.

        Args:
            config: Settings for the attack program and its language model.
            **kwargs: Passed through unchanged to ``PrecompiledAgent.__init__``.
        """
        super().__init__(config, **kwargs)

        program = AttackProgram(
            layers=config.num_layers,
            target_model_name=config.target_lm,
            max_attack_tokens=config.max_attack_tokens,
            temperature=config.temperature,
        )
        self.attack_program = program

        # The program gets a dedicated LM, capped at the same token budget
        # that was passed to the program itself.
        program.set_lm(
            dspy.LM(model=config.lm, max_tokens=config.max_attack_tokens)
        )

        # Mirror the configuration values directly on the agent.
        self.target_lm, self.num_layers = config.target_lm, config.num_layers
        self.max_attack_tokens, self.temperature = (
            config.max_attack_tokens,
            config.temperature,
        )

    def forward(self, harmful_intent, critique=""):
        """Run the attack program and return whatever it produces.

        Args:
            harmful_intent: Passed positionally as the program's first argument.
            critique: Passed positionally as the program's second argument;
                defaults to an empty string.
        """
        outcome = self.attack_program(harmful_intent, critique)
        return outcome
|