finito
This commit is contained in:
34
agent/index.py
Normal file
34
agent/index.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from .redteam import AttackProgram
|
||||
from modaic import PrecompiledAgent, PrecompiledConfig
|
||||
import dspy
|
||||
|
||||
|
||||
class RedTeamConfig(PrecompiledConfig):
    """Configuration values consumed by ``RedTeamAgent``.

    Each field is read once in ``RedTeamAgent.__init__`` and forwarded to
    either ``AttackProgram`` or ``dspy.LM``.
    """

    # Model identifier handed to ``dspy.LM`` for the LM set on the attack program.
    lm: str = "gpt-4o-mini"
    # Forwarded to ``AttackProgram`` as ``target_model_name``.
    target_lm: str = "mistralai/Mistral-7B-Instruct-v0.2"
    # Forwarded to ``AttackProgram`` as ``layers``.
    num_layers: int = 5
    # Forwarded to ``AttackProgram`` and also used as ``max_tokens`` for the ``dspy.LM``.
    max_attack_tokens: int = 512
    # Forwarded to ``AttackProgram`` as ``temperature``. The default is written
    # as a float literal so the runtime value matches the annotation even when
    # defaults are not validated/coerced by the base class.
    temperature: float = 0.0
|
||||
|
||||
|
||||
class RedTeamAgent(PrecompiledAgent):
    """Precompiled agent that delegates its work to an ``AttackProgram``.

    The program is built from the values in ``RedTeamConfig`` and is given a
    ``dspy.LM`` created from ``config.lm``.
    """

    config: RedTeamConfig

    def __init__(self, config: RedTeamConfig, **kwargs):
        """Build the attack program and attach its language model.

        Args:
            config: Settings used to construct ``AttackProgram`` and the
                ``dspy.LM`` that is set on it.
            **kwargs: Passed through unchanged to ``PrecompiledAgent.__init__``.
        """
        super().__init__(config, **kwargs)

        # Map config fields onto the keyword names AttackProgram expects.
        program_settings = {
            "layers": config.num_layers,
            "target_model_name": config.target_lm,
            "max_attack_tokens": config.max_attack_tokens,
            "temperature": config.temperature,
        }
        self.attack_program = AttackProgram(**program_settings)

        # The LM set on the program shares the max_attack_tokens budget.
        self.attack_program.set_lm(
            dspy.LM(model=config.lm, max_tokens=config.max_attack_tokens)
        )

        # Mirror selected config values as plain instance attributes.
        for field_name in (
            "target_lm",
            "num_layers",
            "max_attack_tokens",
            "temperature",
        ):
            setattr(self, field_name, getattr(config, field_name))

    def forward(self, harmful_intent, critique=""):
        """Call the wrapped attack program and return its result.

        Args:
            harmful_intent: First positional argument passed to the program.
            critique: Second positional argument passed to the program;
                defaults to an empty string.

        Returns:
            Whatever ``self.attack_program`` returns for these arguments.
        """
        program = self.attack_program
        return program(harmful_intent, critique)
|
||||
Reference in New Issue
Block a user