diff --git a/program.json b/program.json index 7174e6c..d599d87 100644 --- a/program.json +++ b/program.json @@ -28,7 +28,7 @@ "finetuning_model": null, "launch_kwargs": {}, "train_kwargs": {}, - "temperature": null, + "temperature": 0, "max_tokens": 512 } }, @@ -61,7 +61,7 @@ "finetuning_model": null, "launch_kwargs": {}, "train_kwargs": {}, - "temperature": null, + "temperature": 0, "max_tokens": 512 } }, @@ -94,7 +94,7 @@ "finetuning_model": null, "launch_kwargs": {}, "train_kwargs": {}, - "temperature": null, + "temperature": 0, "max_tokens": 512 } }, @@ -127,7 +127,7 @@ "finetuning_model": null, "launch_kwargs": {}, "train_kwargs": {}, - "temperature": null, + "temperature": 0, "max_tokens": 512 } }, @@ -160,7 +160,7 @@ "finetuning_model": null, "launch_kwargs": {}, "train_kwargs": {}, - "temperature": null, + "temperature": 0, "max_tokens": 512 } }, @@ -197,7 +197,7 @@ "finetuning_model": null, "launch_kwargs": {}, "train_kwargs": {}, - "temperature": null, + "temperature": 0, "max_tokens": 512 } }, @@ -234,7 +234,7 @@ "finetuning_model": null, "launch_kwargs": {}, "train_kwargs": {}, - "temperature": null, + "temperature": 0, "max_tokens": 512 } }, @@ -271,7 +271,7 @@ "finetuning_model": null, "launch_kwargs": {}, "train_kwargs": {}, - "temperature": null, + "temperature": 0, "max_tokens": 512 } }, @@ -308,7 +308,7 @@ "finetuning_model": null, "launch_kwargs": {}, "train_kwargs": {}, - "temperature": null, + "temperature": 0, "max_tokens": 512 } }, @@ -345,7 +345,7 @@ "finetuning_model": null, "launch_kwargs": {}, "train_kwargs": {}, - "temperature": null, + "temperature": 0, "max_tokens": 512 } }, diff --git a/program/redteam.py b/program/redteam.py index 213f192..18dcee0 100644 --- a/program/redteam.py +++ b/program/redteam.py @@ -63,14 +63,18 @@ class AttackProgram(PrecompiledProgram): **kwargs, ): super().__init__(config, **kwargs) - attack_model = dspy.LM(model=config.lm, max_tokens=config.max_attack_tokens) + + attack_model = dspy.LM(model=config.lm, max_tokens=config.max_attack_tokens, temperature=config.temperature) self.get_response = get_response self.layers = config.num_layers + self.try_attacks = [dspy.Predict(Attack) for _ in range(self.layers)] self.critique_attacks = [dspy.Predict(Refine) for _ in range(self.layers)] + self.target_model_name = config.target_lm self.max_attack_tokens = config.max_attack_tokens self.temperature = config.temperature + self.set_lm(attack_model) def forward(self, harmful_intent, critique=""):