# CoTWithThoughtSimplifiedBaleen/main.py
# run with: python main.py <experiment_title> cot bootstrap_fewshot_with_random_search
import pickle
import time
import argparse
from typing import Optional
from opentom_evaluator import OpenToMEvaluatorDspy
import dspy
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.evaluate.evaluate import Evaluate
from src.cot import CoTSimplifiedBaleen, CoTSimplifiedBaleenConfig
from src.cot_with_thought import CoTWithThoughtSimplifiedBaleen, CoTWithThoughtSimplifiedBaleenConfig
from get_data import default_factory, load_dataset
from collections import defaultdict
from dotenv import load_dotenv
import neptune
import numpy as np
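
# load environment variables (e.g. API credentials for the LLM provider and neptune)
# from a local .env file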
load_dotenv()
# initialize neptune
run = neptune.init_run(
    project="modaic/dspy-opentom",
    capture_hardware_metrics=False,
    capture_stderr=True,
    capture_stdout=True,
    capture_traceback=True,
)
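
# OpenToM question types evaluated by this script: attitude, first- and second-order
# multihop, and coarse/fine first- and second-order location questions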
EVAL_QUESTION_TYPES = [
    "attitude",
    "multihop-fo",
    "multihop-so",
    "location-fo-coarse",
    "location-fo-fine",
    "location-so-coarse",
    "location-so-fine",
]


def dump_state(data, filename):
    with open(filename, "wb") as file:
        pickle.dump(data, file)
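

# main: compile a DSPy module per question type with the chosen optimizer, push each
# compiled module to the hub, then compare it against the uncompiled CoT baseline on
# held-out test batches, logging macro-averaged F1 scores to neptune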
def main(
    dspy_method,
    dspy_optimizer,
    download_dataset,
    question_types,
    teacher_lm,
    train_size,
):
    # load dataset
    if download_dataset:
        load_dataset()
    # read in the datasets pickle object
    with open("datasets.pkl", "rb") as file:
        datasets = pickle.load(file)

    if dspy_method == "cot":
        module_type = CoTSimplifiedBaleen(CoTSimplifiedBaleenConfig())
        module_name = "CoTSimplifiedBaleen"
    elif dspy_method == "cot_with_thought":
        module_type = CoTWithThoughtSimplifiedBaleen(CoTWithThoughtSimplifiedBaleenConfig())
        module_name = "CoTWithThoughtSimplifiedBaleen"
    else:
        raise Exception(f"Dspy method '{dspy_method}' is not valid")

    module_type.push_to_hub(f"vintro/{module_name}", with_code=True, commit_message=f"Uncompiled {module_name} as baseline")

    modules = {}
    # define modules for each question type
    for question_type in question_types:
        print(f"TYPE: {question_type}")
        evaluator = OpenToMEvaluatorDspy(model_name="(training set) compiled baleen")
        if dspy_optimizer == "bootstrap_fewshot_with_random_search":
            optimizer = BootstrapFewShotWithRandomSearch(
                metric=evaluator.dspy_metric,
                num_candidate_programs=25,
                num_threads=1,
                teacher_settings=dict(lm=teacher_lm),
            )
            compiled_baleen = optimizer.compile(
                module_type, trainset=datasets[question_type]["train"][:train_size]
            )
        # elif dspy_optimizer == "signature_optimizer":  # SignatureOptimizer is deprecated TODO: add a new one like GEPA
        #     optimizer = SignatureOptimizer(
        #         metric=evaluator.dspy_metric,
        #         breadth=10,
        #         depth=3,
        #         init_temperature=1.4,
        #         verbose=True,
        #         track_stats=True,
        #         prompt_model=teacher_lm,
        #     )
        #     eval_kwargs = dict(num_threads=1, display_progress=True, display_table=0)
        #     compiled_baleen = optimizer.compile(
        #         module_type(),
        #         devset=datasets[question_type]["train"][:train_size],
        #         eval_kwargs=eval_kwargs,
        #     )
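        # NOTE: a hedged, untested sketch of one possible replacement for the deprecated
        # SignatureOptimizer using dspy.GEPA (reflective prompt evolution). It assumes
        # the GEPA interface from recent DSPy releases (metric / auto / reflection_lm
        # arguments and a compile(student, trainset=..., valset=...) signature); the
        # valset split below is an arbitrary choice for illustration. Verify against the
        # installed DSPy version before enabling, and note that GEPA may pass extra
        # arguments (e.g. pred_name, pred_trace) to the metric.
        # elif dspy_optimizer == "gepa":
        #     optimizer = dspy.GEPA(
        #         metric=evaluator.dspy_metric,
        #         auto="light",
        #         reflection_lm=teacher_lm,
        #     )
        #     compiled_baleen = optimizer.compile(
        #         module_type,
        #         trainset=datasets[question_type]["train"][:train_size],
        #         valset=datasets[question_type]["train"][train_size:],
        #     )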
        else:
            raise Exception(f"Invalid dspy optimizer type: {dspy_optimizer}")
        modules[question_type] = compiled_baleen
        compiled_baleen.push_to_hub(f"vintro/{module_name}-{question_type}", with_code=True, commit_message=f"Compiled {module_name} with {dspy_optimizer} for {question_type}")
        # brief pause after each hub push (presumably to avoid rate limits)
        time.sleep(10)

    uncompiled_baleen = CoTSimplifiedBaleen(
        CoTSimplifiedBaleenConfig()
    )  # regular cot is always the uncompiled baseline
    print("Beginning Evaluation")
    for question_type in question_types:
        compiled_baleen = modules[question_type]
        # Evaluation Procedure: compute the macro-averaged F1 score on 5 disjoint
        # batches of 50 test questions and report the mean and std across batches
        batch_size = 50
        num_batches = 5
        assert len(datasets[question_type]["test"]) >= batch_size * num_batches
        test = datasets[question_type]["test"][: batch_size * num_batches]
        test_sets = [
            test[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)
        ]
        uncompiled_f1_scores = []
        compiled_f1_scores = []
        for test in test_sets:
            # Set up the `evaluate_on_opentom` function for this batch.
            evaluate_on_opentom = Evaluate(
                devset=test, num_threads=1, display_progress=True, display_table=0
            )
            uncompiled_baleen_evaluator = OpenToMEvaluatorDspy(
                model_name="uncompiled_baleen"
            )
            evaluate_on_opentom(
                uncompiled_baleen,
                metric=uncompiled_baleen_evaluator.dspy_metric,
                display=True,
            )
            uncompiled_f1_scores.append(
                uncompiled_baleen_evaluator.f1_score()[question_type]["macro_averaged"]
            )
            compiled_baleen_evaluator = OpenToMEvaluatorDspy(
                model_name="compiled_baleen"
            )
            evaluate_on_opentom(
                compiled_baleen,
                metric=compiled_baleen_evaluator.dspy_metric,
                display=True,
            )
            compiled_f1_scores.append(
                compiled_baleen_evaluator.f1_score()[question_type]["macro_averaged"]
            )
        # overall f1 scores
        uncompiled_mean_f1 = np.mean(uncompiled_f1_scores)
        uncompiled_std_f1 = np.std(uncompiled_f1_scores)
        compiled_mean_f1 = np.mean(compiled_f1_scores)
        compiled_std_f1 = np.std(compiled_f1_scores)
        run[f"evaluation/{question_type}/uncompiled/mean_macro_averaged_f1"] = (
            uncompiled_mean_f1
        )
        run[f"evaluation/{question_type}/uncompiled/std_macro_averaged_f1"] = (
            uncompiled_std_f1
        )
        run[f"evaluation/{question_type}/compiled/mean_macro_averaged_f1"] = (
            compiled_mean_f1
        )
        run[f"evaluation/{question_type}/compiled/std_macro_averaged_f1"] = (
            compiled_std_f1
        )
        print(
            f"Mean Macro Averaged F1 Scores (± std dev.) - {question_type} - Aggregated from {num_batches} batches of {batch_size} questions"
        )
        print(f"uncompiled: {uncompiled_mean_f1:.3f} ± {uncompiled_std_f1:.3f}")
        print(f"compiled: {compiled_mean_f1:.3f} ± {compiled_std_f1:.3f}")

    dump_state(modules, "cot_modules.pkl")
    run["cot_modules"].upload("cot_modules.pkl")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run a DSPy method.")
    # dspy arguments
    parser.add_argument("experiment_title", type=str, help="Title of new experiment")
    parser.add_argument("dspy_method", type=str, help="The DSPy method to run")
    parser.add_argument("dspy_optimizer", type=str, help="The DSPy optimizer to use")
    parser.add_argument(
        "--student",
        default="gpt-3.5-turbo",
        type=str,
        help="The LLM to optimize prompts for",
    )
    parser.add_argument(
        "--teacher",
        default=None,
        type=str,
        help="Teacher LLM for optimizing prompts. Defaults to the student LLM",
    )
    parser.add_argument(
        "--train_size",
        default=50,
        type=int,
        help="Number of training examples to use for optimization",
    )
    parser.add_argument(
        "--download_dataset",
        default=True,
        action=argparse.BooleanOptionalAction,
        help="Download dataset (disable with --no-download_dataset)",
    )
    parser.add_argument(
        "--question_types",
        default=EVAL_QUESTION_TYPES,
        nargs="*",
        help="Question types. Defaults to all",
    )
    args = parser.parse_args()

    # setup LLMs
    student_lm = dspy.LM(model=args.student, max_tokens=1000)
    args.teacher = args.student if args.teacher is None else args.teacher
    teacher_lm = dspy.LM(model=args.teacher, max_tokens=1000)
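    # student_lm becomes the default LM for every module call; teacher_lm is only used
    # via the optimizer's teacher_settings during compilation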
    dspy.settings.configure(lm=student_lm)

    # validate question types
    question_types = args.question_types
    assert all(
        question_type in EVAL_QUESTION_TYPES for question_type in question_types
    )
    args.question_types = ", ".join(
        question_types
    )  # turn list into string for neptune logging

    # log run parameters
    run["parameters"] = args
    run["sys/name"] = args.experiment_title

    main(
        args.dspy_method,
        args.dspy_optimizer,
        args.download_dataset,
        question_types,
        teacher_lm,
        args.train_size,
    )