# run with: python main.py <experiment_title> cot bootstrap_fewshot_with_random_search
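"""Optimize and evaluate DSPy modules on the OpenToM benchmark.

For each question type, a module is compiled with a DSPy optimizer
(currently BootstrapFewShotWithRandomSearch), pushed to the hub, and then
compared against an uncompiled chain-of-thought baseline on held-out test
batches. Results are logged to Neptune.
"""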
import pickle
import time
import argparse
from typing import Optional
from collections import defaultdict

import dspy
import neptune
import numpy as np
from dotenv import load_dotenv
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.evaluate.evaluate import Evaluate

from opentom_evaluator import OpenToMEvaluatorDspy
from src.cot import CoTSimplifiedBaleen, CoTSimplifiedBaleenConfig
from src.cot_with_thought import (
    CoTWithThoughtSimplifiedBaleen,
    CoTWithThoughtSimplifiedBaleenConfig,
)
from get_data import default_factory, load_dataset

load_dotenv()

# initialize neptune
run = neptune.init_run(
    project="modaic/dspy-opentom",
    capture_hardware_metrics=False,
    capture_stderr=True,
    capture_stdout=True,
    capture_traceback=True,
)

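# Question types evaluated from OpenToM: "fo"/"so" mark first-/second-order
# theory-of-mind questions; "coarse"/"fine" mark the granularity of the
# location questions.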
EVAL_QUESTION_TYPES = [
    "attitude",
    "multihop-fo",
    "multihop-so",
    "location-fo-coarse",
    "location-fo-fine",
    "location-so-coarse",
    "location-so-fine",
]


def dump_state(data, filename):
    with open(filename, "wb") as file:
        pickle.dump(data, file)


def main(
    dspy_method,
    dspy_optimizer,
    download_dataset,
    question_types,
    teacher_lm,
    train_size,
):
    # load dataset
    if download_dataset:
        load_dataset()

    # read in the datasets pickle object
    with open("datasets.pkl", "rb") as file:
        datasets = pickle.load(file)

    if dspy_method == "cot":
        module_type = CoTSimplifiedBaleen(CoTSimplifiedBaleenConfig())
        module_name = "CoTSimplifiedBaleen"
    elif dspy_method == "cot_with_thought":
        module_type = CoTWithThoughtSimplifiedBaleen(CoTWithThoughtSimplifiedBaleenConfig())
        module_name = "CoTWithThoughtSimplifiedBaleen"
    else:
        raise Exception(f"DSPy method '{dspy_method}' is not valid")

    module_type.push_to_hub(
        f"vintro/{module_name}",
        with_code=True,
        commit_message=f"Uncompiled {module_name} as baseline",
    )
    modules = {}
    # define modules for each question type
    for question_type in question_types:
        print(f"TYPE: {question_type}")
        evaluator = OpenToMEvaluatorDspy(model_name="(training set) compiled baleen")

        if dspy_optimizer == "bootstrap_fewshot_with_random_search":
            optimizer = BootstrapFewShotWithRandomSearch(
                metric=evaluator.dspy_metric,
                num_candidate_programs=25,
                num_threads=1,
                teacher_settings=dict(lm=teacher_lm),
            )
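            # Compilation bootstraps few-shot demonstrations with the teacher LM
            # and searches over candidate programs scored by the evaluator's metric.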
            compiled_baleen = optimizer.compile(
                module_type, trainset=datasets[question_type]["train"][:train_size]
            )
        # elif dspy_optimizer == "signature_optimizer":  # Signature Optimizer is deprecated TODO: add a new one like GEPA
        #     optimizer = SignatureOptimizer(
        #         metric=evaluator.dspy_metric,
        #         breadth=10,
        #         depth=3,
        #         init_temperature=1.4,
        #         verbose=True,
        #         track_stats=True,
        #         prompt_model=teacher_lm,
        #     )
        #     eval_kwargs = dict(num_threads=1, display_progress=True, display_table=0)
        #     compiled_baleen = optimizer.compile(
        #         module_type(),
        #         devset=datasets[question_type]["train"][:train_size],
        #         eval_kwargs=eval_kwargs,
        #     )
        else:
            raise Exception(f"Invalid dspy optimizer type: {dspy_optimizer}")

        modules[question_type] = compiled_baleen
        compiled_baleen.push_to_hub(
            f"vintro/{module_name}-{question_type}",
            with_code=True,
            commit_message=f"Compiled {module_name} with {dspy_optimizer} for {question_type}",
        )
        time.sleep(10)

    # regular cot is always the uncompiled baseline
    uncompiled_baleen = CoTSimplifiedBaleen(CoTSimplifiedBaleenConfig())

    print("Beginning Evaluation")
    for question_type in question_types:
        compiled_baleen = modules[question_type]

        # Evaluation procedure: compute the macro-averaged F1 score on each of
        # `num_batches` batches of `batch_size` test questions, then report the
        # mean and standard deviation across batches.
        batch_size = 50
        num_batches = 5

        assert len(datasets[question_type]["test"]) >= batch_size * num_batches
        test = datasets[question_type]["test"][: batch_size * num_batches]
        test_sets = [
            test[i * batch_size : (i + 1) * batch_size] for i in range(num_batches)
        ]

        uncompiled_f1_scores = []
        compiled_f1_scores = []

        for test in test_sets:
            # Set up the `evaluate_on_opentom` function.
            evaluate_on_opentom = Evaluate(
                devset=test, num_threads=1, display_progress=True, display_table=0
            )

            uncompiled_baleen_evaluator = OpenToMEvaluatorDspy(
                model_name="uncompiled_baleen"
            )
            evaluate_on_opentom(
                uncompiled_baleen,
                metric=uncompiled_baleen_evaluator.dspy_metric,
                display=True,
            )
            uncompiled_f1_scores.append(
                uncompiled_baleen_evaluator.f1_score()[question_type]["macro_averaged"]
            )

            compiled_baleen_evaluator = OpenToMEvaluatorDspy(
                model_name="compiled_baleen"
            )
            evaluate_on_opentom(
                compiled_baleen,
                metric=compiled_baleen_evaluator.dspy_metric,
                display=True,
            )
            compiled_f1_scores.append(
                compiled_baleen_evaluator.f1_score()[question_type]["macro_averaged"]
            )

        # overall f1 scores
        uncompiled_mean_f1 = np.mean(uncompiled_f1_scores)
        uncompiled_std_f1 = np.std(uncompiled_f1_scores)

        compiled_mean_f1 = np.mean(compiled_f1_scores)
        compiled_std_f1 = np.std(compiled_f1_scores)

        run[f"evaluation/{question_type}/uncompiled/mean_macro_averaged_f1"] = (
            uncompiled_mean_f1
        )
        run[f"evaluation/{question_type}/uncompiled/std_macro_averaged_f1"] = (
            uncompiled_std_f1
        )
        run[f"evaluation/{question_type}/compiled/mean_macro_averaged_f1"] = (
            compiled_mean_f1
        )
        run[f"evaluation/{question_type}/compiled/std_macro_averaged_f1"] = (
            compiled_std_f1
        )

        print(
            f"Mean Macro Averaged F1 Scores (± std dev.) - {question_type} - Aggregated from {num_batches} batches of {batch_size} questions"
        )
        print(f"uncompiled: {uncompiled_mean_f1:.3f} ± {uncompiled_std_f1:.3f}")
        print(f"compiled: {compiled_mean_f1:.3f} ± {compiled_std_f1:.3f}")

    dump_state(modules, "cot_modules.pkl")
    run["cot_modules"].upload("cot_modules.pkl")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run a DSPy method on OpenToM.")

    # dspy arguments
    parser.add_argument("experiment_title", type=str, help="Title of new experiment")
    parser.add_argument(
        "dspy_method",
        type=str,
        help="The DSPy method to run: 'cot' or 'cot_with_thought'",
    )
    parser.add_argument(
        "dspy_optimizer",
        type=str,
        help="The DSPy optimizer to use: 'bootstrap_fewshot_with_random_search'",
    )
    parser.add_argument(
        "--student",
        default="gpt-3.5-turbo",
        type=str,
        help="The LLM to optimize prompts for",
    )
    parser.add_argument(
        "--teacher",
        default=None,
        type=str,
        help="Teacher LLM for optimizing prompts. Defaults to the student LLM",
    )
    parser.add_argument(
        "--train_size",
        default=50,
        type=int,
        help="Number of training examples to use for optimization",
    )
    parser.add_argument(
        "--download_dataset",
        default=True,
        action=argparse.BooleanOptionalAction,
        help="Download the dataset before loading it",
    )
    parser.add_argument(
        "--question_types",
        default=EVAL_QUESTION_TYPES,
        nargs="*",
        help="Question types to run. Defaults to all",
    )

    args = parser.parse_args()

    # setup LLMs
    student_lm = dspy.LM(model=args.student, max_tokens=1000)
    args.teacher = args.student if args.teacher is None else args.teacher
    teacher_lm = dspy.LM(model=args.teacher, max_tokens=1000)
    dspy.settings.configure(lm=student_lm)
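    # The student LM is the default LM for all DSPy modules; the teacher LM is
    # only passed to the optimizer via teacher_settings when bootstrapping demos.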

    # validate question types
    question_types = args.question_types
    assert all(
        question_type in EVAL_QUESTION_TYPES for question_type in question_types
    )
    args.question_types = ", ".join(
        question_types
    )  # turn list into string for neptune logging

    # log run parameters
    run["parameters"] = vars(args)
    run["sys/name"] = args.experiment_title

    main(
        args.dspy_method,
        args.dspy_optimizer,
        args.download_dataset,
        question_types,
        teacher_lm,
        args.train_size,
    )
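    # Example:
    #   python main.py my-experiment cot bootstrap_fewshot_with_random_search \
    #       --student gpt-3.5-turbo --train_size 50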