Compiled CoTWithThoughtSimplifiedBaleen with bootstrap_fewshot_with_random_search for location-fo-coarse
This commit is contained in:
101
get_data.py
Normal file
101
get_data.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import dspy
|
||||
import requests
|
||||
import pickle
|
||||
import json
|
||||
import random
|
||||
from collections import defaultdict
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# OpenToM benchmark data (original-length narratives).
# The commented-out alternative is the "extra long" variant, built by
# resampling 100 existing OpenToM plots into extended narratives.
# URL = "https://raw.githubusercontent.com/SeacowX/OpenToM/main/data/opentom_long.json"
URL = "https://raw.githubusercontent.com/SeacowX/OpenToM/main/data/opentom.json"
|
||||
|
||||
|
||||
def default_factory():
    """Return a fresh empty list.

    Defined at module level (rather than as a lambda) so a defaultdict
    built with it stays picklable — lambdas cannot be pickled.
    """
    return list()
|
||||
|
||||
|
||||
def load_dataset():
    """Download OpenToM, convert it to DSPy Examples, and pickle the splits.

    Steps:
      1. Fetch the OpenToM JSON from ``URL`` and load it into a DataFrame.
      2. Build one ``dspy.Example`` per row (fields kept as plain strings),
         tagging location questions with a ``-fine``/``-coarse`` suffix.
      3. Group examples by question type, shuffle each group, and split it
         80% test / 20% train (deliberate: train takes the tail).
      4. Serialize the resulting dict to ``datasets.pkl``.

    Returns:
        None. Side effects: writes ``datasets.pkl`` and prints split sizes.

    Raises:
        requests.HTTPError: if the dataset download fails.
    """
    response = requests.get(URL)
    response.raise_for_status()  # fail loudly on a bad download instead of parsing an error page
    df = pd.DataFrame(response.json())

    # Pull 'type' and 'answer' out of the nested question dicts into columns.
    df["type"] = df["question"].apply(lambda q: q["type"])
    df["answer"] = df["question"].apply(lambda q: q["answer"])

    # All answers observed per question type; used as the answer-choice
    # list for non-location questions.
    unique_answers_by_type = df.groupby("type")["answer"].unique()

    # Convert the dataset to what DSPy expects (a list of Example objects).
    dataset = []
    for _, row in df.iterrows():
        context = row["narrative"]
        question = row["question"]["question"]
        answer = row["question"]["answer"]
        question_type = row["question"]["type"]  # renamed: don't shadow builtin `type`
        # Keeping each Example field as a string might be a good idea.
        plot_info = json.dumps(row["plot_info"])

        # Location questions come in two granularities: coarse ones are
        # answered yes/no, fine ones expect a specific location string.
        is_location = "location" in question_type
        is_fine_location = (
            is_location and answer.lower().strip() not in ("yes", "no")
        )
        if is_location:
            granularity = "fine" if is_fine_location else "coarse"
            question_type = f"{question_type}-{granularity}"

        # Answer choices.
        if is_fine_location:
            # Don't provide answer choices for fine-grained location questions.
            answer_choices = "n/a, list a specific location"
        elif is_location:
            answer_choices = "No, Yes"
        else:
            answer_choices = ", ".join(unique_answers_by_type[question_type])

        dataset.append(
            dspy.Example(
                context=context,
                question=question,
                answer=answer,
                type=question_type,
                plot_info=plot_info,
                answer_choices=answer_choices,
            ).with_inputs("context", "question", "answer_choices")
        )

    # Split the dataset by question type. defaultdict(list) pickles fine
    # (list is a module-level name), so no custom factory is needed here.
    datasets = defaultdict(list)
    for example in dataset:
        datasets[example.type].append(example)

    # Create the train/test split per question type.
    for question_type, examples in datasets.items():
        random.shuffle(examples)
        split = int(len(examples) * 0.8)
        datasets[question_type] = {
            "train": examples[split:],  # 80% test, 20% train (intentional)
            "test": examples[:split],
        }

        print(f"Train {question_type}: {len(datasets[question_type]['train'])}")
        print(f"Test {question_type}: {len(datasets[question_type]['test'])}")

    # Serialize and save the datasets object to a file.
    with open("datasets.pkl", "wb") as file:
        pickle.dump(datasets, file)

    print("🫡 Datasets object has been saved to 'datasets.pkl' 🫡")
|
||||
|
||||
|
||||
# Script entry point: build and persist the per-type datasets.
if __name__ == "__main__":
    load_dataset()
|
||||
Reference in New Issue
Block a user