diff --git a/auto_classes.json b/auto_classes.json
new file mode 100644
index 0000000..52efeb3
--- /dev/null
+++ b/auto_classes.json
@@ -0,0 +1,4 @@
+{
+  "AutoConfig": "modules.GenerateCypherConfig",
+  "AutoProgram": "modules.GenerateCypher"
+}
\ No newline at end of file
diff --git a/modules.py b/modules.py
new file mode 100644
--- /dev/null
+++ b/modules.py
@@ -0,0 +1,87 @@
+import os
+import dspy
+from dotenv import load_dotenv
+from modaic import PrecompiledProgram, PrecompiledConfig
+
+load_dotenv()
+
+
+class CypherFromQuestion(dspy.Signature):
+    """Task: Generate Cypher statement to query a graph database.
+    Instructions: Use only the provided relationship types and properties in the schema.
+    Do not use any other relationship types or properties that are not provided in the schema.
+    Do not include any explanations or apologies in your responses.
+    Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
+    Do not include any text except the generated Cypher statement.
+    """
+
+    question = dspy.InputField(
+        desc="Question to model using a cypher statement. Use only the provided relationship types and properties in the schema."
+    )
+    neo4j_schema = dspy.InputField(
+        desc="Current graph schema in Neo4j as a list of NODES and RELATIONSHIPS."
+    )
+    statement = dspy.OutputField(desc="Cypher statement to query the graph database.")
+
+
+# Model settings read by GenerateCypher.__init__ when it builds its LM.
+class GenerateCypherConfig(PrecompiledConfig):
+    model: str = "openrouter/openai/gpt-4o"  # OPENROUTER ONLY
+    max_tokens: int = 1024
+
+
+class GenerateCypher(PrecompiledProgram):
+    """Chain-of-thought program that maps a question and a graph schema to Cypher."""
+
+    config: GenerateCypherConfig
+
+    def __init__(self, config: GenerateCypherConfig, **kwargs):
+        super().__init__(config=config, **kwargs)
+        # api_base is pinned to OpenRouter, so config.model must be an
+        # OpenRouter model id.
+        self.lm = dspy.LM(
+            model=config.model,
+            max_tokens=config.max_tokens,
+            api_base="https://openrouter.ai/api/v1",
+        )
+        self.generate_cypher = dspy.ChainOfThought(CypherFromQuestion)
+        self.generate_cypher.set_lm(self.lm)
+
+    def forward(self, question: str, neo4j_schema: str | list[str]):
+        """Run the predictor; the result's ``statement`` field holds the Cypher.
+
+        ``neo4j_schema`` is passed through unchanged, so it may be a list of
+        schema lines or a single pre-formatted string (optimize.py forwards the
+        dataset's "schema" column as-is).
+        """
+        return self.generate_cypher(question=question, neo4j_schema=neo4j_schema)
+
+
+generate_cypher = GenerateCypher(GenerateCypherConfig())
+
+if __name__ == "__main__":
+    # NOTE: the string below is a disabled example kept for reference; it is never
+    # executed and refers to a `neo4j` helper that this module does not define.
+    """
+    from pathlib import Path
+    import json
+
+    examples_path = Path(__file__).parent / "examples" / "wikipedia-abstracts-v0_0_1.ndjson"
+    with open(examples_path, "r") as f:
+        for line in f:
+            data = json.loads(line)
+            text = data["text"]
+            print("TEXT TO PROCESS:\n", text[:50])
+            cypher = generate_cypher(text=text, neo4j_schema=neo4j.fmt_schema())
+            neo4j.query(cypher.statement.replace('```', ''))
+            print("CYPHER STATEMENT:\n", cypher.statement)
+
+    schema = neo4j.fmt_schema()
+    print("SCHEMA:\n", schema)
+    """
+    generate_cypher.push_to_hub(
+        "farouk1/text-to-cypher",
+        with_code=True,
+        tag="v0.0.9",
+        commit_message="Update README.md",
+    )
diff --git a/optimize.py b/optimize.py
new file mode 100644
--- /dev/null
+++ b/optimize.py
@@ -0,0 +1,92 @@
+import dspy
+from dspy import GEPA
+from modules import generate_cypher
+from datasets import load_dataset
+
+
+def _to_example(row):
+    """Wrap one dataset row in a dspy.Example whose inputs are question and schema."""
+    return dspy.Example(
+        {
+            "question": row["question"],
+            "neo4j_schema": row["schema"],
+            "expected_cypher": row["cypher"],
+        }
+    ).with_inputs("question", "neo4j_schema")
+
+
+def process_dataset(sample_size=200, seed=0):
+    """Build the train, validation and test example lists.
+
+    The train split is shuffled with a fixed seed, cut to ``sample_size``
+    examples and halved into train and validation sets. The test set is the
+    first ``sample_size`` rows of the test split.
+
+    Returns:
+        A ``(train_set, val_set, test_set)`` tuple of lists of dspy.Example.
+    """
+    import random
+    from itertools import islice
+
+    # Load once and reuse the result for both splits.
+    dataset = load_dataset("neo4j/text2cypher-2025v1")
+
+    sampled = [_to_example(row) for row in dataset["train"]]
+    random.Random(seed).shuffle(sampled)
+    sampled = sampled[:sample_size]
+    half = len(sampled) // 2
+
+    train_set = sampled[:half]
+    val_set = sampled[half:]
+    # Convert only the rows that are kept rather than the whole test split.
+    test_set = [_to_example(row) for row in islice(dataset["test"], sample_size)]
+
+    return train_set, val_set, test_set
+
+
+def metric(example, prediction, trace=None, pred_name=None, pred_trace=None):
+    """Score the generated Cypher against the reference with sentence-level BLEU.
+
+    Returns a dspy.Prediction carrying ``score`` (BLEU rescaled to 0-1) and a
+    short textual ``feedback``. A missing, non-string or blank statement scores
+    0.0 instead of raising inside sacrebleu.
+    """
+    from sacrebleu import sentence_bleu
+
+    expected_cypher = example["expected_cypher"]
+    generated_cypher = getattr(prediction, "statement", None)
+
+    # The hypothesis must be a string for sacrebleu; treat anything else as a miss.
+    if not isinstance(generated_cypher, str) or not generated_cypher.strip():
+        return dspy.Prediction(
+            score=0.0,
+            feedback="BLEU score: 0.000 (no Cypher statement was generated)",
+        )
+
+    # sentence_bleu reports .score on a 0-100 scale; normalize to 0-1.
+    bleu_score = sentence_bleu(generated_cypher, [expected_cypher]).score / 100.0
+
+    feedback = f"BLEU score: {bleu_score:.3f}"
+    return dspy.Prediction(score=bleu_score, feedback=feedback)
+
+
+# NOTE: the dataset is loaded and the optimizer is built at import time.
+train_set, val_set, test_set = process_dataset()
+
+optimizer = GEPA(
+    metric=metric,
+    auto="light",
+    num_threads=32,
+    track_stats=True,
+    reflection_minibatch_size=3,
+    reflection_lm=dspy.LM(model="gpt-5.2", temperature=1.0, max_tokens=32000),
+)
+
+
+if __name__ == "__main__":
+    optimized_program = optimizer.compile(
+        generate_cypher,
+        trainset=train_set,
+        valset=val_set,
+    )
+    optimized_program.push_to_hub("farouk1/text-to-cypher-gepa", tag="v1.0.1", commit_message="Optimized program with code")
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,7 @@
+[project]
+name = "text-to-cypher-gepa"
+version = "0.1.0"
+description = "Text-to-Cypher DSPy program optimized with GEPA"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = ["datasets>=4.4.2", "dspy>=3.0.4", "modaic>=0.8.3", "neo4j~=5.18.0", "python-dotenv~=1.0.1", "sacrebleu>=2.5.1"]