1 Commits

Author SHA1 Message Date
86b1c6834d Optimized program with code 2025-12-27 07:14:57 -08:00
4 changed files with 163 additions and 0 deletions

4
auto_classes.json Normal file
View File

@@ -0,0 +1,4 @@
{
"AutoConfig": "modules.GenerateCypherConfig",
"AutoProgram": "modules.GenerateCypher"
}

74
modules.py Normal file
View File

@@ -0,0 +1,74 @@
import os
import dspy
from dotenv import load_dotenv
from modaic import PrecompiledProgram, PrecompiledConfig
load_dotenv()
# DSPy signature for the text-to-Cypher task: two input fields (question, neo4j_schema)
# and one output field (statement).
# NOTE(review): DSPy presumably uses the class docstring and the `desc=` strings below as
# prompt text sent to the model, so treat them as runtime content — confirm before rewording.
class CypherFromQuestion(dspy.Signature):
"""Task: Generate Cypher statement to query a graph database.
Instructions: Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided in the schema.
Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.
"""
# Input: the natural-language question to translate.
question = dspy.InputField(
desc="Question to model using a cypher statement. Use only the provided relationship types and properties in the schema."
)
# Input: description of the graph schema the statement must conform to.
neo4j_schema = dspy.InputField(
desc="Current graph schema in Neo4j as a list of NODES and RELATIONSHIPS."
)
# Output: the generated Cypher text.
statement = dspy.OutputField(desc="Cypher statement to query the graph database.")
# Configuration for GenerateCypher; both fields are forwarded to dspy.LM in its __init__.
class GenerateCypherConfig(PrecompiledConfig):
# Model identifier passed as dspy.LM(model=...).
model: str = "openrouter/openai/gpt-4o" # OPENROUTER ONLY
# Token limit passed as dspy.LM(max_tokens=...).
max_tokens: int = 1024
class GenerateCypher(PrecompiledProgram):
    """Program that answers a question with a Cypher statement.

    Wraps a ``dspy.ChainOfThought`` predictor over ``CypherFromQuestion`` and
    binds it to a language model built from the supplied config.
    """

    config: GenerateCypherConfig

    def __init__(self, config: GenerateCypherConfig, **kwargs):
        """Build the language model and the predictor.

        Args:
            config: Supplies ``model`` and ``max_tokens`` for ``dspy.LM``.
            **kwargs: Forwarded unchanged to ``PrecompiledProgram.__init__``.
        """
        super().__init__(config=config, **kwargs)

        # The API base is fixed to the OpenRouter endpoint regardless of config.
        lm_settings = {
            "model": config.model,
            "max_tokens": config.max_tokens,
            "api_base": "https://openrouter.ai/api/v1",
        }
        self.lm = dspy.LM(**lm_settings)

        self.generate_cypher = dspy.ChainOfThought(CypherFromQuestion)
        # Bind the LM to this predictor explicitly.
        self.generate_cypher.set_lm(self.lm)

    def forward(self, question: str, neo4j_schema: list[str]):
        """Run the predictor and return its result.

        Args:
            question: Natural-language question to translate.
            neo4j_schema: Graph schema handed to the signature's
                ``neo4j_schema`` input field.

        Returns:
            Whatever the ``ChainOfThought`` predictor returns for these inputs.
        """
        inputs = {"question": question, "neo4j_schema": neo4j_schema}
        return self.generate_cypher(**inputs)
# Module-level program instance built from the default config; optimize.py
# imports this name.
generate_cypher = GenerateCypher(GenerateCypherConfig())

if __name__ == "__main__":
    # The triple-quoted block below is an inert string literal holding a
    # disabled ingestion example; it is never executed. Its text is kept
    # exactly as found, which is why it is not indented.
    """
from pathlib import Path
import json
examples_path = Path(__file__).parent / "examples" / "wikipedia-abstracts-v0_0_1.ndjson"
with open(examples_path, "r") as f:
for line in f:
data = json.loads(line)
text = data["text"]
print("TEXT TO PROCESS:\n", text[:50])
cypher = generate_cypher(text=text, neo4j_schema=neo4j.fmt_schema())
neo4j.query(cypher.statement.replace('```', ''))
print("CYPHER STATEMENT:\n", cypher.statement)
schema = neo4j.fmt_schema()
print("SCHEMA:\n", schema)
"""
    # Publish the un-optimized program together with its source code.
    hub_options = {
        "with_code": True,
        "tag": "v0.0.9",
        "commit_message": "Update README.md",
    }
    generate_cypher.push_to_hub("farouk1/text-to-cypher", **hub_options)

78
optimize.py Normal file
View File

@@ -0,0 +1,78 @@
import dspy
from dspy import GEPA
from modules import generate_cypher
from datasets import load_dataset
def _to_example(row):
    """Convert one dataset row into a ``dspy.Example`` for the Cypher program."""
    return dspy.Example(
        {
            "question": row["question"],
            "neo4j_schema": row["schema"],
            "expected_cypher": row["cypher"],
        }
    ).with_inputs("question", "neo4j_schema")


def process_dataset():
    """Build the train, validation and test example lists.

    The dataset is loaded once. A seeded sample of up to 200 training rows is
    split in half into train and validation sets; the test set is the first
    200 rows of the test split.

    Returns:
        A ``(train_set, val_set, test_set)`` tuple of lists of
        ``dspy.Example`` with ``question`` and ``neo4j_schema`` marked as
        inputs and ``expected_cypher`` carried as the label.
    """
    import random
    from itertools import islice

    sample_size = 200

    # Load once; the original fetched the same dataset twice.
    dataset = load_dataset("neo4j/text2cypher-2025v1")
    train_rows = dataset["train"]
    test_rows = dataset["test"]

    # Shuffle row indices instead of converted examples. The permutation
    # produced by Random(0).shuffle depends only on the list length, so this
    # selects exactly the rows a shuffle of the full example list would,
    # while converting only the rows that are kept.
    order = list(range(len(train_rows)))
    random.Random(0).shuffle(order)
    sampled = [_to_example(train_rows[i]) for i in order[:sample_size]]

    half = int(0.5 * len(sampled))
    train_set = sampled[:half]
    val_set = sampled[half:]

    # Only the first rows of the test split are used, so stop iterating there.
    test_set = [_to_example(row) for row in islice(test_rows, sample_size)]
    return train_set, val_set, test_set
def metric(example, prediction, trace=None, pred_name=None, pred_trace=None):
    """Score a generated Cypher statement against the reference with BLEU.

    Args:
        example: Item providing the reference under ``"expected_cypher"``.
        prediction: Program output; its ``statement`` attribute is scored.
        trace: Accepted for the optimizer's calling convention; unused.
        pred_name: Accepted for the optimizer's calling convention; unused.
        pred_trace: Accepted for the optimizer's calling convention; unused.

    Returns:
        ``dspy.Prediction`` with ``score`` (sentence BLEU scaled to 0-1) and a
        short textual ``feedback``. A missing, non-string or blank statement
        scores 0.0 instead of raising.
    """
    from sacrebleu import sentence_bleu

    expected_cypher = example["expected_cypher"]
    generated_cypher = getattr(prediction, "statement", None)

    # A prediction without a usable statement would make the scorer fail and
    # abort the optimization run; treat it as the worst possible score.
    if not isinstance(generated_cypher, str) or not generated_cypher.strip():
        return dspy.Prediction(
            score=0.0,
            feedback="BLEU score: 0.000 (no Cypher statement was produced)",
        )

    # sacreBLEU sentence-level BLEU; it reports 0-100, so rescale to 0-1.
    bleu_score = sentence_bleu(
        generated_cypher,
        [expected_cypher],  # references are passed as a list
    ).score / 100.0

    feedback = f"BLEU score: {bleu_score:.3f}"
    return dspy.Prediction(score=bleu_score, feedback=feedback)
# The splits are built when this module is imported, not only when it is run
# as a script.
train_set, val_set, test_set = process_dataset()

# GEPA optimizer scored by `metric`, with a separate LM used for reflection.
optimizer = GEPA(
    metric=metric,
    auto="light",
    num_threads=32,
    track_stats=True,
    reflection_minibatch_size=3,
    reflection_lm=dspy.LM(
        model="gpt-5.2",
        temperature=1.0,
        max_tokens=32000,
    ),
)

if __name__ == "__main__":
    # Optimize the imported program on the train/validation halves, then
    # publish the result.
    compile_inputs = {"trainset": train_set, "valset": val_set}
    optimized_program = optimizer.compile(generate_cypher, **compile_inputs)
    optimized_program.push_to_hub(
        "farouk1/text-to-cypher-gepa",
        tag="v1.0.1",
        commit_message="Optimized program with code",
    )

7
pyproject.toml Normal file
View File

@@ -0,0 +1,7 @@
[project]
name = "text-to-cypher-gepa"
version = "0.1.0"
description = "DSPy text-to-Cypher program for Neo4j, optimized with GEPA"
readme = "README.md"
requires-python = ">=3.13"
dependencies = ["datasets>=4.4.2", "dspy>=3.0.4", "modaic>=0.8.3", "neo4j~=5.18.0", "python-dotenv~=1.0.1", "sacrebleu>=2.5.1"]