(no commit message)

2025-10-06 21:49:50 -07:00
parent 8a2608f427
commit d0dbad868e
11 changed files with 1064 additions and 9 deletions
--- a/README.md
+++ b/README.md
@@ -48,14 +48,18 @@ Once you have a `dataset.jsonl` file, you can optimize the agent with dspy's bui
 1. Create a file called `compile.py` with the following code. Replace `<your-username>` with your modaic username.

 ```python
+from sqlalchemy.sql import true
 from dotenv import load_dotenv
 import os
 import dspy
 import json
 from dspy import Prediction, Example
-from modaic import AutoAgent
+from typing import Optional, Tuple

-searcher = AutoAgent.from_precompiled("swagginty/persana-lead-gen", api_key=os.getenv("PERSANA_KEY"))
+
+# Load the precompiled agent identified by "swagginty/persana-lead-gen". The API
+# key is read from the PERSANA_KEY environment variable when this module-level
+# statement runs.
+# NOTE(review): `load_dotenv()` is only called later, inside the `__main__`
+# block, so a key defined only in `.env` is not visible here yet - confirm the
+# intended order.
+# NOTE(review): this change removes `from modaic import AutoAgent` while
+# `AutoAgent` is still referenced here - confirm the import is restored.
+searcher = AutoAgent.from_precompiled(
+    "swagginty/persana-lead-gen", api_key=os.getenv("PERSANA_KEY"), train=True
+)
 feedback_creator = searcher.feedback_creator


@@ -66,11 +70,103 @@ class SearchExample(Example):


 class SearchPrediction(Prediction):
-    profiles: list[dict]
+    profiles: Optional[list[dict]]
    search_parameters: dict


-def evaluate_results(
+def any_in(list: list[str], string: str) -> bool:
+    """Return True if any item of *list* occurs in *string*, ignoring case.
+
+    Matching is by substring, so ``"eng"`` matches ``"Senior Engineer"``.
+    An empty *list*, or a *string* that is ``None`` or empty, yields False.
+
+    NOTE: the first parameter shadows the builtin ``list``; the name is kept
+    so existing keyword callers keep working.
+    """
+    # Profile fields can be absent (None); treat that as "nothing to match"
+    # instead of raising AttributeError. Lower-case the haystack only once.
+    haystack = (string or "").lower()
+    return any(item.lower() in haystack for item in list)
+
+
+def include_profile(
+    search_parameters: dict,
+    profile: dict,
+) -> Tuple[bool, Optional[str]]:
+    """Check a profile against the positive ("include") search filters.
+
+    Filters are checked in this order: ``include_job_titles``,
+    ``include_companies``, ``company_types``, ``company_include_keywords``.
+    A filter that is missing or empty in *search_parameters* is skipped, and
+    the profile's ``experience_data`` is only read for active filters.
+
+    Returns:
+        ``(True, None)`` when the profile satisfies every active filter,
+        otherwise ``(False, reason)`` where *reason* names the first filter
+        the profile failed.
+    """
+    wanted_titles = search_parameters.get("include_job_titles")
+    if wanted_titles:
+        title_ok = any_in(wanted_titles, profile["experience_data"]["title"])
+        if not title_ok:
+            return (
+                False,
+                f"include_job_titles: {wanted_titles} not in {profile}['experience_data']['title']",
+            )
+
+    wanted_companies = search_parameters.get("include_companies")
+    if wanted_companies:
+        company_ok = any_in(
+            wanted_companies, profile["experience_data"]["company_name"]
+        )
+        if not company_ok:
+            return (
+                False,
+                f"include_companies: {wanted_companies} not in {profile}['experience_data']['company_name']",
+            )
+
+    wanted_types = search_parameters.get("company_types")
+    if wanted_types:
+        type_ok = any_in(wanted_types, profile["experience_data"]["company_type"])
+        if not type_ok:
+            return (
+                False,
+                f"company_types: {wanted_types} not in {profile}['experience_data']['company_type']",
+            )
+
+    wanted_keywords = search_parameters.get("company_include_keywords")
+    if wanted_keywords:
+        # A keyword hit in either the headline or the description is enough;
+        # the description is only consulted when the headline does not match.
+        keyword_ok = any_in(
+            wanted_keywords, profile["experience_data"]["company_company_headline"]
+        ) or any_in(wanted_keywords, profile["experience_data"]["company_description"])
+        if not keyword_ok:
+            return (
+                False,
+                f"company_include_keywords: {wanted_keywords} not in {profile}['experience_data']['company_company_headline'] or {profile}['experience_data']['company_description']",
+            )
+
+    return True, None
+
+
+def exclude_profile(
+    search_parameters: dict,
+    profile: dict,
+) -> Tuple[bool, Optional[str]]:
+    """Check a profile against the negative ("exclude") search filters.
+
+    Filters are checked in this order: ``exclude_job_titles``,
+    ``exclude_companies``, ``company_exclude_keywords``. A filter that is
+    missing or empty in *search_parameters* is skipped.
+
+    Returns:
+        ``(True, reason)`` when the first matching exclusion filter is found,
+        otherwise ``(False, None)``.
+    """
+    if (titles := search_parameters.get("exclude_job_titles")) and (
+        any_in(titles, profile["experience_data"]["title"])
+    ):
+        return (
+            True,
+            f"exclude_job_titles: {titles} in {profile}['experience_data']['title']",
+        )
+    if (companies := search_parameters.get("exclude_companies")) and (
+        any_in(companies, profile["experience_data"]["company_name"])
+    ):
+        return (
+            True,
+            f"exclude_companies: {companies} in {profile}['experience_data']['company_name']",
+        )
+    # An excluded keyword in EITHER the headline or the description rules the
+    # profile out. This mirrors include_profile, where a hit in either field
+    # counts, and matches the "or" wording of the reason below. Requiring both
+    # fields to match would let profiles through that mention the keyword in
+    # only one place.
+    if (company_keywords := search_parameters.get("company_exclude_keywords")) and (
+        any_in(company_keywords, profile["experience_data"]["company_company_headline"])
+        or any_in(company_keywords, profile["experience_data"]["company_description"])
+    ):
+        return (
+            True,
+            f"company_exclude_keywords: {company_keywords} in {profile}['experience_data']['company_company_headline'] or {profile}['experience_data']['company_description']",
+        )
+    return False, None
+
+
+def get_search_eval(
+    search_parameters: dict,
+    target_profiles: list[dict],
+) -> Tuple[float, list[str]]:
+    """Score how many target profiles the search parameters would keep.
+
+    A profile counts as kept when ``include_profile`` accepts it and
+    ``exclude_profile`` does not reject it.
+
+    Returns:
+        ``(score, reasons)`` where *score* is the fraction of
+        *target_profiles* that are kept and *reasons* holds one message per
+        profile that was filtered out. An empty *target_profiles* yields
+        ``(0.0, [])`` instead of dividing by zero.
+    """
+    if not target_profiles:
+        return 0.0, []
+    kept = 0
+    miss_reasons = []
+    for profile in target_profiles:
+        passed, reason = include_profile(search_parameters, profile)
+        if passed:
+            # Only profiles that pass the include filters are checked against
+            # the exclude filters; the reason then comes from the exclusion.
+            excluded, reason = exclude_profile(search_parameters, profile)
+            if not excluded:
+                kept += 1
+        if reason:
+            miss_reasons.append(reason)
+    return kept / len(target_profiles), miss_reasons
+
+
+def evaluate_results_expensive(
    target: SearchExample,
    predictied: SearchPrediction,
    trace=None,
@@ -86,7 +182,6 @@ def evaluate_results(
    for t_result in target.selected_profiles:
        if t_result["profile_id"] in pred_ids:
            count += 1
-
    score = count / len(target.selected_profiles)

    target_ids = {result["profile_id"] for result in target.selected_profiles}
@@ -113,6 +208,30 @@ def evaluate_results(
    )


+def evaluate_results_cheap(
+    target: SearchExample,
+    predictied: SearchPrediction,
+    trace=None,
+    pred_name=None,
+    pred_trace=None,
+) -> Prediction:
+    """Score predicted search parameters against the target profiles.
+
+    The predicted ``search_parameters`` are applied to
+    ``target.selected_profiles`` via ``get_search_eval``; no search is run
+    inside this function. ``trace``, ``pred_name`` and ``pred_trace`` are
+    accepted but not used here.
+
+    Returns:
+        A ``Prediction`` with ``score`` (fraction of target profiles kept by
+        the filters) and ``feedback`` (the reasons profiles were filtered out).
+    """
+    score, miss_reasons = get_search_eval(
+        predictied.search_parameters, target.selected_profiles
+    )
+    if miss_reasons:
+        feedback = (
+            "The model failed to retrieve the following profiles in the search: "
+            + ", ".join(str(reason) for reason in miss_reasons)
+        )
+    else:
+        # Do not report a failure followed by an empty list when nothing was
+        # filtered out; that would mislead whoever reads the feedback.
+        feedback = (
+            "No target profiles were filtered out by the predicted search parameters."
+        )
+    return Prediction(
+        score=score,
+        feedback=feedback,
+    )
+
+
 if __name__ == "__main__":
    load_dotenv()

@@ -125,14 +244,19 @@ if __name__ == "__main__":
        ).with_inputs("company_description", "target_customer")
        for e in data
    ]
+
    compiler = dspy.GEPA(
-        metric=evaluate_results,
-        auto="light",
+        metric=evaluate_results_cheap,
        reflection_lm=dspy.LM("openai/gpt-5", temperature=1.0, max_tokens=32000),
+        auto="light",
+    )
+    compiled_searcher = compiler.compile(
+        searcher,
+        trainset=trainset,
    )
-    compiled_searcher = compiler.compile(searcher, trainset=trainset)
    compiled_searcher.save("compiled_searcher.json")
-    compiled_searcher.push_to_hub("<your-username>/persana-lead-gen") # Replace <your-username> with your username
+    compiled_searcher.push_to_hub("<your-username>/persana-lead-gen", with_code=True)
+
 ```

 2. Run the file