Add MODAIC token support and fix init to accept kwargs

2025-12-17 10:52:44 -08:00
parent 316f16630f
commit 76b959d985
6 changed files with 102 additions and 10 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,92 @@
 Modaic and Weaviate
 Modaic is a hub to share and manage DSPy programs!
 You can learn more about the Modaic SDK here and check out programs on the hub here.
 This notebook will illustrate how to load the CrossEncoderRanker program from the Modaic Hub, as well as PromptToSignature.
 Further, check out ce_ranker.py and pyproject.toml to see how to publish your programs on the Modaic Hub!
 CrossEncoderRanker
 from dotenv import load_dotenv
 import os
 from modaic import AutoProgram
 load_dotenv()
 # This is looking for MODAIC_TOKEN, WEAVIATE_URL, WEAVIATE_API_KEY, and OPENAI_API_KEY
 ce_ranker = AutoProgram.from_precompiled(
    "connor/CrossEncoderRanker",
    config_options={ # replace this with your collection name and other custom parameters
        "collection_name": "IRPapersText_Default",
        "return_properties": ["content"],
        "k": 5
    }
 )
 response = ce_ranker("What does HyDE stand for?");
 for idx, ranked_doc in enumerate(response):
    print(f"Rank {idx+1}: {ranked_doc[:300]}")
 SSSS connor/CrossEncoderRanker
 Rank 1: Figure 1: An illustration of the HyDE model. Document snippets are shown. HyDE serves all types of queries without changing the underlying InstructGPT and Contriever/mContriever models.
 GPT-3 (Brown et al., 2020) models can be aligned to human intents to follow instructions faithfully.
 With these 
 Rank 2: |           | Scifact | FiQA  | DBPedia |
 |-----------|---------|-------|---------|
 | Contriever | 64.9    | 24.5  | 29.2    |
 | HyDE w/ InstructGPT | **69.1**  | 27.3  | 36.8    |
 | w/ GPT-3  | 65.9    | **27.9**  | **40.5**    |
 Table 5: nDCG@10 comparing InstructGPT vs. 3-shot GPT-3 on BEIR. Bes
 Rank 3: 6 Conclusion
 In this paper, we introduce HyDE, a new approach for building effective dense retrievers in a completely unsupervised manner, without the need for any relevance labels. We demonstrate that some aspects of relevance modeling can be delegated to a more powerful, flexible, and general-pur
 Rank 4: estimate Equation 5 by sampling N documents from g, [d̂1, d̂2, ..., d̂N]:
 v̂qij = 1/N ∑_(d̂k ∼ g(qij, INSTi)) f(d̂k)    (6)
      = 1/N ∑_(k=1)^N f(d̂k)                        (7)
 We also consider the query as a possible hypothesis:
 v̂qij = 1/(N+1) [(∑_(k=1)^N f(d̂k)) + f(qij)]      (8)
 Inner p
 Rank 5: |           | sw  | wo  | ko   | ja   | bn   |
 |-----------|-----|-----|------|------|------|
 | **Unsupervised**         |     |     |      |      |      |
 | BM25      | 38.9| 28.5| 21.2 | 41.8 |      |
 | mContriever| 38.3| 22.3| 19.5 | 35.3 |      |
 | HyDE      | 41.7| 30.6| 30.7 | 41.3 |      |
 | 
 PromptToSignature
 from dotenv import load_dotenv
 import os
 from modaic import AutoProgram
 load_dotenv() # This is looking for MODAIC_TOKEN and OPENROUTER_API_KEY
 agent = AutoProgram.from_precompiled(
    "fadeleke/prompt-to-signature",
    config_options={
        "lm": "openrouter/anthropic/claude-sonnet-4.5",
        "max_tokens": 32000,
        "temperature": 0.7,
    }
 )
 result = agent("Summarize a document and extract key entities")
 print(result)
 SSSS fadeleke/prompt-to-signature
 Signature generation took 0.00 seconds in inference.
 Prediction(
    signature_name='DocumentSummaryAndEntityExtraction',
    task_description='Extract a concise summary of a document and identify key entities (people, organizations, locations, dates, etc.) mentioned within it.',
    signature_fields=[GeneratedField(name='document', type=<FieldType.STRING: 'str'>, role=<FieldRole.INPUT: 'input'>, description='The document text to summarize and extract entities from', pydantic_model_schema=None, literal_values=None, default_value=None), GeneratedField(name='summary', type=<FieldType.STRING: 'str'>, role=<FieldRole.OUTPUT: 'output'>, description="A concise summary of the document's main points and content", pydantic_model_schema=None, literal_values=None, default_value=None), GeneratedField(name='key_entities', type=<FieldType.PYDANTIC_MODEL: 'pydantic'>, role=<FieldRole.OUTPUT: 'output'>, description='Structured extraction of key entities found in the document', pydantic_model_schema=PydanticModelSchema(model_name='KeyEntities', description='Container for extracted entities from the document', fields=[PydanticFieldDef(name='people', type=<FieldType.LIST_STRING: 'list[str]'>, description='Names of people mentioned in the document', required=False, literal_values=None, nested_model=None), PydanticFieldDef(name='organizations', type=<FieldType.LIST_STRING: 'list[str]'>, description='Names of organizations, companies, or institutions mentioned', required=False, literal_values=None, nested_model=None), PydanticFieldDef(name='locations', type=<FieldType.LIST_STRING: 'list[str]'>, description='Geographic locations, cities, countries, or regions mentioned', required=False, literal_values=None, nested_model=None), PydanticFieldDef(name='dates', type=<FieldType.LIST_STRING: 'list[str]'>, description='Important dates or time references mentioned in the document', required=False, literal_values=None, nested_model=None), PydanticFieldDef(name='topics', type=<FieldType.LIST_STRING: 'list[str]'>, description='Main topics or themes covered in the document', required=False, literal_values=None, nested_model=None)]), literal_values=None, default_value=None)],
    reasoning=None
 )
--- a/auto_classes.json
+++ b/auto_classes.json
@@ -1,4 +1,4 @@
 {
  "AutoConfig": "ce_ranker.CERankerConfig",
-  "AutoAgent": "ce_ranker.CERankerAgent"
+  "AutoProgram": "ce_ranker.CERankerAgent"
 }
--- a/ce_ranker.py
+++ b/ce_ranker.py
@@ -3,7 +3,7 @@ import os
 import asyncio
 import dspy
-from modaic import PrecompiledAgent, PrecompiledConfig
+from modaic import PrecompiledProgram, PrecompiledConfig
 import weaviate
 load_dotenv()
@@ -22,7 +22,7 @@ class CERankerConfig(PrecompiledConfig):
    lm: str = "openai/gpt-4.1-mini"
-class CERankerAgent(PrecompiledAgent):
+class CERankerAgent(PrecompiledProgram):
    config: CERankerConfig
    def __init__(self, config: CERankerConfig, **kwargs):
@@ -73,10 +73,9 @@ if __name__ == "__main__":
        return_properties=["content"],
        k=5
    )
-    agent = CERankerAgent(config)
+    reranker = CERankerAgent(config)
-    print(agent(query="What is HyDE?"))
+    reranker.push_to_hub(
-    agent.push_to_hub(
        "connor/CrossEncoderRanker",
        with_code=True,
-        commit_message="Fix init to accept kwargs"
+        commit_message="Add MODAIC token support and fix init to accept kwargs"
    )
--- a/config.json
+++ b/config.json
@@ -1,4 +1,5 @@
 {
+  "model": null,
  "collection_name": "IRPapersText_Default",
  "return_properties": [
    "content"
--- a/program.json
+++ b/program.json
@@ -28,7 +28,7 @@
  },
  "metadata": {
    "dependency_versions": {
-      "python": "3.11",
+      "python": "3.13",
      "dspy": "3.0.4",
      "cloudpickle": "3.1"
    }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "CrossEncoderRanker"
 version = "0.1.0"
-description = "Add your description here"
+description = "Cross encoder ranker using modaic"
 readme = "README.md"
 requires-python = ">=3.11"
-dependencies = ["modaic>=0.4.1", "weaviate-client>=4.18.1"]
+dependencies = ["modaic>=0.8.0", "weaviate-client>=4.18.1"]