diff --git a/README.md b/README.md index e69de29..30ded3a 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,92 @@ +Modaic and Weaviate +Modaic is a hub to share and manage DSPy programs! + +You can learn more about the Modaic SDK here and check out programs on the Modaic Hub here. + +This notebook will illustrate how to load the CrossEncoderRanker program from the Modaic Hub, as well as PromptToSignature. + +Further, check out ce_ranker.py and pyproject.toml to see how to publish your programs on the Modaic Hub! + +CrossEncoderRanker +from dotenv import load_dotenv +import os +from modaic import AutoProgram + +load_dotenv() + +# This is looking for MODAIC_TOKEN, WEAVIATE_URL, WEAVIATE_API_KEY, and OPENAI_API_KEY + +ce_ranker = AutoProgram.from_precompiled( + "connor/CrossEncoderRanker", + config_options={ # replace this with your collection name and other custom parameters + "collection_name": "IRPapersText_Default", + "return_properties": ["content"], + "k": 5 + } +) + +response = ce_ranker("What does HyDE stand for?"); + +for idx, ranked_doc in enumerate(response): + print(f"Rank {idx+1}: {ranked_doc[:300]}") +SSSS connor/CrossEncoderRanker +Rank 1: Figure 1: An illustration of the HyDE model. Document snippets are shown. HyDE serves all types of queries without changing the underlying InstructGPT and Contriever/mContriever models. + +GPT-3 (Brown et al., 2020) models can be aligned to human intents to follow instructions faithfully. + +With these +Rank 2: | | Scifact | FiQA | DBPedia | +|-----------|---------|-------|---------| +| Contriever | 64.9 | 24.5 | 29.2 | +| HyDE w/ InstructGPT | **69.1** | 27.3 | 36.8 | +| w/ GPT-3 | 65.9 | **27.9** | **40.5** | + +Table 5: nDCG@10 comparing InstructGPT vs. 3-shot GPT-3 on BEIR. Bes +Rank 3: 6 Conclusion + +In this paper, we introduce HyDE, a new approach for building effective dense retrievers in a completely unsupervised manner, without the need for any relevance labels. 
We demonstrate that some aspects of relevance modeling can be delegated to a more powerful, flexible, and general-pur +Rank 4: estimate Equation 5 by sampling N documents from g, [d̂1, d̂2, ..., d̂N]: + +v̂qij = 1/N ∑_(d̂k ∼ g(qij, INSTi)) f(d̂k) (6) + + = 1/N ∑_(k=1)^N f(d̂k) (7) + +We also consider the query as a possible hypothesis: + +v̂qij = 1/(N+1) [(∑_(k=1)^N f(d̂k)) + f(qij)] (8) + +Inner p +Rank 5: | | sw | wo | ko | ja | bn | +|-----------|-----|-----|------|------|------| +| **Unsupervised** | | | | | | +| BM25 | 38.9| 28.5| 21.2 | 41.8 | | +| mContriever| 38.3| 22.3| 19.5 | 35.3 | | +| HyDE | 41.7| 30.6| 30.7 | 41.3 | | +| +PromptToSignature +from dotenv import load_dotenv +import os +from modaic import AutoProgram + +load_dotenv() # This is looking for MODAIC_TOKEN and OPENROUTER_API_KEY + +agent = AutoProgram.from_precompiled( + "fadeleke/prompt-to-signature", + config_options={ + "lm": "openrouter/anthropic/claude-sonnet-4.5", + "max_tokens": 32000, + "temperature": 0.7, + } +) + +result = agent("Summarize a document and extract key entities") + +print(result) +SSSS fadeleke/prompt-to-signature +Signature generation took 0.00 seconds in inference. +Prediction( + signature_name='DocumentSummaryAndEntityExtraction', + task_description='Extract a concise summary of a document and identify key entities (people, organizations, locations, dates, etc.) 
mentioned within it.', + signature_fields=[GeneratedField(name='document', type=, role=, description='The document text to summarize and extract entities from', pydantic_model_schema=None, literal_values=None, default_value=None), GeneratedField(name='summary', type=, role=, description="A concise summary of the document's main points and content", pydantic_model_schema=None, literal_values=None, default_value=None), GeneratedField(name='key_entities', type=, role=, description='Structured extraction of key entities found in the document', pydantic_model_schema=PydanticModelSchema(model_name='KeyEntities', description='Container for extracted entities from the document', fields=[PydanticFieldDef(name='people', type=, description='Names of people mentioned in the document', required=False, literal_values=None, nested_model=None), PydanticFieldDef(name='organizations', type=, description='Names of organizations, companies, or institutions mentioned', required=False, literal_values=None, nested_model=None), PydanticFieldDef(name='locations', type=, description='Geographic locations, cities, countries, or regions mentioned', required=False, literal_values=None, nested_model=None), PydanticFieldDef(name='dates', type=, description='Important dates or time references mentioned in the document', required=False, literal_values=None, nested_model=None), PydanticFieldDef(name='topics', type=, description='Main topics or themes covered in the document', required=False, literal_values=None, nested_model=None)]), literal_values=None, default_value=None)], + reasoning=None +) \ No newline at end of file diff --git a/auto_classes.json b/auto_classes.json index 7031b8b..222686b 100644 --- a/auto_classes.json +++ b/auto_classes.json @@ -1,4 +1,4 @@ { "AutoConfig": "ce_ranker.CERankerConfig", - "AutoAgent": "ce_ranker.CERankerAgent" + "AutoProgram": "ce_ranker.CERankerAgent" } \ No newline at end of file diff --git a/ce_ranker.py b/ce_ranker.py index e848909..51246ed 100644 --- 
a/ce_ranker.py +++ b/ce_ranker.py @@ -3,7 +3,7 @@ import os import asyncio import dspy -from modaic import PrecompiledAgent, PrecompiledConfig +from modaic import PrecompiledProgram, PrecompiledConfig import weaviate load_dotenv() @@ -22,7 +22,7 @@ class CERankerConfig(PrecompiledConfig): lm: str = "openai/gpt-4.1-mini" -class CERankerAgent(PrecompiledAgent): +class CERankerAgent(PrecompiledProgram): config: CERankerConfig def __init__(self, config: CERankerConfig, **kwargs): @@ -73,10 +73,9 @@ if __name__ == "__main__": return_properties=["content"], k=5 ) - agent = CERankerAgent(config) - print(agent(query="What is HyDE?")) - agent.push_to_hub( + reranker = CERankerAgent(config) + reranker.push_to_hub( "connor/CrossEncoderRanker", with_code=True, - commit_message="Fix init to accept kwargs" + commit_message="Add MODAIC token support and fix init to accept kwargs" ) \ No newline at end of file diff --git a/config.json b/config.json index 1f67b53..175f7ec 100644 --- a/config.json +++ b/config.json @@ -1,4 +1,5 @@ { + "model": null, "collection_name": "IRPapersText_Default", "return_properties": [ "content" diff --git a/agent.json b/program.json similarity index 96% rename from agent.json rename to program.json index 51bdb7a..afd8013 100644 --- a/agent.json +++ b/program.json @@ -28,7 +28,7 @@ }, "metadata": { "dependency_versions": { - "python": "3.11", + "python": "3.13", "dspy": "3.0.4", "cloudpickle": "3.1" } diff --git a/pyproject.toml b/pyproject.toml index a7d5bba..1817b87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "CrossEncoderRanker" version = "0.1.0" -description = "Add your description here" +description = "Cross encoder ranker using modaic" readme = "README.md" requires-python = ">=3.11" -dependencies = ["modaic>=0.4.1", "weaviate-client>=4.18.1"] +dependencies = ["modaic>=0.8.0", "weaviate-client>=4.18.1"]