(no commit message)

2025-08-17 05:38:31 -04:00
parent a9bbdd17a8
commit fb8d33b76f
10 changed files with 588 additions and 0 deletions


@@ -1,2 +1,3 @@
# TableRAG
Wassup fool

__main__.py Normal file

@@ -0,0 +1,12 @@
from agent.agent import TableRAGAgent, TableRAGIndexer, TableRAGConfig
from modaic.databases import MilvusVDBConfig, SQLiteConfig

config = TableRAGConfig()
indexer = TableRAGIndexer(
    config=config,
    vdb_config=MilvusVDBConfig.from_local("index2.db"),
    sql_config=SQLiteConfig(db_path="tables.db"),
)
agent = TableRAGAgent(config=config, indexer=indexer)
agent.push_to_hub("swagginty/TableRAG")
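
Once published, auto_classes.json (below) maps AutoConfig, AutoAgent, and AutoIndexer to these classes, so the agent can be pulled back down. A minimal round-trip sketch, assuming modaic exposes Hugging Face-style AutoAgent/AutoIndexer loaders with a from_precompiled method; the loader names are assumptions, not confirmed modaic API:

# Hypothetical sketch: AutoAgent/AutoIndexer.from_precompiled are assumed by
# analogy with the AutoConfig/AutoAgent/AutoIndexer entries in auto_classes.json.
from modaic import AutoAgent, AutoIndexer
from modaic.databases import MilvusVDBConfig, SQLiteConfig

indexer = AutoIndexer.from_precompiled(
    "swagginty/TableRAG",
    vdb_config=MilvusVDBConfig.from_local("index2.db"),
    sql_config=SQLiteConfig(db_path="tables.db"),
)
agent = AutoAgent.from_precompiled("swagginty/TableRAG", indexer=indexer)
print(agent(user_query="Who is the New Zealand Parliament Member for Canterbury?"))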

agent.json Normal file

@@ -0,0 +1,231 @@
{
"main.react": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Next, you will complete a table-related question answering task. Based on the provided materials such as the table content (in Markdown format), you need to analyze the User Query.\nAnd try to decide whether the User Input Query should be broken down into subqueries. You are provided with \"solve_subquery\" tool that can get answer for the subqueries.\nAfter you have collected sufficient information, you need to generate comprehensive answers.\n\nInstructions:\n1. Carefully analyze each user query through step-by-step reasoning.\n2. If the query needs information more than the given table content\n - Decompose the query into subqueries.\n - Process one subquery at a time.\n - Use \"solve_subquery\" tool to get answers for each subquey.\n3. If a query can be answered by table content, do not decompose it. And directly put the orignal query into the \"solve_subquery\" tool.\n The \"solve_subquery\" tool utilizes SQL execution inside, it can solve complex subquery on table through one tool call.\n4. Generate exactly ONE subquery at a time.\n5. Write out all terms completely - avoid using abbreviations.\n6. When you have sufficient information, provide the final answer in the following format:\n <Answer>: [your complete response]\nPlease start!\n\nYou are an Agent. In each episode, you will be given the fields `table_content`, `user_input_query` as input. And you can see your past trajectory so far.\nYour goal is to use one or more of the supplied tools to collect any necessary information for producing `answer`.\n\nTo do this, you will interleave next_thought, next_tool_name, and next_tool_args in each turn, and also when finishing the task.\nAfter each tool call, you receive a resulting observation, which gets appended to your trajectory.\n\nWhen writing next_thought, you may reason about the current situation and plan for future steps.\nWhen selecting the next_tool_name and its next_tool_args, the tool must be one of:\n\n(1) solve_subquery, whose description is <desc> Solves a natural language subqeury using the SQL exectution. <\/desc>. It takes arguments {'sub_query': {'type': 'string'}}.\n(2) finish, whose description is <desc>Marks the task as complete. That is, signals that all information for producing the outputs, i.e. `answer`, are now available to be extracted.<\/desc>. It takes arguments {}.\nWhen providing `next_tool_args`, the value inside the field must be in JSON format",
"fields": [
{
"prefix": "Table Content:",
"description": "${table_content}"
},
{
"prefix": "User Input Query:",
"description": "${user_input_query}"
},
{
"prefix": "Trajectory:",
"description": "${trajectory}"
},
{
"prefix": "Next Thought:",
"description": "${next_thought}"
},
{
"prefix": "Next Tool Name:",
"description": "${next_tool_name}"
},
{
"prefix": "Next Tool Args:",
"description": "${next_tool_args}"
}
]
},
"lm": {
"model": "openai\/gpt-4o-mini",
"model_type": "chat",
"cache": true,
"cache_in_memory": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.0,
"max_tokens": 4000
}
},
"main.extract.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Next, you will complete a table-related question answering task. Based on the provided materials such as the table content (in Markdown format), you need to analyze the User Query.\nAnd try to decide whether the User Input Query should be broken down into subqueries. You are provided with \"solve_subquery\" tool that can get answer for the subqueries.\nAfter you have collected sufficient information, you need to generate comprehensive answers.\n\nInstructions:\n1. Carefully analyze each user query through step-by-step reasoning.\n2. If the query needs information more than the given table content\n - Decompose the query into subqueries.\n - Process one subquery at a time.\n - Use \"solve_subquery\" tool to get answers for each subquey.\n3. If a query can be answered by table content, do not decompose it. And directly put the orignal query into the \"solve_subquery\" tool.\n The \"solve_subquery\" tool utilizes SQL execution inside, it can solve complex subquery on table through one tool call.\n4. Generate exactly ONE subquery at a time.\n5. Write out all terms completely - avoid using abbreviations.\n6. When you have sufficient information, provide the final answer in the following format:\n <Answer>: [your complete response]\nPlease start!",
"fields": [
{
"prefix": "Table Content:",
"description": "${table_content}"
},
{
"prefix": "User Input Query:",
"description": "${user_input_query}"
},
{
"prefix": "Trajectory:",
"description": "${trajectory}"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Answer:",
"description": "${answer}"
}
]
},
"lm": {
"model": "openai\/gpt-4o-mini",
"model_type": "chat",
"cache": true,
"cache_in_memory": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.0,
"max_tokens": 4000
}
},
"nl2sql.react": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "You are an expert in SQL and can generate SQL statements based on table schemas and query requirements.\nRespond as concisely as possible, providing only the SQL statement without any additional explanations.\n\nYou are an Agent. In each episode, you will be given the fields `schema_list`, `user_query` as input. And you can see your past trajectory so far.\nYour goal is to use one or more of the supplied tools to collect any necessary information for producing `answer`.\n\nTo do this, you will interleave next_thought, next_tool_name, and next_tool_args in each turn, and also when finishing the task.\nAfter each tool call, you receive a resulting observation, which gets appended to your trajectory.\n\nWhen writing next_thought, you may reason about the current situation and plan for future steps.\nWhen selecting the next_tool_name and its next_tool_args, the tool must be one of:\n\n(1) sql_query, whose description is <desc> Query the sql database and get the result as a string. Args: query: The sql query to execute. Returns: The result of the sql query as a string. <\/desc>. It takes arguments {'query': {'type': 'string'}}.\n(2) finish, whose description is <desc>Marks the task as complete. That is, signals that all information for producing the outputs, i.e. `answer`, are now available to be extracted.<\/desc>. It takes arguments {}.\nWhen providing `next_tool_args`, the value inside the field must be in JSON format",
"fields": [
{
"prefix": "Schema List:",
"description": "Based on the schemas please use MySQL syntax to the user's query"
},
{
"prefix": "User Query:",
"description": "The user's query"
},
{
"prefix": "Trajectory:",
"description": "${trajectory}"
},
{
"prefix": "Next Thought:",
"description": "${next_thought}"
},
{
"prefix": "Next Tool Name:",
"description": "${next_tool_name}"
},
{
"prefix": "Next Tool Args:",
"description": "${next_tool_args}"
}
]
},
"lm": {
"model": "openai\/gpt-4o-mini",
"model_type": "chat",
"cache": true,
"cache_in_memory": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.0,
"max_tokens": 4000
}
},
"nl2sql.extract.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "You are an expert in SQL and can generate SQL statements based on table schemas and query requirements.\nRespond as concisely as possible, providing only the SQL statement without any additional explanations.",
"fields": [
{
"prefix": "Schema List:",
"description": "Based on the schemas please use MySQL syntax to the user's query"
},
{
"prefix": "User Query:",
"description": "The user's query"
},
{
"prefix": "Trajectory:",
"description": "${trajectory}"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Answer:",
"description": "Answer to the user's query"
}
]
},
"lm": {
"model": "openai\/gpt-4o-mini",
"model_type": "chat",
"cache": true,
"cache_in_memory": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.0,
"max_tokens": 4000
}
},
"subquery_summarizer": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "You are about to complete a table-based question answernig task using the following two types of reference materials:\n\nNote:\n1. The markdown table content in Original Content may be incomplete.\n2. You should cross-validate the given two materials:\n - if the answers are the same, directly output the answer.\n - if the \"SQL execution result\" contains error or is empty, you should try to answer based on the Original Content.\n - if the two materials shows conflit, you should think about each of them, and finally give an answer.",
"fields": [
{
"prefix": "Original Content:",
"description": "Content 1: Original content (table content is provided in Markdown format)"
},
{
"prefix": "Table Schema:",
"description": "The user given table schema"
},
{
"prefix": "Gnerated Sql:",
"description": "SQL generated based on the schema and the user question"
},
{
"prefix": "Sql Execute Result:",
"description": "SQL execution results"
},
{
"prefix": "User Query:",
"description": "The user's question"
},
{
"prefix": "Answer:",
"description": "Answer to the user's question"
}
]
},
"lm": {
"model": "openai\/gpt-4o-mini",
"model_type": "chat",
"cache": true,
"cache_in_memory": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.0,
"max_tokens": 4000
}
},
"metadata": {
"dependency_versions": {
"python": "3.11",
"dspy": "3.0.1",
"cloudpickle": "3.1"
}
}
}
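
agent.json is the serialized DSPy program state: one entry per submodule (main.react, main.extract.predict, nl2sql.react, nl2sql.extract.predict, subquery_summarizer), each carrying empty traces/train/demos, the signature text, and gpt-4o-mini LM settings, plus dependency metadata. A minimal sketch of producing and restoring such a state file, assuming the PrecompiledAgent inherits DSPy 3.x-style save/load state methods (the exact hook behind push_to_hub is an assumption):

# Hypothetical sketch: assumes DSPy-style save()/load() state (de)serialization;
# agent and indexer construction mirrors __main__.py above.
agent = TableRAGAgent(config=TableRAGConfig(), indexer=indexer)
agent.save("agent.json")  # writes signatures, demos, and LM settings as JSON

fresh = TableRAGAgent(config=TableRAGConfig(), indexer=indexer)
fresh.load("agent.json")  # rehydrates the saved prompt and LM state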

agent/__init__.py Normal file

agent/agent.py Normal file

@@ -0,0 +1,140 @@
from modaic.precompiled_agent import PrecompiledConfig, PrecompiledAgent
from modaic.context import Table
from typing import Type, Optional
import dspy
from agent.indexer import TableRAGIndexer
import json
from modaic.databases import (
VectorDatabase,
MilvusVDBConfig,
SearchResult,
SQLDatabase,
SQLiteConfig,
)
import os
# import utils.google_api as google_api
# import utils.outlook_api as outlook_api
# import utils.zoom_api as zoom_api
from agent.config import TableRAGConfig


# Signatures
class NL2SQL(dspy.Signature):
"""You are an expert in SQL and can generate SQL statements based on table schemas and query requirements.
Respond as concisely as possible, providing only the SQL statement without any additional explanations."""
schema_list = dspy.InputField(
desc="Based on the schemas please use MySQL syntax to the user's query"
)
user_query = dspy.InputField(desc="The user's query")
    answer = dspy.OutputField(desc="Answer to the user's query")


class Main(dspy.Signature):
"""
    Next, you will complete a table-related question answering task. Based on the provided materials, such as the table content (in Markdown format), you need to analyze the User Query
    and decide whether the User Input Query should be broken down into subqueries. You are provided with a "solve_subquery" tool that can get answers for the subqueries.
    After you have collected sufficient information, you need to generate a comprehensive answer.

    Instructions:
    1. Carefully analyze each user query through step-by-step reasoning.
    2. If the query needs more information than the given table content provides:
       - Decompose the query into subqueries.
       - Process one subquery at a time.
       - Use the "solve_subquery" tool to get answers for each subquery.
    3. If a query can be answered by the table content, do not decompose it; directly put the original query into the "solve_subquery" tool.
       The "solve_subquery" tool utilizes SQL execution inside; it can solve a complex subquery on the table through one tool call.
    4. Generate exactly ONE subquery at a time.
    5. Write out all terms completely - avoid using abbreviations.
    6. When you have sufficient information, provide the final answer in the following format:
       <Answer>: [your complete response]
    Please start!
"""
table_content = dspy.InputField()
user_input_query = dspy.InputField()
    answer = dspy.OutputField()


class SubQuerySummarizer(dspy.Signature):
"""
    You are about to complete a table-based question answering task using the following two types of reference materials:

    Note:
    1. The markdown table content in Original Content may be incomplete.
    2. You should cross-validate the given two materials:
       - if the answers are the same, directly output the answer.
       - if the "SQL execution result" contains an error or is empty, you should try to answer based on the Original Content.
       - if the two materials conflict, you should think about each of them, and finally give an answer.
"""
original_content = dspy.InputField(
desc="Content 1: Original content (table content is provided in Markdown format)"
)
table_schema = dspy.InputField(desc="The user given table schema")
    generated_sql = dspy.InputField(
desc="SQL generated based on the schema and the user question"
)
sql_execute_result = dspy.InputField(desc="SQL execution results")
user_query = dspy.InputField(desc="The user's question")
    answer = dspy.OutputField(desc="Answer to the user's question")


class TableRAGAgent(PrecompiledAgent):
    config_class = TableRAGConfig

    def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
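        # Three cooperating modules: a ReAct planner that decomposes the user query,
        # a ReAct NL2SQL solver that answers each subquery via SQL, and a Predict
        # summarizer that reconciles SQL results with the raw table markdown.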
self.main = dspy.ReAct(Main, tools=[self.solve_subquery])
self.nl2sql = dspy.ReAct(NL2SQL, tools=[self.indexer.sql_query])
self.subquery_summarizer = dspy.Predict(SubQuerySummarizer)
        self.set_lm(dspy.LM("openai/gpt-4o-mini"))

def forward(self, user_query: str, table_id: Optional[str] = None, **kwargs) -> str:
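        # Retrieve the best-matching table chunk, load the full table from the
        # SQL store, then hand its markdown to the ReAct planner.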
if table_id is not None:
            self.user_query = user_query + f" The given table is in {table_id}."
else:
self.user_query = user_query
print("USER QUERY", self.user_query)
related_table_serialized = self.indexer.retrieve(
self.user_query,
k_recall=self.config.k_recall,
k_rerank=self.config.k_rerank,
type="table",
)[0] # TODO: handle multiple tables
print("RELATED TABLE", related_table_serialized)
related_table = self.indexer.get_table(
related_table_serialized.metadata["schema"]["table_name"]
)
self.table_md = related_table.markdown()
self.table_schema = json.dumps(related_table.metadata["schema"])
        return self.main(user_input_query=user_query, table_content=self.table_md)

def solve_subquery(self, sub_query: str) -> str:
"""
        Solves a natural language subquery using SQL execution.
"""
        sql_result = self.nl2sql(schema_list=self.table_schema, user_query=sub_query).answer
        generated_sql = self.indexer.last_query
        return self.subquery_summarizer(
            original_content=self.table_md,
            table_schema=self.table_schema,
            generated_sql=generated_sql,
            sql_execute_result=sql_result,
            user_query=self.user_query,
        ).answer

if __name__ == "__main__":
indexer = TableRAGIndexer(
vdb_config=MilvusVDBConfig.from_local("examples/TableRAG/index2.db"),
sql_config=SQLiteConfig(db_path="examples/TableRAG/tables.db"),
)
agent = TableRAGAgent(config=TableRAGConfig(), indexer=indexer)
    # x = indexer.sql_query("SELECT * FROM t_5th_new_zealand_parliament_0")
    # print(x)
# x = agent(user_query="Who is the New Zealand Parliament Member for Canterbury")
# print(x)
agent.push_to_hub("test/test")

agent/config.py Normal file

@@ -0,0 +1,8 @@
from modaic.precompiled_agent import PrecompiledConfig
from dataclasses import dataclass


@dataclass
class TableRAGConfig(PrecompiledConfig):
k_recall: int = 50
k_rerank: int = 5
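
Both knobs flow through to TableRAGIndexer.retrieve (k_recall candidates pulled from the vector DB, k_rerank kept after reranking). A small usage sketch with illustrative, non-default values:

from agent.config import TableRAGConfig

# Illustrative values: recall more candidates, keep a tighter reranked set.
config = TableRAGConfig(k_recall=100, k_rerank=3)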

agent/indexer.py Normal file

@@ -0,0 +1,176 @@
from modaic import Indexer
from modaic.databases import VectorDatabase, MilvusVDBConfig, SearchResult
from typing import List, Literal, Tuple
import dspy
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import json
from modaic.databases import SQLDatabase, SQLiteConfig
from modaic.context import (
ContextSchema,
Table,
LongText,
Text,
Source,
SourceType,
TextSchema,
)
from modaic.indexing import PineconeReranker, Embedder
from dotenv import load_dotenv
from tqdm.auto import tqdm # auto picks the right frontend
from modaic.context.query_language import Filter
from agent.config import TableRAGConfig

load_dotenv()


class TableRAGIndexer(Indexer):
config_class = TableRAGConfig
def __init__(
self,
*args,
vdb_config: MilvusVDBConfig,
sql_config: SQLiteConfig,
**kwargs,
):
super().__init__(*args, **kwargs)
self.embedder = Embedder(model="openai/text-embedding-3-small")
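        # Dual-store design: the SQL database keeps full tables for exact queries,
        # while the vector DB holds embedded markdown/text chunks for recall.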
self.vector_database = VectorDatabase(
config=vdb_config,
embedder=self.embedder,
payload_schema=Text.schema,
)
self.sql_db = SQLDatabase(config=sql_config)
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
)
self.reranker = PineconeReranker(
model="bge-reranker-v2-m3", api_key=os.getenv("PINECONE_API_KEY")
)
self.last_query = None
# self.vector_database.load_collection("table_rag")
self.vector_database.create_collection(
"table_rag", Text.schema, exists_behavior="append"
)
def ingest(self, files: List[str] | str, *args, **kwargs):
if isinstance(files, str):
files = [os.path.join(files, file) for file in os.listdir(files)]
records = []
with self.sql_db.connect_and_begin():
for file in tqdm(files, desc="Ingesting files", position=0):
if file.endswith((".csv", ".xlsx", ".xls")):
if file.endswith(".csv"):
table = Table.from_csv(file)
elif file.endswith((".xlsx", ".xls")):
table = Table.from_excel(file)
# Add table to file system context store
table.metadata["schema"] = table.schema_info()
# print("TABLE NAME", table.name)
# print("TABLE SCHEMA\n", table.schema_info())
# print("TABLE METADATA\n", table.metadata)
# print()
# print()
# print(table.metadata["schema"])
self.sql_db.add_table(table)
table.chunk_with(self.chunk_table)
records.extend(table.chunks)
                elif file.endswith(".json"):
with open(file, "r", encoding="utf-8") as f:
data_split = json.load(f)
key_value_doc = ""
for key, item in data_split.items():
key_value_doc += f"{key} {item}\n"
text_document = LongText(text=key_value_doc)
text_document.chunk_text(self.text_splitter.split_text)
text_document.apply_to_chunks(
lambda chunk: chunk.add_metadata({"type": "text"})
)
records.extend(text_document.chunks)
print("Adding records to vector database")
print("number of records", len(records))
self.vector_database.add_records("table_rag", records, batch_size=10000)

    def add(self, *args, **kwargs):
        pass

    def delete(self, *args, **kwargs):
        pass

def retrieve(
self,
user_query: str,
k_recall: int = 10,
k_rerank: int = 10,
type: Literal["table", "text", "all"] = "all",
) -> List[ContextSchema]:
results = self.recall(user_query, k_recall, type)
records = [
(result["context_schema"].text, result["context_schema"])
if result["context_schema"].context_class == "Text"
else (
result["context_schema"].metadata["md_chunk"],
result["context_schema"],
)
for result in results
]
results = self.reranker(user_query, records, k_rerank)
results = [result[1] for result in results]
return results
def recall(
self,
user_query: str,
k: int = 10,
type: Literal["table", "text", "all"] = "all",
) -> List[SearchResult]:
embedding = self.embedder([user_query])[0]
if type == "table":
filter = TextSchema.metadata["type"] == "table"
elif type == "text":
filter = TextSchema.metadata["type"] == "text"
else:
filter = None
return self.vector_database.search("table_rag", embedding, k, Filter(filter))
def chunk_table(self, table: Table) -> List[Text]:
# if (
# table.name == "t_5th_new_zealand_parliament_0"
# or table.name == "france_at_the_2013_world_aquatics_championships_0"
# ):
# print("CHUNKING TABLE", table.name)
# print("TABLE SCHEMA\n", table.schema_info())
# print("TABLE METADATA\n", table.metadata)
# print()
# print()
# raise Exception("Stop here")
table_md = LongText(text=table.markdown())
table_md.chunk_text(self.text_splitter.split_text)
table_md.apply_to_chunks(
lambda chunk: chunk.add_metadata(
{"type": "table", "schema": table.metadata["schema"]}
)
)
# raise Exception("Stop here")
return table_md.chunks
def sql_query(self, query: str) -> str:
"""
Query the sql database and get the result as a string.
Args:
query: The sql query to execute.
Returns:
The result of the sql query as a string.
"""
self.last_query = query
try:
return str(self.sql_db.fetchall(query))
except Exception as e:
return f"Error executing sql query: {e}"
def get_table(self, table_id: str) -> Table:
return self.sql_db.get_table(table_id)
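
Taken together, ingest populates both stores from a folder of .csv/.xlsx/.xls/.json files, and retrieve runs embed, recall, rerank. A minimal sketch against the API above; the data directory path is illustrative:

from agent.config import TableRAGConfig
from agent.indexer import TableRAGIndexer
from modaic.databases import MilvusVDBConfig, SQLiteConfig

indexer = TableRAGIndexer(
    config=TableRAGConfig(),
    vdb_config=MilvusVDBConfig.from_local("index2.db"),
    sql_config=SQLiteConfig(db_path="tables.db"),
)
indexer.ingest("data/tables")  # illustrative path to .csv/.xlsx/.json files

# Embed the query, recall 50 table chunks, rerank down to the top 5.
top_chunks = indexer.retrieve(
    "Who is the New Zealand Parliament Member for Canterbury?",
    k_recall=50,
    k_rerank=5,
    type="table",
)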

auto_classes.json Normal file

@@ -0,0 +1,5 @@
{
"AutoConfig": "agent.config.TableRAGConfig",
"AutoAgent": "agent.agent.TableRAGAgent",
"AutoIndexer": "agent.indexer.TableRAGIndexer"
}

config.json Normal file

@@ -0,0 +1,4 @@
{
"k_recall": 50,
"k_rerank": 5
}

pyproject.toml Normal file

@@ -0,0 +1,11 @@
[project]
name = "TableRAG"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = ["ipykernel>=6.30.1", "matplotlib>=3.10.5", "modaic", "numpy>=2.3.2"]

[tool.uv.sources]
modaic = { path = "../../modaic", editable = true }