(no commit message)

2025-08-17 05:38:31 -04:00
parent a9bbdd17a8
commit fb8d33b76f
10 changed files with 588 additions and 0 deletions


@@ -1,2 +1,3 @@
# TableRAG
Wassup fool

__main__.py Normal file

@@ -0,0 +1,12 @@
from agent.agent import TableRAGAgent, TableRAGIndexer, TableRAGConfig
from modaic.databases import MilvusVDBConfig, SQLiteConfig

config = TableRAGConfig()
indexer = TableRAGIndexer(
    config=config,
    vdb_config=MilvusVDBConfig.from_local("index2.db"),
    sql_config=SQLiteConfig(db_path="tables.db"),
)
agent = TableRAGAgent(config=config, indexer=indexer)
agent.push_to_hub("swagginty/TableRAG")
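
Once published, auto_classes.json (below) maps AutoConfig, AutoAgent, and AutoIndexer to these classes, so the agent can be pulled back down. A minimal round-trip sketch, assuming modaic exposes Hugging Face-style AutoAgent/AutoIndexer loaders with a from_precompiled method; the loader names are assumptions, not confirmed modaic API:

# Hypothetical sketch: AutoAgent/AutoIndexer.from_precompiled are assumed by
# analogy with the AutoConfig/AutoAgent/AutoIndexer entries in auto_classes.json.
from modaic import AutoAgent, AutoIndexer
from modaic.databases import MilvusVDBConfig, SQLiteConfig

indexer = AutoIndexer.from_precompiled(
    "swagginty/TableRAG",
    vdb_config=MilvusVDBConfig.from_local("index2.db"),
    sql_config=SQLiteConfig(db_path="tables.db"),
)
agent = AutoAgent.from_precompiled("swagginty/TableRAG", indexer=indexer)
print(agent(user_query="Who is the New Zealand Parliament Member for Canterbury?"))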

agent.json Normal file

@@ -0,0 +1,231 @@
{
"main.react": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Next, you will complete a table-related question answering task. Based on the provided materials such as the table content (in Markdown format), you need to analyze the User Query.\nAnd try to decide whether the User Input Query should be broken down into subqueries. You are provided with \"solve_subquery\" tool that can get answer for the subqueries.\nAfter you have collected sufficient information, you need to generate comprehensive answers.\n\nInstructions:\n1. Carefully analyze each user query through step-by-step reasoning.\n2. If the query needs information more than the given table content\n - Decompose the query into subqueries.\n - Process one subquery at a time.\n - Use \"solve_subquery\" tool to get answers for each subquey.\n3. If a query can be answered by table content, do not decompose it. And directly put the orignal query into the \"solve_subquery\" tool.\n The \"solve_subquery\" tool utilizes SQL execution inside, it can solve complex subquery on table through one tool call.\n4. Generate exactly ONE subquery at a time.\n5. Write out all terms completely - avoid using abbreviations.\n6. When you have sufficient information, provide the final answer in the following format:\n <Answer>: [your complete response]\nPlease start!\n\nYou are an Agent. In each episode, you will be given the fields `table_content`, `user_input_query` as input. And you can see your past trajectory so far.\nYour goal is to use one or more of the supplied tools to collect any necessary information for producing `answer`.\n\nTo do this, you will interleave next_thought, next_tool_name, and next_tool_args in each turn, and also when finishing the task.\nAfter each tool call, you receive a resulting observation, which gets appended to your trajectory.\n\nWhen writing next_thought, you may reason about the current situation and plan for future steps.\nWhen selecting the next_tool_name and its next_tool_args, the tool must be one of:\n\n(1) solve_subquery, whose description is <desc> Solves a natural language subqeury using the SQL exectution. <\/desc>. It takes arguments {'sub_query': {'type': 'string'}}.\n(2) finish, whose description is <desc>Marks the task as complete. That is, signals that all information for producing the outputs, i.e. `answer`, are now available to be extracted.<\/desc>. It takes arguments {}.\nWhen providing `next_tool_args`, the value inside the field must be in JSON format",
"fields": [
{
"prefix": "Table Content:",
"description": "${table_content}"
},
{
"prefix": "User Input Query:",
"description": "${user_input_query}"
},
{
"prefix": "Trajectory:",
"description": "${trajectory}"
},
{
"prefix": "Next Thought:",
"description": "${next_thought}"
},
{
"prefix": "Next Tool Name:",
"description": "${next_tool_name}"
},
{
"prefix": "Next Tool Args:",
"description": "${next_tool_args}"
}
]
},
"lm": {
"model": "openai\/gpt-4o-mini",
"model_type": "chat",
"cache": true,
"cache_in_memory": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.0,
"max_tokens": 4000
}
},
"main.extract.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Next, you will complete a table-related question answering task. Based on the provided materials such as the table content (in Markdown format), you need to analyze the User Query.\nAnd try to decide whether the User Input Query should be broken down into subqueries. You are provided with \"solve_subquery\" tool that can get answer for the subqueries.\nAfter you have collected sufficient information, you need to generate comprehensive answers.\n\nInstructions:\n1. Carefully analyze each user query through step-by-step reasoning.\n2. If the query needs information more than the given table content\n - Decompose the query into subqueries.\n - Process one subquery at a time.\n - Use \"solve_subquery\" tool to get answers for each subquey.\n3. If a query can be answered by table content, do not decompose it. And directly put the orignal query into the \"solve_subquery\" tool.\n The \"solve_subquery\" tool utilizes SQL execution inside, it can solve complex subquery on table through one tool call.\n4. Generate exactly ONE subquery at a time.\n5. Write out all terms completely - avoid using abbreviations.\n6. When you have sufficient information, provide the final answer in the following format:\n <Answer>: [your complete response]\nPlease start!",
"fields": [
{
"prefix": "Table Content:",
"description": "${table_content}"
},
{
"prefix": "User Input Query:",
"description": "${user_input_query}"
},
{
"prefix": "Trajectory:",
"description": "${trajectory}"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Answer:",
"description": "${answer}"
}
]
},
"lm": {
"model": "openai\/gpt-4o-mini",
"model_type": "chat",
"cache": true,
"cache_in_memory": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.0,
"max_tokens": 4000
}
},
"nl2sql.react": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "You are an expert in SQL and can generate SQL statements based on table schemas and query requirements.\nRespond as concisely as possible, providing only the SQL statement without any additional explanations.\n\nYou are an Agent. In each episode, you will be given the fields `schema_list`, `user_query` as input. And you can see your past trajectory so far.\nYour goal is to use one or more of the supplied tools to collect any necessary information for producing `answer`.\n\nTo do this, you will interleave next_thought, next_tool_name, and next_tool_args in each turn, and also when finishing the task.\nAfter each tool call, you receive a resulting observation, which gets appended to your trajectory.\n\nWhen writing next_thought, you may reason about the current situation and plan for future steps.\nWhen selecting the next_tool_name and its next_tool_args, the tool must be one of:\n\n(1) sql_query, whose description is <desc> Query the sql database and get the result as a string. Args: query: The sql query to execute. Returns: The result of the sql query as a string. <\/desc>. It takes arguments {'query': {'type': 'string'}}.\n(2) finish, whose description is <desc>Marks the task as complete. That is, signals that all information for producing the outputs, i.e. `answer`, are now available to be extracted.<\/desc>. It takes arguments {}.\nWhen providing `next_tool_args`, the value inside the field must be in JSON format",
"fields": [
{
"prefix": "Schema List:",
"description": "Based on the schemas please use MySQL syntax to the user's query"
},
{
"prefix": "User Query:",
"description": "The user's query"
},
{
"prefix": "Trajectory:",
"description": "${trajectory}"
},
{
"prefix": "Next Thought:",
"description": "${next_thought}"
},
{
"prefix": "Next Tool Name:",
"description": "${next_tool_name}"
},
{
"prefix": "Next Tool Args:",
"description": "${next_tool_args}"
}
]
},
"lm": {
"model": "openai\/gpt-4o-mini",
"model_type": "chat",
"cache": true,
"cache_in_memory": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.0,
"max_tokens": 4000
}
},
"nl2sql.extract.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "You are an expert in SQL and can generate SQL statements based on table schemas and query requirements.\nRespond as concisely as possible, providing only the SQL statement without any additional explanations.",
"fields": [
{
"prefix": "Schema List:",
"description": "Based on the schemas please use MySQL syntax to the user's query"
},
{
"prefix": "User Query:",
"description": "The user's query"
},
{
"prefix": "Trajectory:",
"description": "${trajectory}"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Answer:",
"description": "Answer to the user's query"
}
]
},
"lm": {
"model": "openai\/gpt-4o-mini",
"model_type": "chat",
"cache": true,
"cache_in_memory": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.0,
"max_tokens": 4000
}
},
"subquery_summarizer": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "You are about to complete a table-based question answernig task using the following two types of reference materials:\n\nNote:\n1. The markdown table content in Original Content may be incomplete.\n2. You should cross-validate the given two materials:\n - if the answers are the same, directly output the answer.\n - if the \"SQL execution result\" contains error or is empty, you should try to answer based on the Original Content.\n - if the two materials shows conflit, you should think about each of them, and finally give an answer.",
"fields": [
{
"prefix": "Original Content:",
"description": "Content 1: Original content (table content is provided in Markdown format)"
},
{
"prefix": "Table Schema:",
"description": "The user given table schema"
},
{
"prefix": "Gnerated Sql:",
"description": "SQL generated based on the schema and the user question"
},
{
"prefix": "Sql Execute Result:",
"description": "SQL execution results"
},
{
"prefix": "User Query:",
"description": "The user's question"
},
{
"prefix": "Answer:",
"description": "Answer to the user's question"
}
]
},
"lm": {
"model": "openai\/gpt-4o-mini",
"model_type": "chat",
"cache": true,
"cache_in_memory": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.0,
"max_tokens": 4000
}
},
"metadata": {
"dependency_versions": {
"python": "3.11",
"dspy": "3.0.1",
"cloudpickle": "3.1"
}
}
}
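
agent.json is the serialized DSPy program state: one entry per submodule (main.react, main.extract.predict, nl2sql.react, nl2sql.extract.predict, subquery_summarizer), each carrying empty traces/train/demos, the signature text, and gpt-4o-mini LM settings, plus dependency metadata. A minimal sketch of producing and restoring such a state file, assuming the PrecompiledAgent inherits DSPy 3.x-style save/load state methods (the exact hook behind push_to_hub is an assumption):

# Hypothetical sketch: assumes DSPy-style save()/load() state (de)serialization;
# agent and indexer construction mirrors __main__.py above.
agent = TableRAGAgent(config=TableRAGConfig(), indexer=indexer)
agent.save("agent.json")  # writes signatures, demos, and LM settings as JSON

fresh = TableRAGAgent(config=TableRAGConfig(), indexer=indexer)
fresh.load("agent.json")  # rehydrates the saved prompt and LM state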

agent/__init__.py Normal file

agent/agent.py Normal file

@@ -0,0 +1,140 @@
from modaic.precompiled_agent import PrecompiledConfig, PrecompiledAgent
from modaic.context import Table
from typing import Type, Optional
import dspy
from agent.indexer import TableRAGIndexer
import json
from modaic.databases import (
VectorDatabase,
MilvusVDBConfig,
SearchResult,
SQLDatabase,
SQLiteConfig,
)
import os
# import utils.google_api as google_api
# import utils.outlook_api as outlook_api
# import utils.zoom_api as zoom_api
from agent.config import TableRAGConfig


# Signatures
class NL2SQL(dspy.Signature):
"""You are an expert in SQL and can generate SQL statements based on table schemas and query requirements.
Respond as concisely as possible, providing only the SQL statement without any additional explanations."""
schema_list = dspy.InputField(
desc="Based on the schemas please use MySQL syntax to the user's query"
)
user_query = dspy.InputField(desc="The user's query")
    answer = dspy.OutputField(desc="Answer to the user's query")


class Main(dspy.Signature):
"""
    Next, you will complete a table-related question answering task. Based on the provided materials, such as the table content (in Markdown format), you need to analyze the User Query
    and decide whether the User Input Query should be broken down into subqueries. You are provided with a "solve_subquery" tool that can get answers for the subqueries.
    After you have collected sufficient information, you need to generate a comprehensive answer.

    Instructions:
    1. Carefully analyze each user query through step-by-step reasoning.
    2. If the query needs more information than the given table content provides:
       - Decompose the query into subqueries.
       - Process one subquery at a time.
       - Use the "solve_subquery" tool to get answers for each subquery.
    3. If a query can be answered by the table content, do not decompose it; directly put the original query into the "solve_subquery" tool.
       The "solve_subquery" tool utilizes SQL execution inside; it can solve a complex subquery on the table through one tool call.
    4. Generate exactly ONE subquery at a time.
    5. Write out all terms completely - avoid using abbreviations.
    6. When you have sufficient information, provide the final answer in the following format:
       <Answer>: [your complete response]
    Please start!
"""
table_content = dspy.InputField()
user_input_query = dspy.InputField()
    answer = dspy.OutputField()


class SubQuerySummarizer(dspy.Signature):
"""
    You are about to complete a table-based question answering task using the following two types of reference materials:

    Note:
    1. The markdown table content in Original Content may be incomplete.
    2. You should cross-validate the given two materials:
       - if the answers are the same, directly output the answer.
       - if the "SQL execution result" contains an error or is empty, you should try to answer based on the Original Content.
       - if the two materials conflict, you should think about each of them, and finally give an answer.
"""
original_content = dspy.InputField(
desc="Content 1: Original content (table content is provided in Markdown format)"
)
table_schema = dspy.InputField(desc="The user given table schema")
    generated_sql = dspy.InputField(
desc="SQL generated based on the schema and the user question"
)
sql_execute_result = dspy.InputField(desc="SQL execution results")
user_query = dspy.InputField(desc="The user's question")
    answer = dspy.OutputField(desc="Answer to the user's question")


class TableRAGAgent(PrecompiledAgent):
    config_class = TableRAGConfig

    def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
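        # Three cooperating modules: a ReAct planner that decomposes the user query,
        # a ReAct NL2SQL solver that answers each subquery via SQL, and a Predict
        # summarizer that reconciles SQL results with the raw table markdown.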
self.main = dspy.ReAct(Main, tools=[self.solve_subquery])
self.nl2sql = dspy.ReAct(NL2SQL, tools=[self.indexer.sql_query])
self.subquery_summarizer = dspy.Predict(SubQuerySummarizer)
        self.set_lm(dspy.LM("openai/gpt-4o-mini"))

def forward(self, user_query: str, table_id: Optional[str] = None, **kwargs) -> str:
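        # Retrieve the best-matching table chunk, load the full table from the
        # SQL store, then hand its markdown to the ReAct planner.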
if table_id is not None:
            self.user_query = user_query + f" The given table is in {table_id}."
else:
self.user_query = user_query
print("USER QUERY", self.user_query)
related_table_serialized = self.indexer.retrieve(
self.user_query,
k_recall=self.config.k_recall,
k_rerank=self.config.k_rerank,
type="table",
)[0] # TODO: handle multiple tables
print("RELATED TABLE", related_table_serialized)
related_table = self.indexer.get_table(
related_table_serialized.metadata["schema"]["table_name"]
)
self.table_md = related_table.markdown()
self.table_schema = json.dumps(related_table.metadata["schema"])
        return self.main(user_input_query=user_query, table_content=self.table_md)

def solve_subquery(self, sub_query: str) -> str:
"""
        Solves a natural language subquery using SQL execution.
"""
        sql_result = self.nl2sql(schema_list=self.table_schema, user_query=sub_query).answer
        generated_sql = self.indexer.last_query
        return self.subquery_summarizer(
            original_content=self.table_md,
            table_schema=self.table_schema,
            generated_sql=generated_sql,
            sql_execute_result=sql_result,
            user_query=self.user_query,
        ).answer

if __name__ == "__main__":
indexer = TableRAGIndexer(
vdb_config=MilvusVDBConfig.from_local("examples/TableRAG/index2.db"),
sql_config=SQLiteConfig(db_path="examples/TableRAG/tables.db"),
)
agent = TableRAGAgent(config=TableRAGConfig(), indexer=indexer)
    # x = indexer.sql_query("SELECT * FROM t_5th_new_zealand_parliament_0")
    # print(x)
# x = agent(user_query="Who is the New Zealand Parliament Member for Canterbury")
# print(x)
agent.push_to_hub("test/test")

agent/config.py Normal file

@@ -0,0 +1,8 @@
from modaic.precompiled_agent import PrecompiledConfig
from dataclasses import dataclass


@dataclass
class TableRAGConfig(PrecompiledConfig):
k_recall: int = 50
k_rerank: int = 5
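
Both knobs flow through to TableRAGIndexer.retrieve (k_recall candidates pulled from the vector DB, k_rerank kept after reranking). A small usage sketch with illustrative, non-default values:

from agent.config import TableRAGConfig

# Illustrative values: recall more candidates, keep a tighter reranked set.
config = TableRAGConfig(k_recall=100, k_rerank=3)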

agent/indexer.py Normal file

@@ -0,0 +1,176 @@
from modaic import Indexer
from modaic.databases import VectorDatabase, MilvusVDBConfig, SearchResult
from typing import List, Literal, Tuple
import dspy
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import json
from modaic.databases import SQLDatabase, SQLiteConfig
from modaic.context import (
ContextSchema,
Table,
LongText,
Text,
Source,
SourceType,
TextSchema,
)
from modaic.indexing import PineconeReranker, Embedder
from dotenv import load_dotenv
from tqdm.auto import tqdm # auto picks the right frontend
from modaic.context.query_language import Filter
from agent.config import TableRAGConfig

load_dotenv()


class TableRAGIndexer(Indexer):
config_class = TableRAGConfig
def __init__(
self,
*args,
vdb_config: MilvusVDBConfig,
sql_config: SQLiteConfig,
**kwargs,
):
super().__init__(*args, **kwargs)
self.embedder = Embedder(model="openai/text-embedding-3-small")
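        # Dual-store design: the SQL database keeps full tables for exact queries,
        # while the vector DB holds embedded markdown/text chunks for recall.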
self.vector_database = VectorDatabase(
config=vdb_config,
embedder=self.embedder,
payload_schema=Text.schema,
)
self.sql_db = SQLDatabase(config=sql_config)
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
)
self.reranker = PineconeReranker(
model="bge-reranker-v2-m3", api_key=os.getenv("PINECONE_API_KEY")
)
self.last_query = None
# self.vector_database.load_collection("table_rag")
self.vector_database.create_collection(
"table_rag", Text.schema, exists_behavior="append"
)
def ingest(self, files: List[str] | str, *args, **kwargs):
if isinstance(files, str):
files = [os.path.join(files, file) for file in os.listdir(files)]
records = []
with self.sql_db.connect_and_begin():
for file in tqdm(files, desc="Ingesting files", position=0):
if file.endswith((".csv", ".xlsx", ".xls")):
if file.endswith(".csv"):
table = Table.from_csv(file)
elif file.endswith((".xlsx", ".xls")):
table = Table.from_excel(file)
# Add table to file system context store
table.metadata["schema"] = table.schema_info()
# print("TABLE NAME", table.name)
# print("TABLE SCHEMA\n", table.schema_info())
# print("TABLE METADATA\n", table.metadata)
# print()
# print()
# print(table.metadata["schema"])
self.sql_db.add_table(table)
table.chunk_with(self.chunk_table)
records.extend(table.chunks)
                elif file.endswith(".json"):
with open(file, "r", encoding="utf-8") as f:
data_split = json.load(f)
key_value_doc = ""
for key, item in data_split.items():
key_value_doc += f"{key} {item}\n"
text_document = LongText(text=key_value_doc)
text_document.chunk_text(self.text_splitter.split_text)
text_document.apply_to_chunks(
lambda chunk: chunk.add_metadata({"type": "text"})
)
records.extend(text_document.chunks)
print("Adding records to vector database")
print("number of records", len(records))
self.vector_database.add_records("table_rag", records, batch_size=10000)

    def add(self, *args, **kwargs):
        pass

    def delete(self, *args, **kwargs):
        pass

def retrieve(
self,
user_query: str,
k_recall: int = 10,
k_rerank: int = 10,
type: Literal["table", "text", "all"] = "all",
) -> List[ContextSchema]:
results = self.recall(user_query, k_recall, type)
records = [
(result["context_schema"].text, result["context_schema"])
if result["context_schema"].context_class == "Text"
else (
result["context_schema"].metadata["md_chunk"],
result["context_schema"],
)
for result in results
]
results = self.reranker(user_query, records, k_rerank)
results = [result[1] for result in results]
return results
def recall(
self,
user_query: str,
k: int = 10,
type: Literal["table", "text", "all"] = "all",
) -> List[SearchResult]:
embedding = self.embedder([user_query])[0]
if type == "table":
filter = TextSchema.metadata["type"] == "table"
elif type == "text":
filter = TextSchema.metadata["type"] == "text"
else:
filter = None
return self.vector_database.search("table_rag", embedding, k, Filter(filter))
def chunk_table(self, table: Table) -> List[Text]:
# if (
# table.name == "t_5th_new_zealand_parliament_0"
# or table.name == "france_at_the_2013_world_aquatics_championships_0"
# ):
# print("CHUNKING TABLE", table.name)
# print("TABLE SCHEMA\n", table.schema_info())
# print("TABLE METADATA\n", table.metadata)
# print()
# print()
# raise Exception("Stop here")
table_md = LongText(text=table.markdown())
table_md.chunk_text(self.text_splitter.split_text)
table_md.apply_to_chunks(
lambda chunk: chunk.add_metadata(
{"type": "table", "schema": table.metadata["schema"]}
)
)
# raise Exception("Stop here")
return table_md.chunks
def sql_query(self, query: str) -> str:
"""
Query the sql database and get the result as a string.
Args:
query: The sql query to execute.
Returns:
The result of the sql query as a string.
"""
self.last_query = query
try:
return str(self.sql_db.fetchall(query))
except Exception as e:
return f"Error executing sql query: {e}"
def get_table(self, table_id: str) -> Table:
return self.sql_db.get_table(table_id)
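
Taken together, ingest populates both stores from a folder of .csv/.xlsx/.xls/.json files, and retrieve runs embed, recall, rerank. A minimal sketch against the API above; the data directory path is illustrative:

from agent.config import TableRAGConfig
from agent.indexer import TableRAGIndexer
from modaic.databases import MilvusVDBConfig, SQLiteConfig

indexer = TableRAGIndexer(
    config=TableRAGConfig(),
    vdb_config=MilvusVDBConfig.from_local("index2.db"),
    sql_config=SQLiteConfig(db_path="tables.db"),
)
indexer.ingest("data/tables")  # illustrative path to .csv/.xlsx/.json files

# Embed the query, recall 50 table chunks, rerank down to the top 5.
top_chunks = indexer.retrieve(
    "Who is the New Zealand Parliament Member for Canterbury?",
    k_recall=50,
    k_rerank=5,
    type="table",
)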

auto_classes.json Normal file

@@ -0,0 +1,5 @@
{
"AutoConfig": "agent.config.TableRAGConfig",
"AutoAgent": "agent.agent.TableRAGAgent",
"AutoIndexer": "agent.indexer.TableRAGIndexer"
}

config.json Normal file

@@ -0,0 +1,4 @@
{
"k_recall": 50,
"k_rerank": 5
}

pyproject.toml Normal file

@@ -0,0 +1,11 @@
[project]
name = "TableRAG"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = ["ipykernel>=6.30.1", "matplotlib>=3.10.5", "modaic", "numpy>=2.3.2"]

[tool.uv.sources]
modaic = { path = "../../modaic", editable = true }