(no commit message)

This commit is contained in:
2025-11-24 22:04:16 -05:00
parent e5757a367b
commit 2c346586a9
12 changed files with 601 additions and 18 deletions

18
LICENSE
View File

@@ -1,18 +0,0 @@
MIT License
Copyright (c) 2025 johwang
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the
following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
USE OR OTHER DEALINGS IN THE SOFTWARE.

BIN
README.md

Binary file not shown.

193
agent.json Normal file
View File

@@ -0,0 +1,193 @@
{
"analyze_repo.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Analyze a repository structure and identify key components.",
"fields": [
{
"prefix": "Repo Url:",
"description": "GitHub repository URL"
},
{
"prefix": "File Tree:",
"description": "Repository file structure"
},
{
"prefix": "Readme Content:",
"description": "README.md content"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Project Purpose:",
"description": "Main purpose and goals of the project"
},
{
"prefix": "Key Concepts:",
"description": "List of important concepts and terminology"
},
{
"prefix": "Architecture Overview:",
"description": "High-level architecture description"
}
]
},
"lm": {
"model": "gpt-4o",
"model_type": "chat",
"cache": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.7,
"max_tokens": 8192
}
},
"analyze_structure.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Analyze code structure to identify important directories and files.",
"fields": [
{
"prefix": "File Tree:",
"description": "Repository file structure"
},
{
"prefix": "Package Files:",
"description": "Key package and configuration files"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Important Directories:",
"description": "Key directories and their purposes"
},
{
"prefix": "Entry Points:",
"description": "Main entry points and important files"
},
{
"prefix": "Development Info:",
"description": "Development setup and workflow information"
}
]
},
"lm": {
"model": "gpt-4o",
"model_type": "chat",
"cache": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.7,
"max_tokens": 8192
}
},
"generate_examples.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Given the fields `repo_info`, produce the fields `usage_examples`.",
"fields": [
{
"prefix": "Repo Info:",
"description": "${repo_info}"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Usage Examples:",
"description": "${usage_examples}"
}
]
},
"lm": {
"model": "gpt-4o",
"model_type": "chat",
"cache": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.7,
"max_tokens": 8192
}
},
"generate_llms_txt.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Generate a comprehensive llms.txt file from analyzed repository information.",
"fields": [
{
"prefix": "Project Purpose:",
"description": "${project_purpose}"
},
{
"prefix": "Key Concepts:",
"description": "${key_concepts}"
},
{
"prefix": "Architecture Overview:",
"description": "${architecture_overview}"
},
{
"prefix": "Important Directories:",
"description": "${important_directories}"
},
{
"prefix": "Entry Points:",
"description": "${entry_points}"
},
{
"prefix": "Development Info:",
"description": "${development_info}"
},
{
"prefix": "Usage Examples:",
"description": "Common usage patterns and examples"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Llms Txt Content:",
"description": "Complete llms.txt file content following the standard format"
}
]
},
"lm": {
"model": "gpt-4o",
"model_type": "chat",
"cache": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.7,
"max_tokens": 8192
}
},
"metadata": {
"dependency_versions": {
"python": "3.13",
"dspy": "3.0.4",
"cloudpickle": "3.1"
}
}
}

4
auto_classes.json Normal file
View File

@@ -0,0 +1,4 @@
{
"AutoConfig": "src.analyzer.RepositoryAnalyzerConfig",
"AutoAgent": "src.analyzer.RepositoryAnalyzer"
}

15
compile.py Normal file
View File

@@ -0,0 +1,15 @@
import sys
from src.analyzer import RepositoryAnalyzer, RepositoryAnalyzerConfig
# Module-level agent instance, constructed at import time so it is importable
# as `compile.llms_txt_generator` and reusable by main() below.
llms_txt_generator = RepositoryAnalyzer(
    config=RepositoryAnalyzerConfig()
)
def main() -> None:
    """Push the precompiled agent (with its code) to the hub.

    On failure, prints the error to stderr and exits with status 1 so that
    CI or shell callers can detect the failed publish (the original version
    swallowed the exception and exited 0).
    """
    try:
        llms_txt_generator.push_to_hub("johwang/llms-txt-generator", with_code=True)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

7
config.json Normal file
View File

@@ -0,0 +1,7 @@
{
"repo_analyzer_model": "gpt-4o",
"code_analyzer_model": "gpt-4o",
"llms_txt_generator_model": "gpt-4o",
"max_tokens": 8192,
"temperature": 0.7
}

17
pyproject.toml Normal file
View File

@@ -0,0 +1,17 @@
[project]
name = "llms-txt-generator"
version = "0.1.0"
description = "Generate llms.txt documentation for GitHub repositories using DSPy"
readme = "README.md"
requires-python = ">=3.10"
dependencies = ["dspy>=3.0.4", "requests>=2.31.0", "python-dotenv>=1.0.0"]
[project.scripts]
llmstxt-gen = "main:main"
[project.optional-dependencies]
dev = [
"pytest>=7.4.0",
"black>=23.0.0",
"ruff>=0.1.0",
]

14
src/__init__.py Normal file
View File

@@ -0,0 +1,14 @@
"""Public API for the llms.txt generator package.

Re-exports the DSPy signatures, the analyzer agent and its config, the
GitHub data-gathering helper, and the convenience entry point.
"""

from .signatures import AnalyzeRepository, AnalyzeCodeStructure, GenerateLLMsTxt
from .analyzer import RepositoryAnalyzer, RepositoryAnalyzerConfig
from .github_utils import gather_repository_info
from .generator import generate_llms_txt

# Explicit public surface of the package.
__all__ = [
    "AnalyzeRepository",
    "AnalyzeCodeStructure",
    "GenerateLLMsTxt",
    "RepositoryAnalyzer",
    "RepositoryAnalyzerConfig",
    "gather_repository_info",
    "generate_llms_txt",
]

117
src/analyzer.py Normal file
View File

@@ -0,0 +1,117 @@
import os
from typing import Optional
import dspy
from modaic import PrecompiledAgent, PrecompiledConfig
from .github_utils import gather_repository_info
from .signatures import AnalyzeRepository, AnalyzeCodeStructure, GenerateLLMsTxt
class RepositoryAnalyzerConfig(PrecompiledConfig):
    """Configuration for RepositoryAnalyzer."""

    # Model identifier for each pipeline stage; any model string accepted by
    # dspy.LM works here.
    repo_analyzer_model: str = "gpt-4o"
    code_analyzer_model: str = "gpt-4o"
    llms_txt_generator_model: str = "gpt-4o"
    # Generation settings shared by all three stage LMs.
    max_tokens: int = 8192
    temperature: float = 0.7
class RepositoryAnalyzer(PrecompiledAgent):
    """Analyzes a repository and generates llms.txt documentation.

    Runs a four-stage DSPy pipeline — repository analysis, code-structure
    analysis, usage-example generation, and llms.txt composition — where
    each stage gets its own LM so the models can be configured independently
    via RepositoryAnalyzerConfig.
    """

    config: RepositoryAnalyzerConfig

    def __init__(self, config: RepositoryAnalyzerConfig, **kwargs):
        super().__init__(config, **kwargs)
        # One LM per stage; all share max_tokens/temperature from the config.
        self.repo_analyzer_lm = dspy.LM(
            config.repo_analyzer_model,
            max_tokens=config.max_tokens,
            temperature=config.temperature,
        )
        self.code_analyzer_lm = dspy.LM(
            config.code_analyzer_model,
            max_tokens=config.max_tokens,
            temperature=config.temperature,
        )
        self.llms_txt_generator_lm = dspy.LM(
            config.llms_txt_generator_model,
            max_tokens=config.max_tokens,
            temperature=config.temperature,
        )
        self.analyze_repo = dspy.ChainOfThought(AnalyzeRepository)
        self.analyze_repo.set_lm(self.repo_analyzer_lm)
        self.analyze_structure = dspy.ChainOfThought(AnalyzeCodeStructure)
        self.analyze_structure.set_lm(self.code_analyzer_lm)
        # Inline string signature: a dedicated Signature class is overkill
        # for the single-input/single-output examples stage.
        self.generate_examples = dspy.ChainOfThought("repo_info -> usage_examples")
        self.generate_examples.set_lm(self.code_analyzer_lm)
        self.generate_llms_txt = dspy.ChainOfThought(GenerateLLMsTxt)
        self.generate_llms_txt.set_lm(self.llms_txt_generator_lm)

    def forward(
        self,
        repo_url: str,
        output_file: str = "llms.txt",
        github_token: Optional[str] = None,
    ):
        """
        Analyze repository and generate llms.txt content.

        This method handles the complete pipeline:
        1. Fetches repository information from GitHub
        2. Analyzes repository structure and purpose
        3. Generates llms.txt documentation
        4. Writes the result to a file

        Args:
            repo_url: GitHub repository URL (e.g., https://github.com/stanfordnlp/dspy)
            output_file: Path to output file (default: llms.txt)
            github_token: GitHub access token; falls back to the
                GITHUB_ACCESS_TOKEN environment variable when omitted.

        Returns:
            dspy.Prediction with llms_txt_content, analysis, and structure
        """
        print(f"Fetching repository information from {repo_url}...")
        # Accepting github_token here fixes callers (src/generator.py) that
        # pass one; an explicit argument takes precedence over the env var.
        token = github_token or os.environ.get("GITHUB_ACCESS_TOKEN")
        file_tree, readme_content, package_files = gather_repository_info(
            repo_url, token=token
        )
        print("Analyzing repository structure and purpose...")
        repo_analysis = self.analyze_repo(
            repo_url=repo_url, file_tree=file_tree, readme_content=readme_content
        )
        print("Analyzing code structure...")
        structure_analysis = self.analyze_structure(
            file_tree=file_tree, package_files=package_files
        )
        print("Generating usage examples...")
        usage_examples = self.generate_examples(
            repo_info=f"Purpose: {repo_analysis.project_purpose}\nConcepts: {repo_analysis.key_concepts}"
        )
        print("Generating llms.txt content...")
        llms_txt = self.generate_llms_txt(
            project_purpose=repo_analysis.project_purpose,
            key_concepts=repo_analysis.key_concepts,
            architecture_overview=repo_analysis.architecture_overview,
            important_directories=structure_analysis.important_directories,
            entry_points=structure_analysis.entry_points,
            development_info=structure_analysis.development_info,
            usage_examples=usage_examples.usage_examples,
        )
        # Explicit encoding keeps output identical across platforms (the
        # default encoding is locale-dependent, e.g. cp1252 on Windows).
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(llms_txt.llms_txt_content)
        print(f"\nGenerated llms.txt saved to: {output_file}")
        print("\nPreview (first 500 characters):")
        print(llms_txt.llms_txt_content[:500] + "...")
        return dspy.Prediction(
            llms_txt_content=llms_txt.llms_txt_content,
            analysis=repo_analysis,
            structure=structure_analysis,
        )

56
src/generator.py Normal file
View File

@@ -0,0 +1,56 @@
import os
from typing import Optional

from dotenv import load_dotenv

from .analyzer import RepositoryAnalyzer, RepositoryAnalyzerConfig

load_dotenv()
def generate_llms_txt(
    repo_url: str,
    output_file: str = "llms.txt",
    model: str = "gpt-4o",
    github_token: Optional[str] = None,
) -> str:
    """
    Generate llms.txt file for a given repository.

    This is a convenience wrapper around RepositoryAnalyzer that handles
    configuration and invokes the analyzer's forward method.

    Args:
        repo_url: GitHub repository URL (e.g., https://github.com/stanfordnlp/dspy)
        output_file: Path to output file (default: llms.txt)
        model: Language model to use for analysis (default: gpt-4o)
        github_token: GitHub access token (optional, can also use GITHUB_ACCESS_TOKEN env var)

    Returns:
        The generated llms.txt content

    Example:
        >>> result = generate_llms_txt("https://github.com/stanfordnlp/dspy")
        >>> print(result[:100])
    """
    # Export the token so the analyzer picks it up from the environment.
    # The original code passed github_token as a kwarg to analyzer(), but
    # RepositoryAnalyzer.forward does not accept it, raising a TypeError.
    if github_token:
        os.environ["GITHUB_ACCESS_TOKEN"] = github_token

    # Use the same model for all three pipeline stages.
    config = RepositoryAnalyzerConfig(
        repo_analyzer_model=model,
        code_analyzer_model=model,
        llms_txt_generator_model=model,
    )
    analyzer = RepositoryAnalyzer(config=config)

    # Run the complete pipeline.
    result = analyzer(
        repo_url=repo_url,
        output_file=output_file,
    )
    return result.llms_txt_content
# Ad-hoc smoke test: generate llms.txt for the DSPy repository when this
# module is executed directly.
if __name__ == "__main__":
    generate_llms_txt("https://github.com/stanfordnlp/dspy")

126
src/github_utils.py Normal file
View File

@@ -0,0 +1,126 @@
import base64
import os
from typing import Optional
from dotenv import load_dotenv
import requests
load_dotenv()
def get_github_file_tree(repo_url: str, token: Optional[str] = None) -> str:
    """
    Get repository file structure from GitHub API.

    Args:
        repo_url: GitHub repository URL (e.g., https://github.com/owner/repo)
        token: Optional GitHub access token for authentication

    Returns:
        Newline-separated list of file paths in the repository

    Raises:
        Exception: If the API request fails
    """
    # extract owner/repo from URL
    parts = repo_url.rstrip("/").split("/")
    owner, repo = parts[-2], parts[-1]
    # Build the auth header once; an explicit token takes precedence over
    # the GITHUB_ACCESS_TOKEN environment variable.
    headers = {}
    auth = token or os.environ.get("GITHUB_ACCESS_TOKEN")
    if auth:
        headers["Authorization"] = f"Bearer {auth}"
    # try both main and master branches
    for branch in ["main", "master"]:
        api_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
        # timeout prevents hanging indefinitely on a stalled connection
        response = requests.get(api_url, headers=headers, timeout=30)
        if response.status_code == 200:
            tree_data = response.json()
            # type "blob" = regular file (directories are "tree" entries)
            file_paths = [
                item["path"] for item in tree_data["tree"] if item["type"] == "blob"
            ]
            return "\n".join(sorted(file_paths))
        elif response.status_code != 404:
            raise Exception(
                f"Failed to fetch repository tree: {response.status_code} - {response.text}"
            )
    raise Exception("Could not fetch repository tree. Tried branches: main, master")
def get_github_file_content(
    repo_url: str, file_path: str, token: Optional[str] = None
) -> str:
    """
    Get specific file content from GitHub.

    Args:
        repo_url: GitHub repository URL
        file_path: Path to the file within the repository
        token: Optional GitHub access token for authentication

    Returns:
        Content of the file as a string, or a "Could not fetch ..." sentinel
        string on any non-200 response (callers test for this substring).
    """
    parts = repo_url.rstrip("/").split("/")
    owner, repo = parts[-2], parts[-1]
    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}"
    headers = {}
    # Explicit token takes precedence over the environment variable.
    if token:
        headers["Authorization"] = f"Bearer {token}"
    elif os.environ.get("GITHUB_ACCESS_TOKEN"):
        headers["Authorization"] = f"Bearer {os.environ.get('GITHUB_ACCESS_TOKEN')}"
    # timeout prevents hanging indefinitely on a stalled connection
    response = requests.get(api_url, headers=headers, timeout=30)
    if response.status_code == 200:
        # The contents API returns the file body base64-encoded.
        content = base64.b64decode(response.json()["content"]).decode("utf-8")
        return content
    else:
        return f"Could not fetch {file_path}"
def gather_repository_info(
    repo_url: str, token: Optional[str] = None
) -> tuple[str, str, str]:
    """
    Gather all necessary repository information.

    Args:
        repo_url: GitHub repository URL
        token: Optional GitHub access token for authentication

    Returns:
        Tuple of (file_tree, readme_content, package_files_content)
    """
    tree = get_github_file_tree(repo_url, token)
    readme = get_github_file_content(repo_url, "README.md", token)

    # Collect whichever well-known package/config files the repo provides.
    candidates = (
        "pyproject.toml",
        "setup.py",
        "requirements.txt",
        "package.json",
        "Cargo.toml",
        "go.mod",
    )
    sections = []
    for candidate in candidates:
        try:
            body = get_github_file_content(repo_url, candidate, token)
        except Exception:
            continue
        # The helper returns a "Could not fetch ..." sentinel for missing files.
        if "Could not fetch" not in body:
            sections.append(f"=== {candidate} ===\n{body}")

    if sections:
        package_files_content = "\n\n".join(sections)
    else:
        package_files_content = "No package files found"
    return tree, readme, package_files_content

52
src/signatures.py Normal file
View File

@@ -0,0 +1,52 @@
import dspy
class AnalyzeRepository(dspy.Signature):
    """Analyze a repository structure and identify key components."""

    # NOTE: the class docstring above is the prompt instruction sent to the
    # LM (it matches agent.json); do not edit it casually.
    repo_url: str = dspy.InputField(desc="GitHub repository URL")
    file_tree: str = dspy.InputField(desc="Repository file structure")
    readme_content: str = dspy.InputField(desc="README.md content")
    project_purpose: str = dspy.OutputField(
        desc="Main purpose and goals of the project"
    )
    key_concepts: list[str] = dspy.OutputField(
        desc="List of important concepts and terminology"
    )
    architecture_overview: str = dspy.OutputField(
        desc="High-level architecture description"
    )
class AnalyzeCodeStructure(dspy.Signature):
    """Analyze code structure to identify important directories and files."""

    # NOTE: the class docstring above is the prompt instruction sent to the
    # LM (it matches agent.json); do not edit it casually.
    file_tree: str = dspy.InputField(desc="Repository file structure")
    package_files: str = dspy.InputField(desc="Key package and configuration files")
    important_directories: list[str] = dspy.OutputField(
        desc="Key directories and their purposes"
    )
    entry_points: list[str] = dspy.OutputField(
        desc="Main entry points and important files"
    )
    development_info: str = dspy.OutputField(
        desc="Development setup and workflow information"
    )
class GenerateLLMsTxt(dspy.Signature):
    """Generate a comprehensive llms.txt file from analyzed repository information."""

    # NOTE: the class docstring above is the prompt instruction sent to the
    # LM (it matches agent.json); do not edit it casually.
    # Inputs come from the AnalyzeRepository / AnalyzeCodeStructure stages.
    project_purpose: str = dspy.InputField()
    key_concepts: list[str] = dspy.InputField()
    architecture_overview: str = dspy.InputField()
    important_directories: list[str] = dspy.InputField()
    entry_points: list[str] = dspy.InputField()
    development_info: str = dspy.InputField()
    usage_examples: str = dspy.InputField(desc="Common usage patterns and examples")
    llms_txt_content: str = dspy.OutputField(
        desc="Complete llms.txt file content following the standard format"
    )