# llms-txt-generator/src/analyzer.py

import os
from typing import Optional

import dspy
from modaic import PrecompiledAgent, PrecompiledConfig

from .github_utils import gather_repository_info
from .signatures import AnalyzeRepository, AnalyzeCodeStructure, GenerateLLMsTxt


class RepositoryAnalyzerConfig(PrecompiledConfig):
    """Configuration for RepositoryAnalyzer.

    Holds the model identifiers and the sampling settings that
    RepositoryAnalyzer passes to ``dspy.LM`` when it builds its three
    language-model clients. ``max_tokens`` and ``temperature`` are shared by
    all three clients.
    """

    # Model used for the repository-level analysis step (AnalyzeRepository).
    repo_analyzer_model: str = "gpt-5.1-2025-11-13"
    # Model used for the code-structure analysis step and for generating
    # usage examples (both predictors are bound to the same LM).
    code_analyzer_model: str = "gpt-5.1-2025-11-13"
    # Model used for the final llms.txt generation step (GenerateLLMsTxt).
    llms_txt_generator_model: str = "gpt-4o"
    # Forwarded as ``max_tokens`` to every dspy.LM built from this config.
    max_tokens: int = 16000
    # Forwarded as ``temperature`` to every dspy.LM built from this config.
    temperature: float = 1.0


class RepositoryAnalyzer(PrecompiledAgent):
    """Analyzes repository and generates llms.txt documentation.

    The agent chains four ``dspy.ChainOfThought`` predictors:

    * ``analyze_repo``       -- AnalyzeRepository, bound to ``repo_analyzer_lm``
    * ``analyze_structure``  -- AnalyzeCodeStructure, bound to ``code_analyzer_lm``
    * ``generate_examples``  -- ``"repo_info -> usage_examples"``, bound to
      ``code_analyzer_lm``
    * ``generate_llms_txt``  -- GenerateLLMsTxt, bound to ``llms_txt_generator_lm``
    """

    config: RepositoryAnalyzerConfig

    def __init__(self, config: RepositoryAnalyzerConfig, **kwargs):
        """Build the language-model clients and predictors from ``config``.

        Args:
            config: Supplies the three model identifiers plus the
                ``max_tokens`` and ``temperature`` shared by every client.
            **kwargs: Forwarded unchanged to ``PrecompiledAgent.__init__``.
        """
        super().__init__(config, **kwargs)

        def build_lm(model: str) -> dspy.LM:
            # All three clients differ only in the model identifier.
            return dspy.LM(
                model,
                max_tokens=config.max_tokens,
                temperature=config.temperature,
            )

        # Attribute assignment order is kept identical to the original layout.
        self.repo_analyzer_lm = build_lm(config.repo_analyzer_model)
        self.code_analyzer_lm = build_lm(config.code_analyzer_model)
        self.llms_txt_generator_lm = build_lm(config.llms_txt_generator_model)

        self.analyze_repo = dspy.ChainOfThought(AnalyzeRepository)
        self.analyze_repo.set_lm(self.repo_analyzer_lm)
        self.analyze_structure = dspy.ChainOfThought(AnalyzeCodeStructure)
        self.analyze_structure.set_lm(self.code_analyzer_lm)
        self.generate_examples = dspy.ChainOfThought("repo_info -> usage_examples")
        self.generate_examples.set_lm(self.code_analyzer_lm)
        self.generate_llms_txt = dspy.ChainOfThought(GenerateLLMsTxt)
        self.generate_llms_txt.set_lm(self.llms_txt_generator_lm)

    def forward(
        self,
        repo_url: str,
        output_file: str = "llms.txt",
    ) -> dspy.Prediction:
        """
        Analyze repository and generate llms.txt content.

        This method handles the complete pipeline:
        1. Fetches repository information from GitHub
        2. Analyzes repository structure and purpose
        3. Analyzes code structure
        4. Generates usage examples
        5. Generates llms.txt documentation and writes it to a file

        The GitHub token is read from the ``GITHUB_ACCESS_TOKEN`` environment
        variable; ``None`` is passed to ``gather_repository_info`` when it is
        unset. Progress messages and a preview are printed to stdout.

        Args:
            repo_url: GitHub repository URL (e.g., https://github.com/stanfordnlp/dspy)
            output_file: Path to output file (default: llms.txt)

        Returns:
            dspy.Prediction with llms_txt_content, analysis, and structure

        Raises:
            OSError: If ``output_file`` cannot be opened for writing.
        """
        print(f"Fetching repository information from {repo_url}...")
        file_tree, readme_content, package_files = gather_repository_info(
            repo_url, token=os.environ.get("GITHUB_ACCESS_TOKEN")
        )

        print("Analyzing repository structure and purpose...")
        repo_analysis = self.analyze_repo(
            repo_url=repo_url, file_tree=file_tree, readme_content=readme_content
        )

        print("Analyzing code structure...")
        structure_analysis = self.analyze_structure(
            file_tree=file_tree, package_files=package_files
        )

        print("Generating usage examples...")
        usage_examples = self.generate_examples(
            repo_info=f"Purpose: {repo_analysis.project_purpose}\nConcepts: {repo_analysis.key_concepts}"
        )

        print("Generating llms.txt content...")
        llms_txt = self.generate_llms_txt(
            project_purpose=repo_analysis.project_purpose,
            key_concepts=repo_analysis.key_concepts,
            architecture_overview=repo_analysis.architecture_overview,
            important_directories=structure_analysis.important_directories,
            entry_points=structure_analysis.entry_points,
            development_info=structure_analysis.development_info,
            usage_examples=usage_examples.usage_examples,
        )
        content = llms_txt.llms_txt_content

        # Explicit UTF-8: generated text commonly contains non-ASCII
        # characters, and the locale default codec may not be able to encode
        # them (e.g. cp1252 on Windows).
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(content)

        print(f"\nGenerated llms.txt saved to: {output_file}")
        print("\nPreview (first 500 characters):")
        preview = content[:500]
        if len(content) > 500:
            # Only signal truncation when something was actually cut off.
            preview += "..."
        print(preview)

        return dspy.Prediction(
            llms_txt_content=content,
            analysis=repo_analysis,
            structure=structure_analysis,
        )