Files
llms-txt-generator/src/analyzer.py
2025-11-24 22:13:32 -05:00

118 lines
4.3 KiB
Python

import os
from typing import Optional
import dspy
from modaic import PrecompiledAgent, PrecompiledConfig
from .github_utils import gather_repository_info
from .signatures import AnalyzeRepository, AnalyzeCodeStructure, GenerateLLMsTxt
class RepositoryAnalyzerConfig(PrecompiledConfig):
    """Settings consumed by ``RepositoryAnalyzer``.

    Each ``*_model`` field is a model identifier that the agent passes to
    ``dspy.LM`` for one stage of the pipeline. ``max_tokens`` and
    ``temperature`` are shared by every stage.
    """

    # Model for the repository purpose / key-concept analysis stage.
    repo_analyzer_model: str = "gpt-5.1-2025-11-13"
    # Model for the code-structure analysis and usage-example stages.
    code_analyzer_model: str = "gpt-5.1-2025-11-13"
    # Model that writes the final llms.txt document.
    llms_txt_generator_model: str = "gpt-4o"

    # Sampling settings applied to all three language models.
    # NOTE(review): 16000 / 1.0 look chosen to satisfy the constraints dspy.LM
    # places on OpenAI reasoning models -- confirm before changing them.
    max_tokens: int = 16000
    temperature: float = 1.0
class RepositoryAnalyzer(PrecompiledAgent):
    """Analyze a GitHub repository and generate llms.txt documentation.

    The agent chains four ``dspy.ChainOfThought`` predictors: repository
    analysis, code-structure analysis, usage-example generation and final
    llms.txt generation. Each predictor is bound to the language model named
    in the supplied ``RepositoryAnalyzerConfig``.
    """

    config: RepositoryAnalyzerConfig

    def __init__(self, config: RepositoryAnalyzerConfig, **kwargs):
        """Create the language models and predictors used by ``forward``.

        Args:
            config: Model names plus the ``max_tokens`` / ``temperature``
                settings shared by all language models.
            **kwargs: Forwarded unchanged to ``PrecompiledAgent.__init__``.
        """
        super().__init__(config, **kwargs)

        def build_lm(model_name: str):
            # All three LMs share the same sampling settings; only the model
            # identifier differs.
            return dspy.LM(
                model_name,
                max_tokens=config.max_tokens,
                temperature=config.temperature,
            )

        self.repo_analyzer_lm = build_lm(config.repo_analyzer_model)
        self.code_analyzer_lm = build_lm(config.code_analyzer_model)
        self.llms_txt_generator_lm = build_lm(config.llms_txt_generator_model)

        self.analyze_repo = dspy.ChainOfThought(AnalyzeRepository)
        self.analyze_repo.set_lm(self.repo_analyzer_lm)

        self.analyze_structure = dspy.ChainOfThought(AnalyzeCodeStructure)
        self.analyze_structure.set_lm(self.code_analyzer_lm)

        # Usage examples reuse the code-analysis model rather than a dedicated one.
        self.generate_examples = dspy.ChainOfThought("repo_info -> usage_examples")
        self.generate_examples.set_lm(self.code_analyzer_lm)

        self.generate_llms_txt = dspy.ChainOfThought(GenerateLLMsTxt)
        self.generate_llms_txt.set_lm(self.llms_txt_generator_lm)

    def forward(
        self,
        repo_url: str,
        output_file: str = "llms.txt",
    ):
        """Analyze a repository and generate llms.txt content.

        Runs the complete pipeline:

        1. Fetch the file tree, README and package files via
           ``gather_repository_info`` (authenticated with the
           ``GITHUB_ACCESS_TOKEN`` environment variable when it is set).
        2. Analyze the repository's purpose and its code structure.
        3. Generate usage examples and the final llms.txt text.
        4. Write the result to ``output_file`` and print a short preview.

        Args:
            repo_url: GitHub repository URL
                (e.g., https://github.com/stanfordnlp/dspy).
            output_file: Path of the file to write (default: llms.txt).
                An existing file is overwritten.

        Returns:
            dspy.Prediction with ``llms_txt_content``, ``analysis`` and
            ``structure`` fields.
        """
        preview_chars = 500

        print(f"Fetching repository information from {repo_url}...")
        file_tree, readme_content, package_files = gather_repository_info(
            repo_url, token=os.environ.get("GITHUB_ACCESS_TOKEN")
        )

        print("Analyzing repository structure and purpose...")
        repo_analysis = self.analyze_repo(
            repo_url=repo_url, file_tree=file_tree, readme_content=readme_content
        )

        print("Analyzing code structure...")
        structure_analysis = self.analyze_structure(
            file_tree=file_tree, package_files=package_files
        )

        print("Generating usage examples...")
        usage_examples = self.generate_examples(
            repo_info=f"Purpose: {repo_analysis.project_purpose}\nConcepts: {repo_analysis.key_concepts}"
        )

        print("Generating llms.txt content...")
        llms_txt = self.generate_llms_txt(
            project_purpose=repo_analysis.project_purpose,
            key_concepts=repo_analysis.key_concepts,
            architecture_overview=repo_analysis.architecture_overview,
            important_directories=structure_analysis.important_directories,
            entry_points=structure_analysis.entry_points,
            development_info=structure_analysis.development_info,
            usage_examples=usage_examples.usage_examples,
        )
        content = llms_txt.llms_txt_content

        # Explicit UTF-8: generated Markdown often contains non-ASCII characters,
        # and the platform default encoding may be unable to represent them.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(content)

        print(f"\nGenerated llms.txt saved to: {output_file}")
        print("\nPreview (first 500 characters):")
        # Only add the ellipsis when the preview really is a truncation.
        preview = content[:preview_chars]
        if len(content) > preview_chars:
            preview += "..."
        print(preview)

        return dspy.Prediction(
            llms_txt_content=content,
            analysis=repo_analysis,
            structure=structure_analysis,
        )