(no commit message)

This commit is contained in:
2025-11-24 22:04:16 -05:00
parent e5757a367b
commit 2c346586a9
12 changed files with 601 additions and 18 deletions

18
LICENSE
View File

@@ -1,18 +0,0 @@
MIT License
Copyright (c) 2025 johwang
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the
following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial
portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
USE OR OTHER DEALINGS IN THE SOFTWARE.

BIN
README.md

Binary file not shown.

193
agent.json Normal file
View File

@@ -0,0 +1,193 @@
{
"analyze_repo.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Analyze a repository structure and identify key components.",
"fields": [
{
"prefix": "Repo Url:",
"description": "GitHub repository URL"
},
{
"prefix": "File Tree:",
"description": "Repository file structure"
},
{
"prefix": "Readme Content:",
"description": "README.md content"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Project Purpose:",
"description": "Main purpose and goals of the project"
},
{
"prefix": "Key Concepts:",
"description": "List of important concepts and terminology"
},
{
"prefix": "Architecture Overview:",
"description": "High-level architecture description"
}
]
},
"lm": {
"model": "gpt-4o",
"model_type": "chat",
"cache": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.7,
"max_tokens": 8192
}
},
"analyze_structure.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Analyze code structure to identify important directories and files.",
"fields": [
{
"prefix": "File Tree:",
"description": "Repository file structure"
},
{
"prefix": "Package Files:",
"description": "Key package and configuration files"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Important Directories:",
"description": "Key directories and their purposes"
},
{
"prefix": "Entry Points:",
"description": "Main entry points and important files"
},
{
"prefix": "Development Info:",
"description": "Development setup and workflow information"
}
]
},
"lm": {
"model": "gpt-4o",
"model_type": "chat",
"cache": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.7,
"max_tokens": 8192
}
},
"generate_examples.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Given the fields `repo_info`, produce the fields `usage_examples`.",
"fields": [
{
"prefix": "Repo Info:",
"description": "${repo_info}"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Usage Examples:",
"description": "${usage_examples}"
}
]
},
"lm": {
"model": "gpt-4o",
"model_type": "chat",
"cache": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.7,
"max_tokens": 8192
}
},
"generate_llms_txt.predict": {
"traces": [],
"train": [],
"demos": [],
"signature": {
"instructions": "Generate a comprehensive llms.txt file from analyzed repository information.",
"fields": [
{
"prefix": "Project Purpose:",
"description": "${project_purpose}"
},
{
"prefix": "Key Concepts:",
"description": "${key_concepts}"
},
{
"prefix": "Architecture Overview:",
"description": "${architecture_overview}"
},
{
"prefix": "Important Directories:",
"description": "${important_directories}"
},
{
"prefix": "Entry Points:",
"description": "${entry_points}"
},
{
"prefix": "Development Info:",
"description": "${development_info}"
},
{
"prefix": "Usage Examples:",
"description": "Common usage patterns and examples"
},
{
"prefix": "Reasoning: Let's think step by step in order to",
"description": "${reasoning}"
},
{
"prefix": "Llms Txt Content:",
"description": "Complete llms.txt file content following the standard format"
}
]
},
"lm": {
"model": "gpt-4o",
"model_type": "chat",
"cache": true,
"num_retries": 3,
"finetuning_model": null,
"launch_kwargs": {},
"train_kwargs": {},
"temperature": 0.7,
"max_tokens": 8192
}
},
"metadata": {
"dependency_versions": {
"python": "3.13",
"dspy": "3.0.4",
"cloudpickle": "3.1"
}
}
}

4
auto_classes.json Normal file
View File

@@ -0,0 +1,4 @@
{
"AutoConfig": "src.analyzer.RepositoryAnalyzerConfig",
"AutoAgent": "src.analyzer.RepositoryAnalyzer"
}

15
compile.py Normal file
View File

@@ -0,0 +1,15 @@
import sys
from src.analyzer import RepositoryAnalyzer, RepositoryAnalyzerConfig
# Module-level agent instance, constructed at import time so it is importable
# as `compile.llms_txt_generator` and reusable by main() below.
llms_txt_generator = RepositoryAnalyzer(
    config=RepositoryAnalyzerConfig()
)
def main() -> None:
    """Push the precompiled agent (with its code) to the hub.

    On failure, prints the error to stderr and exits with status 1 so that
    CI or shell callers can detect the failed publish (the original version
    swallowed the exception and exited 0).
    """
    try:
        llms_txt_generator.push_to_hub("johwang/llms-txt-generator", with_code=True)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

7
config.json Normal file
View File

@@ -0,0 +1,7 @@
{
"repo_analyzer_model": "gpt-4o",
"code_analyzer_model": "gpt-4o",
"llms_txt_generator_model": "gpt-4o",
"max_tokens": 8192,
"temperature": 0.7
}

17
pyproject.toml Normal file
View File

@@ -0,0 +1,17 @@
[project]
name = "llms-txt-generator"
version = "0.1.0"
description = "Generate llms.txt documentation for GitHub repositories using DSPy"
readme = "README.md"
requires-python = ">=3.10"
dependencies = ["dspy>=3.0.4", "requests>=2.31.0", "python-dotenv>=1.0.0"]
[project.scripts]
llmstxt-gen = "main:main"
[project.optional-dependencies]
dev = [
"pytest>=7.4.0",
"black>=23.0.0",
"ruff>=0.1.0",
]

14
src/__init__.py Normal file
View File

@@ -0,0 +1,14 @@
"""Public API for the llms.txt generator package.

Re-exports the DSPy signatures, the analyzer agent and its config, the
GitHub data-gathering helper, and the convenience entry point.
"""

from .signatures import AnalyzeRepository, AnalyzeCodeStructure, GenerateLLMsTxt
from .analyzer import RepositoryAnalyzer, RepositoryAnalyzerConfig
from .github_utils import gather_repository_info
from .generator import generate_llms_txt

# Explicit public surface of the package.
__all__ = [
    "AnalyzeRepository",
    "AnalyzeCodeStructure",
    "GenerateLLMsTxt",
    "RepositoryAnalyzer",
    "RepositoryAnalyzerConfig",
    "gather_repository_info",
    "generate_llms_txt",
]

117
src/analyzer.py Normal file
View File

@@ -0,0 +1,117 @@
import os
from typing import Optional
import dspy
from modaic import PrecompiledAgent, PrecompiledConfig
from .github_utils import gather_repository_info
from .signatures import AnalyzeRepository, AnalyzeCodeStructure, GenerateLLMsTxt
class RepositoryAnalyzerConfig(PrecompiledConfig):
    """Configuration for RepositoryAnalyzer."""

    # Model identifier for each pipeline stage; any model string accepted by
    # dspy.LM works here.
    repo_analyzer_model: str = "gpt-4o"
    code_analyzer_model: str = "gpt-4o"
    llms_txt_generator_model: str = "gpt-4o"
    # Generation settings shared by all three stage LMs.
    max_tokens: int = 8192
    temperature: float = 0.7
class RepositoryAnalyzer(PrecompiledAgent):
    """Analyzes a repository and generates llms.txt documentation.

    Runs a four-stage DSPy pipeline — repository analysis, code-structure
    analysis, usage-example generation, and llms.txt composition — where
    each stage gets its own LM so the models can be configured independently
    via RepositoryAnalyzerConfig.
    """

    config: RepositoryAnalyzerConfig

    def __init__(self, config: RepositoryAnalyzerConfig, **kwargs):
        super().__init__(config, **kwargs)
        # One LM per stage; all share max_tokens/temperature from the config.
        self.repo_analyzer_lm = dspy.LM(
            config.repo_analyzer_model,
            max_tokens=config.max_tokens,
            temperature=config.temperature,
        )
        self.code_analyzer_lm = dspy.LM(
            config.code_analyzer_model,
            max_tokens=config.max_tokens,
            temperature=config.temperature,
        )
        self.llms_txt_generator_lm = dspy.LM(
            config.llms_txt_generator_model,
            max_tokens=config.max_tokens,
            temperature=config.temperature,
        )
        self.analyze_repo = dspy.ChainOfThought(AnalyzeRepository)
        self.analyze_repo.set_lm(self.repo_analyzer_lm)
        self.analyze_structure = dspy.ChainOfThought(AnalyzeCodeStructure)
        self.analyze_structure.set_lm(self.code_analyzer_lm)
        # Inline string signature: a dedicated Signature class is overkill
        # for the single-input/single-output examples stage.
        self.generate_examples = dspy.ChainOfThought("repo_info -> usage_examples")
        self.generate_examples.set_lm(self.code_analyzer_lm)
        self.generate_llms_txt = dspy.ChainOfThought(GenerateLLMsTxt)
        self.generate_llms_txt.set_lm(self.llms_txt_generator_lm)

    def forward(
        self,
        repo_url: str,
        output_file: str = "llms.txt",
        github_token: Optional[str] = None,
    ):
        """
        Analyze repository and generate llms.txt content.

        This method handles the complete pipeline:
        1. Fetches repository information from GitHub
        2. Analyzes repository structure and purpose
        3. Generates llms.txt documentation
        4. Writes the result to a file

        Args:
            repo_url: GitHub repository URL (e.g., https://github.com/stanfordnlp/dspy)
            output_file: Path to output file (default: llms.txt)
            github_token: GitHub access token; falls back to the
                GITHUB_ACCESS_TOKEN environment variable when omitted.

        Returns:
            dspy.Prediction with llms_txt_content, analysis, and structure
        """
        print(f"Fetching repository information from {repo_url}...")
        # Accepting github_token here fixes callers (src/generator.py) that
        # pass one; an explicit argument takes precedence over the env var.
        token = github_token or os.environ.get("GITHUB_ACCESS_TOKEN")
        file_tree, readme_content, package_files = gather_repository_info(
            repo_url, token=token
        )
        print("Analyzing repository structure and purpose...")
        repo_analysis = self.analyze_repo(
            repo_url=repo_url, file_tree=file_tree, readme_content=readme_content
        )
        print("Analyzing code structure...")
        structure_analysis = self.analyze_structure(
            file_tree=file_tree, package_files=package_files
        )
        print("Generating usage examples...")
        usage_examples = self.generate_examples(
            repo_info=f"Purpose: {repo_analysis.project_purpose}\nConcepts: {repo_analysis.key_concepts}"
        )
        print("Generating llms.txt content...")
        llms_txt = self.generate_llms_txt(
            project_purpose=repo_analysis.project_purpose,
            key_concepts=repo_analysis.key_concepts,
            architecture_overview=repo_analysis.architecture_overview,
            important_directories=structure_analysis.important_directories,
            entry_points=structure_analysis.entry_points,
            development_info=structure_analysis.development_info,
            usage_examples=usage_examples.usage_examples,
        )
        # Explicit encoding keeps output identical across platforms (the
        # default encoding is locale-dependent, e.g. cp1252 on Windows).
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(llms_txt.llms_txt_content)
        print(f"\nGenerated llms.txt saved to: {output_file}")
        print("\nPreview (first 500 characters):")
        print(llms_txt.llms_txt_content[:500] + "...")
        return dspy.Prediction(
            llms_txt_content=llms_txt.llms_txt_content,
            analysis=repo_analysis,
            structure=structure_analysis,
        )

56
src/generator.py Normal file
View File

@@ -0,0 +1,56 @@
import os
from typing import Optional

from dotenv import load_dotenv

from .analyzer import RepositoryAnalyzer, RepositoryAnalyzerConfig

load_dotenv()
def generate_llms_txt(
    repo_url: str,
    output_file: str = "llms.txt",
    model: str = "gpt-4o",
    github_token: Optional[str] = None,
) -> str:
    """
    Generate llms.txt file for a given repository.

    This is a convenience wrapper around RepositoryAnalyzer that handles
    configuration and invokes the analyzer's forward method.

    Args:
        repo_url: GitHub repository URL (e.g., https://github.com/stanfordnlp/dspy)
        output_file: Path to output file (default: llms.txt)
        model: Language model to use for analysis (default: gpt-4o)
        github_token: GitHub access token (optional, can also use GITHUB_ACCESS_TOKEN env var)

    Returns:
        The generated llms.txt content

    Example:
        >>> result = generate_llms_txt("https://github.com/stanfordnlp/dspy")
        >>> print(result[:100])
    """
    # Export the token so the analyzer picks it up from the environment.
    # The original code passed github_token as a kwarg to analyzer(), but
    # RepositoryAnalyzer.forward does not accept it, raising a TypeError.
    if github_token:
        os.environ["GITHUB_ACCESS_TOKEN"] = github_token

    # Use the same model for all three pipeline stages.
    config = RepositoryAnalyzerConfig(
        repo_analyzer_model=model,
        code_analyzer_model=model,
        llms_txt_generator_model=model,
    )
    analyzer = RepositoryAnalyzer(config=config)

    # Run the complete pipeline.
    result = analyzer(
        repo_url=repo_url,
        output_file=output_file,
    )
    return result.llms_txt_content
# Ad-hoc smoke test: generate llms.txt for the DSPy repository when this
# module is executed directly.
if __name__ == "__main__":
    generate_llms_txt("https://github.com/stanfordnlp/dspy")

126
src/github_utils.py Normal file
View File

@@ -0,0 +1,126 @@
import base64
import os
from typing import Optional
from dotenv import load_dotenv
import requests
load_dotenv()
def get_github_file_tree(repo_url: str, token: Optional[str] = None) -> str:
    """
    Get repository file structure from GitHub API.

    Args:
        repo_url: GitHub repository URL (e.g., https://github.com/owner/repo)
        token: Optional GitHub access token for authentication

    Returns:
        Newline-separated list of file paths in the repository

    Raises:
        Exception: If the API request fails
    """
    # extract owner/repo from URL
    parts = repo_url.rstrip("/").split("/")
    owner, repo = parts[-2], parts[-1]
    # Build the auth header once; an explicit token takes precedence over
    # the GITHUB_ACCESS_TOKEN environment variable.
    headers = {}
    auth = token or os.environ.get("GITHUB_ACCESS_TOKEN")
    if auth:
        headers["Authorization"] = f"Bearer {auth}"
    # try both main and master branches
    for branch in ["main", "master"]:
        api_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
        # timeout prevents hanging indefinitely on a stalled connection
        response = requests.get(api_url, headers=headers, timeout=30)
        if response.status_code == 200:
            tree_data = response.json()
            # type "blob" = regular file (directories are "tree" entries)
            file_paths = [
                item["path"] for item in tree_data["tree"] if item["type"] == "blob"
            ]
            return "\n".join(sorted(file_paths))
        elif response.status_code != 404:
            raise Exception(
                f"Failed to fetch repository tree: {response.status_code} - {response.text}"
            )
    raise Exception("Could not fetch repository tree. Tried branches: main, master")
def get_github_file_content(
    repo_url: str, file_path: str, token: Optional[str] = None
) -> str:
    """
    Get specific file content from GitHub.

    Args:
        repo_url: GitHub repository URL
        file_path: Path to the file within the repository
        token: Optional GitHub access token for authentication

    Returns:
        Content of the file as a string, or a "Could not fetch ..." sentinel
        string on any non-200 response (callers test for this substring).
    """
    parts = repo_url.rstrip("/").split("/")
    owner, repo = parts[-2], parts[-1]
    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}"
    headers = {}
    # Explicit token takes precedence over the environment variable.
    if token:
        headers["Authorization"] = f"Bearer {token}"
    elif os.environ.get("GITHUB_ACCESS_TOKEN"):
        headers["Authorization"] = f"Bearer {os.environ.get('GITHUB_ACCESS_TOKEN')}"
    # timeout prevents hanging indefinitely on a stalled connection
    response = requests.get(api_url, headers=headers, timeout=30)
    if response.status_code == 200:
        # The contents API returns the file body base64-encoded.
        content = base64.b64decode(response.json()["content"]).decode("utf-8")
        return content
    else:
        return f"Could not fetch {file_path}"
def gather_repository_info(
    repo_url: str, token: Optional[str] = None
) -> tuple[str, str, str]:
    """
    Gather all necessary repository information.

    Args:
        repo_url: GitHub repository URL
        token: Optional GitHub access token for authentication

    Returns:
        Tuple of (file_tree, readme_content, package_files_content)
    """
    tree = get_github_file_tree(repo_url, token)
    readme = get_github_file_content(repo_url, "README.md", token)

    # Collect whichever well-known package/config files the repo provides.
    candidates = (
        "pyproject.toml",
        "setup.py",
        "requirements.txt",
        "package.json",
        "Cargo.toml",
        "go.mod",
    )
    sections = []
    for candidate in candidates:
        try:
            body = get_github_file_content(repo_url, candidate, token)
        except Exception:
            continue
        # The helper returns a "Could not fetch ..." sentinel for missing files.
        if "Could not fetch" not in body:
            sections.append(f"=== {candidate} ===\n{body}")

    if sections:
        package_files_content = "\n\n".join(sections)
    else:
        package_files_content = "No package files found"
    return tree, readme, package_files_content

52
src/signatures.py Normal file
View File

@@ -0,0 +1,52 @@
import dspy
class AnalyzeRepository(dspy.Signature):
    """Analyze a repository structure and identify key components."""

    # NOTE: the class docstring above is the prompt instruction sent to the
    # LM (it matches agent.json); do not edit it casually.
    repo_url: str = dspy.InputField(desc="GitHub repository URL")
    file_tree: str = dspy.InputField(desc="Repository file structure")
    readme_content: str = dspy.InputField(desc="README.md content")
    project_purpose: str = dspy.OutputField(
        desc="Main purpose and goals of the project"
    )
    key_concepts: list[str] = dspy.OutputField(
        desc="List of important concepts and terminology"
    )
    architecture_overview: str = dspy.OutputField(
        desc="High-level architecture description"
    )
class AnalyzeCodeStructure(dspy.Signature):
    """Analyze code structure to identify important directories and files."""

    # NOTE: the class docstring above is the prompt instruction sent to the
    # LM (it matches agent.json); do not edit it casually.
    file_tree: str = dspy.InputField(desc="Repository file structure")
    package_files: str = dspy.InputField(desc="Key package and configuration files")
    important_directories: list[str] = dspy.OutputField(
        desc="Key directories and their purposes"
    )
    entry_points: list[str] = dspy.OutputField(
        desc="Main entry points and important files"
    )
    development_info: str = dspy.OutputField(
        desc="Development setup and workflow information"
    )
class GenerateLLMsTxt(dspy.Signature):
    """Generate a comprehensive llms.txt file from analyzed repository information."""

    # NOTE: the class docstring above is the prompt instruction sent to the
    # LM (it matches agent.json); do not edit it casually.
    # Inputs come from the AnalyzeRepository / AnalyzeCodeStructure stages.
    project_purpose: str = dspy.InputField()
    key_concepts: list[str] = dspy.InputField()
    architecture_overview: str = dspy.InputField()
    important_directories: list[str] = dspy.InputField()
    entry_points: list[str] = dspy.InputField()
    development_info: str = dspy.InputField()
    usage_examples: str = dspy.InputField(desc="Common usage patterns and examples")
    llms_txt_content: str = dspy.OutputField(
        desc="Complete llms.txt file content following the standard format"
    )