diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index dba48ff..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,18 +0,0 @@
-MIT License
-
-Copyright (c) 2025 johwang
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
-associated documentation files (the "Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the
-following conditions:
-
-The above copyright notice and this permission notice shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
-LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
-EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
-USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
index 61f14cd..bdd289c 100644
Binary files a/README.md and b/README.md differ
diff --git a/agent.json b/agent.json
new file mode 100644
index 0000000..686dc98
--- /dev/null
+++ b/agent.json
@@ -0,0 +1,193 @@
+{
+  "analyze_repo.predict": {
+    "traces": [],
+    "train": [],
+    "demos": [],
+    "signature": {
+      "instructions": "Analyze a repository structure and identify key components.",
+      "fields": [
+        {
+          "prefix": "Repo Url:",
+          "description": "GitHub repository URL"
+        },
+        {
+          "prefix": "File Tree:",
+          "description": "Repository file structure"
+        },
+        {
+          "prefix": "Readme Content:",
+          "description": "README.md content"
+        },
+        {
+          "prefix": "Reasoning: Let's think step by step in order to",
+          "description": "${reasoning}"
+        },
+        {
+          "prefix": "Project Purpose:",
+          "description": "Main purpose and goals of the project"
+        },
+        {
+          "prefix": "Key Concepts:",
+          "description": "List of important concepts and terminology"
+        },
+        {
+          "prefix": "Architecture Overview:",
+          "description": "High-level architecture description"
+        }
+      ]
+    },
+    "lm": {
+      "model": "gpt-4o",
+      "model_type": "chat",
+      "cache": true,
+      "num_retries": 3,
+      "finetuning_model": null,
+      "launch_kwargs": {},
+      "train_kwargs": {},
+      "temperature": 0.7,
+      "max_tokens": 8192
+    }
+  },
+  "analyze_structure.predict": {
+    "traces": [],
+    "train": [],
+    "demos": [],
+    "signature": {
+      "instructions": "Analyze code structure to identify important directories and files.",
+      "fields": [
+        {
+          "prefix": "File Tree:",
+          "description": "Repository file structure"
+        },
+        {
+          "prefix": "Package Files:",
+          "description": "Key package and configuration files"
+        },
+        {
+          "prefix": "Reasoning: Let's think step by step in order to",
+          "description": "${reasoning}"
+        },
+        {
+          "prefix": "Important Directories:",
+          "description": "Key directories and their purposes"
+        },
+        {
+          "prefix": "Entry Points:",
+          "description": "Main entry points and important files"
+        },
+        {
+          "prefix": "Development Info:",
+          "description": "Development setup and workflow information"
+        }
+      ]
+    },
+    "lm": {
+      "model": "gpt-4o",
+      "model_type": "chat",
+      "cache": true,
+      "num_retries": 3,
+      "finetuning_model": null,
+      "launch_kwargs": {},
+      "train_kwargs": {},
+      "temperature": 0.7,
+      "max_tokens": 8192
+    }
+  },
+  "generate_examples.predict": {
+    "traces": [],
+    "train": [],
+    "demos": [],
+    "signature": {
+      "instructions": "Given the fields `repo_info`, produce the fields `usage_examples`.",
+      "fields": [
+        {
+          "prefix": "Repo Info:",
+          "description": "${repo_info}"
+        },
+        {
+          "prefix": "Reasoning: Let's think step by step in order to",
+          "description": "${reasoning}"
+        },
+        {
+          "prefix": "Usage Examples:",
+          "description": "${usage_examples}"
+        }
+      ]
+    },
+    "lm": {
+      "model": "gpt-4o",
+      "model_type": "chat",
+      "cache": true,
+      "num_retries": 3,
+      "finetuning_model": null,
+      "launch_kwargs": {},
+      "train_kwargs": {},
+      "temperature": 0.7,
+      "max_tokens": 8192
+    }
+  },
+  "generate_llms_txt.predict": {
+    "traces": [],
+    "train": [],
+    "demos": [],
+    "signature": {
+      "instructions": "Generate a comprehensive llms.txt file from analyzed repository information.",
+      "fields": [
+        {
+          "prefix": "Project Purpose:",
+          "description": "${project_purpose}"
+        },
+        {
+          "prefix": "Key Concepts:",
+          "description": "${key_concepts}"
+        },
+        {
+          "prefix": "Architecture Overview:",
+          "description": "${architecture_overview}"
+        },
+        {
+          "prefix": "Important Directories:",
+          "description": "${important_directories}"
+        },
+        {
+          "prefix": "Entry Points:",
+          "description": "${entry_points}"
+        },
+        {
+          "prefix": "Development Info:",
+          "description": "${development_info}"
+        },
+        {
+          "prefix": "Usage Examples:",
+          "description": "Common usage patterns and examples"
+        },
+        {
+          "prefix": "Reasoning: Let's think step by step in order to",
+          "description": "${reasoning}"
+        },
+        {
+          "prefix": "Llms Txt Content:",
+          "description": "Complete llms.txt file content following the standard format"
+        }
+      ]
+    },
+    "lm": {
+      "model": "gpt-4o",
+      "model_type": "chat",
+      "cache": true,
+      "num_retries": 3,
+      "finetuning_model": null,
+      "launch_kwargs": {},
+      "train_kwargs": {},
+      "temperature": 0.7,
+      "max_tokens": 8192
+    }
+  },
+  "metadata": {
+    "dependency_versions": {
+      "python": "3.13",
+      "dspy": "3.0.4",
+      "cloudpickle": "3.1"
+    }
+  }
+}
\ No newline at end of file
diff --git a/auto_classes.json b/auto_classes.json
new file mode 100644
index 0000000..dde64b5
--- /dev/null
+++ b/auto_classes.json
@@ -0,0 +1,4 @@
+{
+  "AutoConfig": "src.analyzer.RepositoryAnalyzerConfig",
+  "AutoAgent": "src.analyzer.RepositoryAnalyzer"
+}
\ No newline at end of file
diff --git a/compile.py b/compile.py
new file mode 100644
index 0000000..181e03a
--- /dev/null
+++ b/compile.py
@@ -0,0 +1,15 @@
+import sys
+from src.analyzer import RepositoryAnalyzer, RepositoryAnalyzerConfig
+
+llms_txt_generator = RepositoryAnalyzer(
+    config=RepositoryAnalyzerConfig()
+)
+
+def main():
+    try:
+        llms_txt_generator.push_to_hub("johwang/llms-txt-generator", with_code=True)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+
+if __name__ == "__main__":
+    main()
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..cd004d0
--- /dev/null
+++ b/config.json
@@ -0,0 +1,7 @@
+{
+  "repo_analyzer_model": "gpt-4o",
+  "code_analyzer_model": "gpt-4o",
+  "llms_txt_generator_model": "gpt-4o",
+  "max_tokens": 8192,
+  "temperature": 0.7
+}
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..12f864e
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,17 @@
+[project]
+name = "llms-txt-generator"
+version = "0.1.0"
+description = "Generate llms.txt documentation for GitHub repositories using DSPy"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = ["dspy>=3.0.4", "requests>=2.31.0", "python-dotenv>=1.0.0"]
+
+[project.scripts]
+llmstxt-gen = "main:main"
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.4.0",
+    "black>=23.0.0",
+    "ruff>=0.1.0",
+]
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..aeaad7f
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,14 @@
+from .signatures import AnalyzeRepository, AnalyzeCodeStructure, GenerateLLMsTxt
+from .analyzer import RepositoryAnalyzer, RepositoryAnalyzerConfig
+from .github_utils import gather_repository_info
+from .generator import generate_llms_txt
+
+__all__ = [
+    "AnalyzeRepository",
+    "AnalyzeCodeStructure",
+    "GenerateLLMsTxt",
+    "RepositoryAnalyzer",
+    "RepositoryAnalyzerConfig",
+    "gather_repository_info",
+    "generate_llms_txt",
+]
diff --git a/src/analyzer.py b/src/analyzer.py
new file mode 100644
index 0000000..20c682c
--- /dev/null
+++ b/src/analyzer.py
@@ -0,0 +1,119 @@
+import os
+from typing import Optional
+
+import dspy
+from modaic import PrecompiledAgent, PrecompiledConfig
+
+from .github_utils import gather_repository_info
+from .signatures import AnalyzeRepository, AnalyzeCodeStructure, GenerateLLMsTxt
+
+
+class RepositoryAnalyzerConfig(PrecompiledConfig):
+    """Configuration for RepositoryAnalyzer."""
+
+    repo_analyzer_model: str = "gpt-4o"
+    code_analyzer_model: str = "gpt-4o"
+    llms_txt_generator_model: str = "gpt-4o"
+    max_tokens: int = 8192
+    temperature: float = 0.7
+
+
+class RepositoryAnalyzer(PrecompiledAgent):
+    """Analyzes repository and generates llms.txt documentation."""
+
+    config: RepositoryAnalyzerConfig
+
+    def __init__(self, config: RepositoryAnalyzerConfig, **kwargs):
+        super().__init__(config, **kwargs)
+        self.repo_analyzer_lm = dspy.LM(
+            config.repo_analyzer_model,
+            max_tokens=config.max_tokens,
+            temperature=config.temperature,
+        )
+        self.code_analyzer_lm = dspy.LM(
+            config.code_analyzer_model,
+            max_tokens=config.max_tokens,
+            temperature=config.temperature,
+        )
+        self.llms_txt_generator_lm = dspy.LM(
+            config.llms_txt_generator_model,
+            max_tokens=config.max_tokens,
+            temperature=config.temperature,
+        )
+        self.analyze_repo = dspy.ChainOfThought(AnalyzeRepository)
+        self.analyze_repo.set_lm(self.repo_analyzer_lm)
+        self.analyze_structure = dspy.ChainOfThought(AnalyzeCodeStructure)
+        self.analyze_structure.set_lm(self.code_analyzer_lm)
+        self.generate_examples = dspy.ChainOfThought("repo_info -> usage_examples")
+        self.generate_examples.set_lm(self.code_analyzer_lm)
+        self.generate_llms_txt = dspy.ChainOfThought(GenerateLLMsTxt)
+        self.generate_llms_txt.set_lm(self.llms_txt_generator_lm)
+
+    def forward(
+        self,
+        repo_url: str,
+        output_file: str = "llms.txt",
+        github_token: Optional[str] = None,
+    ):
+        """
+        Analyze repository and generate llms.txt content.
+
+        This method handles the complete pipeline:
+        1. Fetches repository information from GitHub
+        2. Analyzes repository structure and purpose
+        3. Generates llms.txt documentation
+        4. Writes the result to a file
+
+        Args:
+            repo_url: GitHub repository URL (e.g., https://github.com/stanfordnlp/dspy)
+            output_file: Path to output file (default: llms.txt)
+            github_token: Optional GitHub access token (defaults to the GITHUB_ACCESS_TOKEN env var)
+
+        Returns:
+            dspy.Prediction with llms_txt_content, analysis, and structure
+        """
+        print(f"Fetching repository information from {repo_url}...")
+        file_tree, readme_content, package_files = gather_repository_info(
+            repo_url, token=github_token or os.environ.get("GITHUB_ACCESS_TOKEN")
+        )
+
+        print("Analyzing repository structure and purpose...")
+        repo_analysis = self.analyze_repo(
+            repo_url=repo_url, file_tree=file_tree, readme_content=readme_content
+        )
+
+        print("Analyzing code structure...")
+        structure_analysis = self.analyze_structure(
+            file_tree=file_tree, package_files=package_files
+        )
+
+        print("Generating usage examples...")
+        usage_examples = self.generate_examples(
+            repo_info=f"Purpose: {repo_analysis.project_purpose}\nConcepts: {repo_analysis.key_concepts}"
+        )
+
+        print("Generating llms.txt content...")
+        llms_txt = self.generate_llms_txt(
+            project_purpose=repo_analysis.project_purpose,
+            key_concepts=repo_analysis.key_concepts,
+            architecture_overview=repo_analysis.architecture_overview,
+            important_directories=structure_analysis.important_directories,
+            entry_points=structure_analysis.entry_points,
+            development_info=structure_analysis.development_info,
+            usage_examples=usage_examples.usage_examples,
+        )
+
+        with open(output_file, "w") as f:
+            f.write(llms_txt.llms_txt_content)
+
+        print(f"\nGenerated llms.txt saved to: {output_file}")
+        print(f"\nPreview (first 500 characters):")
+        print(llms_txt.llms_txt_content[:500] + "...")
+
+        return dspy.Prediction(
+            llms_txt_content=llms_txt.llms_txt_content,
+            analysis=repo_analysis,
+            structure=structure_analysis,
+        )
+
+
diff --git a/src/generator.py b/src/generator.py
new file mode 100644
index 0000000..003482a
--- /dev/null
+++ b/src/generator.py
@@ -0,0 +1,56 @@
+from typing import Optional
+
+from dotenv import load_dotenv
+
+from .analyzer import RepositoryAnalyzer, RepositoryAnalyzerConfig
+
+load_dotenv()
+
+
+def generate_llms_txt(
+    repo_url: str,
+    output_file: str = "llms.txt",
+    model: str = "gpt-4o",
+    github_token: Optional[str] = None,
+) -> str:
+    """
+    Generate llms.txt file for a given repository.
+
+    This is a convenience wrapper around RepositoryAnalyzer that handles
+    configuration and invokes the analyzer's forward method.
+
+    Args:
+        repo_url: GitHub repository URL (e.g., https://github.com/stanfordnlp/dspy)
+        output_file: Path to output file (default: llms.txt)
+        model: Language model to use for analysis (default: gpt-4o)
+        github_token: GitHub access token (optional, can also use GITHUB_ACCESS_TOKEN env var)
+
+    Returns:
+        The generated llms.txt content
+
+    Example:
+        >>> result = generate_llms_txt("https://github.com/stanfordnlp/dspy")
+        >>> print(result[:100])
+    """
+    # Create analyzer configuration
+    config = RepositoryAnalyzerConfig(
+        repo_analyzer_model=model,
+        code_analyzer_model=model,
+        llms_txt_generator_model=model,
+    )
+
+    # Create analyzer instance
+    analyzer = RepositoryAnalyzer(config=config)
+
+    # Run the complete pipeline
+    result = analyzer(
+        repo_url=repo_url,
+        output_file=output_file,
+        github_token=github_token,
+    )
+
+    return result.llms_txt_content
+
+
+if __name__ == "__main__":
+    generate_llms_txt("https://github.com/stanfordnlp/dspy")
diff --git a/src/github_utils.py b/src/github_utils.py
new file mode 100644
index 0000000..4dd0708
--- /dev/null
+++ b/src/github_utils.py
@@ -0,0 +1,126 @@
+import base64
+import os
+from typing import Optional
+from dotenv import load_dotenv
+
+import requests
+
+load_dotenv()
+
+
+def get_github_file_tree(repo_url: str, token: Optional[str] = None) -> str:
+    """
+    Get repository file structure from GitHub API.
+
+    Args:
+        repo_url: GitHub repository URL (e.g., https://github.com/owner/repo)
+        token: Optional GitHub access token for authentication
+
+    Returns:
+        Newline-separated list of file paths in the repository
+
+    Raises:
+        Exception: If the API request fails
+    """
+    # extract owner/repo from URL
+    parts = repo_url.rstrip("/").split("/")
+    owner, repo = parts[-2], parts[-1]
+
+    # try both main and master branches
+    for branch in ["main", "master"]:
+        api_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
+        headers = {}
+
+        if token:
+            headers["Authorization"] = f"Bearer {token}"
+        elif os.environ.get("GITHUB_ACCESS_TOKEN"):
+            headers["Authorization"] = f"Bearer {os.environ.get('GITHUB_ACCESS_TOKEN')}"
+
+        response = requests.get(api_url, headers=headers)
+
+        if response.status_code == 200:
+            tree_data = response.json()
+            file_paths = [
+                item["path"] for item in tree_data["tree"] if item["type"] == "blob"
+            ]
+            return "\n".join(sorted(file_paths))
+        elif response.status_code != 404:
+            raise Exception(
+                f"Failed to fetch repository tree: {response.status_code} - {response.text}"
+            )
+
+    raise Exception(f"Could not fetch repository tree. Tried branches: main, master")
+
+
+def get_github_file_content(
+    repo_url: str, file_path: str, token: Optional[str] = None
+) -> str:
+    """
+    Get specific file content from GitHub.
+
+    Args:
+        repo_url: GitHub repository URL
+        file_path: Path to the file within the repository
+        token: Optional GitHub access token for authentication
+
+    Returns:
+        Content of the file as a string
+    """
+    parts = repo_url.rstrip("/").split("/")
+    owner, repo = parts[-2], parts[-1]
+
+    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}"
+    headers = {}
+
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+    elif os.environ.get("GITHUB_ACCESS_TOKEN"):
+        headers["Authorization"] = f"Bearer {os.environ.get('GITHUB_ACCESS_TOKEN')}"
+
+    response = requests.get(api_url, headers=headers)
+
+    if response.status_code == 200:
+        content = base64.b64decode(response.json()["content"]).decode("utf-8")
+        return content
+    else:
+        return f"Could not fetch {file_path}"
+
+
+def gather_repository_info(
+    repo_url: str, token: Optional[str] = None
+) -> tuple[str, str, str]:
+    """
+    Gather all necessary repository information.
+
+    Args:
+        repo_url: GitHub repository URL
+        token: Optional GitHub access token for authentication
+
+    Returns:
+        Tuple of (file_tree, readme_content, package_files_content)
+    """
+    file_tree = get_github_file_tree(repo_url, token)
+    readme_content = get_github_file_content(repo_url, "README.md", token)
+
+    # get key package files
+    package_files = []
+    for file_path in [
+        "pyproject.toml",
+        "setup.py",
+        "requirements.txt",
+        "package.json",
+        "Cargo.toml",
+        "go.mod",
+    ]:
+        try:
+            content = get_github_file_content(repo_url, file_path, token)
+            if "Could not fetch" not in content:
+                package_files.append(f"=== {file_path} ===\n{content}")
+        except Exception:
+            continue
+
+    package_files_content = (
+        "\n\n".join(package_files) if package_files else "No package files found"
+    )
+
+    return file_tree, readme_content, package_files_content
diff --git a/src/signatures.py b/src/signatures.py
new file mode 100644
index 0000000..7d4a97c
--- /dev/null
+++ b/src/signatures.py
@@ -0,0 +1,52 @@
+import dspy
+
+
+class AnalyzeRepository(dspy.Signature):
+    """Analyze a repository structure and identify key components."""
+
+    repo_url: str = dspy.InputField(desc="GitHub repository URL")
+    file_tree: str = dspy.InputField(desc="Repository file structure")
+    readme_content: str = dspy.InputField(desc="README.md content")
+
+    project_purpose: str = dspy.OutputField(
+        desc="Main purpose and goals of the project"
+    )
+    key_concepts: list[str] = dspy.OutputField(
+        desc="List of important concepts and terminology"
+    )
+    architecture_overview: str = dspy.OutputField(
+        desc="High-level architecture description"
+    )
+
+
+class AnalyzeCodeStructure(dspy.Signature):
+    """Analyze code structure to identify important directories and files."""
+
+    file_tree: str = dspy.InputField(desc="Repository file structure")
+    package_files: str = dspy.InputField(desc="Key package and configuration files")
+
+    important_directories: list[str] = dspy.OutputField(
+        desc="Key directories and their purposes"
+    )
+    entry_points: list[str] = dspy.OutputField(
+        desc="Main entry points and important files"
+    )
+    development_info: str = dspy.OutputField(
+        desc="Development setup and workflow information"
+    )
+
+
+class GenerateLLMsTxt(dspy.Signature):
+    """Generate a comprehensive llms.txt file from analyzed repository information."""
+
+    project_purpose: str = dspy.InputField()
+    key_concepts: list[str] = dspy.InputField()
+    architecture_overview: str = dspy.InputField()
+    important_directories: list[str] = dspy.InputField()
+    entry_points: list[str] = dspy.InputField()
+    development_info: str = dspy.InputField()
+    usage_examples: str = dspy.InputField(desc="Common usage patterns and examples")
+
+    llms_txt_content: str = dspy.OutputField(
+        desc="Complete llms.txt file content following the standard format"
+    )
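Usage sketch for the pipeline introduced above: the call below mirrors the __main__ block in src/generator.py and assumes an OPENAI_API_KEY is available (in the environment or a .env file picked up by load_dotenv) for the gpt-4o calls made through dspy; a GITHUB_ACCESS_TOKEN is optional, for private repositories or higher API rate limits.

    # minimal sketch; OPENAI_API_KEY (and optionally GITHUB_ACCESS_TOKEN) are
    # assumed to be set in the environment or in a local .env file
    from src.generator import generate_llms_txt

    # analyze a public repository and write llms.txt to the working directory
    content = generate_llms_txt(
        "https://github.com/stanfordnlp/dspy",
        output_file="llms.txt",
    )
    print(content[:200])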