llm_fragments_repomix.py

  1from typing import List, Dict, Tuple
  2import llm
  3import os
  4import pathlib
  5import subprocess
  6import tempfile
  7import shutil
  8
  9
 10def parse_fragment_string(fragment_string: str) -> Tuple[str, Dict[str, str]]:
 11    """
 12    Parse a fragment string into URL and arguments
 13    
 14    Format: url:arg1:arg2=value:arg3
 15    Returns: (url, {arg1: True, arg2: "value", arg3: True})
 16    """
 17    # Define known repomix flags to detect where arguments start
 18    known_flags = {
 19        "compress", "remove-comments", "remove-empty-lines", 
 20        "output-show-line-numbers", "no-file-summary", 
 21        "no-directory-structure", "no-files", 
 22        "include-empty-directories", "no-git-sort-by-changes",
 23        "include-diffs", "no-gitignore", "no-default-patterns",
 24        "no-security-check", "verbose", "quiet", "style"
 25    }
 26    
 27    # Split on colons to get all parts
 28    parts = fragment_string.split(":")
 29    if len(parts) < 1:
 30        raise ValueError("Invalid fragment string format")
 31    
 32    # Handle different URL formats
 33    if fragment_string.startswith("https://"):
 34        # Find where arguments start
 35        # parts = ['https', '//github.com/user/repo', 'compress']
 36        arg_start_idx = None
 37        for i, part in enumerate(parts):
 38            if i <= 1:  # Skip protocol parts (https, //domain/path)
 39                continue
 40            if "=" in part or part in known_flags:
 41                arg_start_idx = i
 42                break
 43        
 44        if arg_start_idx is None:
 45            url = fragment_string
 46            arg_parts = []
 47        else:
 48            # Reconstruct URL by joining the parts before arguments
 49            url_parts = parts[:arg_start_idx]
 50            url = ":".join(url_parts)
 51            arg_parts = parts[arg_start_idx:]
 52            
 53    elif fragment_string.startswith("ssh://"):
 54        # Similar logic for SSH URLs
 55        # parts = ['ssh', '//git@github.com', 'user/repo.git', 'compress']
 56        arg_start_idx = None
 57        for i, part in enumerate(parts):
 58            if i <= 1:  # Skip protocol parts (ssh, //domain)
 59                continue
 60            if "=" in part or part in known_flags:
 61                arg_start_idx = i
 62                break
 63        
 64        if arg_start_idx is None:
 65            url = fragment_string
 66            arg_parts = []
 67        else:
 68            url_parts = parts[:arg_start_idx]
 69            url = ":".join(url_parts)
 70            arg_parts = parts[arg_start_idx:]
 71            
 72    elif fragment_string.startswith("git@"):
 73        # git@host:path format - need to be careful about colons
 74        # Look for arguments after the repo path
 75        arg_start_idx = None
 76        
 77        # Skip the first colon (after hostname) when looking for arguments
 78        for i, part in enumerate(parts):
 79            if i <= 1:  # Skip git@host and first path part
 80                continue
 81            if "=" in part or part in known_flags:
 82                arg_start_idx = i
 83                break
 84        
 85        if arg_start_idx is None:
 86            url = fragment_string
 87            arg_parts = []
 88        else:
 89            url_parts = parts[:arg_start_idx]
 90            url = ":".join(url_parts)
 91            arg_parts = parts[arg_start_idx:]
 92    else:
 93        # No protocol prefix, assume simple format
 94        url = parts[0]
 95        arg_parts = parts[1:]
 96    
 97    # Parse arguments
 98    args = {}
 99    for arg in arg_parts:
100        if arg and "=" in arg:
101            key, value = arg.split("=", 1)
102            args[key] = value
103        elif arg:
104            args[arg] = True
105    
106    return url, args
107
108
def build_repomix_command(repo_path: str, args: Dict[str, str]) -> List[str]:
    """
    Build the repomix command line for a repository and parsed arguments.

    Args:
        repo_path: Path to the repository; appended as the last element.
        args: Mapping of argument name (without leading dashes) to either
            ``True`` for a bare flag or a string for a ``key=value``
            argument. Names that are not supported are ignored, as are
            empty/falsy values.

    Returns:
        The command as a list, starting with ``["repomix", "--stdout"]``,
        followed by the translated options in ``args`` order, then
        ``repo_path``.

    Raises:
        ValueError: If an option that needs a value is given as a bare flag,
            or a boolean flag is given a value. Emitting either form would
            shift the positional repository path (a bare ``--style`` would
            swallow ``repo_path`` as its value; ``--compress yes`` would add
            a stray positional argument).
    """
    # Flags that take no value
    boolean_flags = {
        "compress": "--compress",
        "remove-comments": "--remove-comments",
        "remove-empty-lines": "--remove-empty-lines",
        "output-show-line-numbers": "--output-show-line-numbers",
        "no-file-summary": "--no-file-summary",
        "no-directory-structure": "--no-directory-structure",
        "no-files": "--no-files",
        "include-empty-directories": "--include-empty-directories",
        "no-git-sort-by-changes": "--no-git-sort-by-changes",
        "include-diffs": "--include-diffs",
        "no-gitignore": "--no-gitignore",
        "no-default-patterns": "--no-default-patterns",
        "no-security-check": "--no-security-check",
        "verbose": "--verbose",
        "quiet": "--quiet",
    }

    # Options that must be followed by a value
    value_options = {
        "style": "--style",
        "include": "--include",
        "ignore": "--ignore",
        "header-text": "--header-text",
        "instruction-file-path": "--instruction-file-path",
        "token-count-encoding": "--token-count-encoding",
        "top-files-len": "--top-files-len",
    }

    cmd = ["repomix", "--stdout"]

    for arg, value in args.items():
        if arg in boolean_flags:
            if value is True:
                cmd.append(boolean_flags[arg])
            elif value:
                raise ValueError(
                    f"Option '{arg}' is a flag and does not take a value - got: {value}"
                )
            # Empty values are skipped
        elif arg in value_options:
            if value is True:
                raise ValueError(
                    f"Option '{arg}' requires a value, e.g. {arg}=VALUE"
                )
            if value:
                cmd.extend([value_options[arg], value])
            # Empty values are skipped
        # Unsupported argument names are ignored

    # The repository path is always the final positional argument
    cmd.append(repo_path)

    return cmd
164
165
@llm.hookimpl
def register_fragment_loaders(register):
    """
    Plugin hook: register ``repomix_loader`` under the ``repomix`` name.

    Args:
        register: Callable supplied by the hook caller; invoked with the
            loader name and the loader function.
    """
    loader_name = "repomix"
    register(loader_name, repomix_loader)
169
170
def repomix_loader(argument: str) -> List[llm.Fragment]:
    """
    Load repository contents as fragments using Repomix

    The repository is shallow-cloned into a temporary directory, repomix is
    run on the clone with ``--stdout``, and its output is returned as a
    single fragment. The temporary directory is removed afterwards.

    Argument can be:
    - A git repository URL: https://git.sr.ht/~amolith/willow
    - URL with arguments: https://git.sr.ht/~amolith/willow:compress:include=*.py

    Examples:
        repomix:https://git.sr.ht/~amolith/willow
        repomix:ssh://git.sr.ht:~amolith/willow:compress
        repomix:git@github.com:user/repo.git:include=*.ts,*.js:ignore=*.md

    Args:
        argument: Fragment string, i.e. the repository URL optionally
            followed by colon-separated repomix arguments.

    Returns:
        A one-element list holding a fragment whose content is repomix's
        stdout and whose source is ``repomix:<argument>``.

    Raises:
        ValueError: If the URL does not start with ``https://``, ``ssh://``
            or ``git@``; if the ``repomix`` executable is not on PATH; or if
            cloning, running repomix, or any other step fails. Failures from
            inside the clone/run step carry the original exception as
            ``__cause__``.
    """
    # Parse the fragment string to extract URL and arguments
    url, args = parse_fragment_string(argument)

    if not url.startswith(("https://", "ssh://", "git@")):
        raise ValueError(
            f"Repository URL must start with https://, ssh://, or git@ - got: {url}"
        )

    # Check if repomix is available
    if not shutil.which("repomix"):
        raise ValueError(
            "repomix command not found. Please install repomix first: "
            "https://github.com/yamadashy/repomix"
        )

    # Create a temporary directory for the cloned repository
    with tempfile.TemporaryDirectory() as temp_dir:
        repo_path = pathlib.Path(temp_dir) / "repo"

        try:
            # Clone the repository (shallow: only the latest commit is needed)
            subprocess.run(
                ["git", "clone", "--depth=1", url, str(repo_path)],
                check=True,
                capture_output=True,
                text=True,
            )

            # Build repomix command with arguments
            repomix_cmd = build_repomix_command(str(repo_path), args)

            # Run repomix on the cloned repository
            result = subprocess.run(
                repomix_cmd,
                check=True,
                capture_output=True,
                text=True,
            )

            # Create a single fragment with the repomix output
            return [
                llm.Fragment(
                    content=result.stdout,
                    source=f"repomix:{argument}"
                )
            ]

        except subprocess.CalledProcessError as e:
            # Identify the failing program by the executable name only.
            # Searching the whole command string is wrong: repomix options
            # such as --no-gitignore contain "git" and would be misreported
            # as a clone failure.
            if isinstance(e.cmd, (list, tuple)) and e.cmd:
                program = e.cmd[0]
            else:
                program = str(e.cmd)

            if program == "git":
                raise ValueError(
                    f"Failed to clone repository {url}: {e.stderr}"
                ) from e
            elif program == "repomix":
                raise ValueError(
                    f"Failed to run repomix on {url}: {e.stderr}"
                ) from e
            else:
                raise ValueError(
                    f"Command failed: {e.stderr}"
                ) from e
        except Exception as e:
            # Handle other errors (e.g. git executable missing), keeping the
            # original exception attached for debugging
            raise ValueError(
                f"Error processing repository {argument}: {str(e)}"
            ) from e