1from typing import List, Dict, Tuple
2import llm
3import os
4import pathlib
5import subprocess
6import tempfile
7import shutil
8
9
# Bare (value-less) tokens that mark where the argument list begins.
# Tokens containing "=" are always treated as arguments, so only names that
# may appear without a value need to be listed here.
_BARE_ARGUMENT_FLAGS = frozenset({
    "compress", "remove-comments", "remove-empty-lines",
    "output-show-line-numbers", "no-file-summary",
    "no-directory-structure", "no-files",
    "include-empty-directories", "no-git-sort-by-changes",
    "include-diffs", "no-gitignore", "no-default-patterns",
    "no-security-check", "verbose", "quiet", "style",
})

# URL forms whose own syntax contains colons.
_REMOTE_URL_PREFIXES = ("https://", "ssh://", "git@")


def parse_fragment_string(fragment_string: str) -> Tuple[str, Dict[str, str]]:
    """
    Parse a fragment string into URL and arguments

    Format: url:arg1:arg2=value:arg3
    Returns: (url, {arg1: True, arg2: "value", arg3: True})

    The string is split on ":". For strings starting with "https://",
    "ssh://" or "git@" the first two colon-separated parts always belong to
    the URL (scheme and host, or "git@host" and the first path part); the
    argument list starts at the first later part that contains "=" or is a
    known bare flag, and everything before it is re-joined into the URL.
    For any other string the first part is the URL and all remaining parts
    are arguments.

    Args:
        fragment_string: The raw loader argument.

    Returns:
        A (url, args) tuple. "key=value" arguments map key to the string
        value (split on the first "="); bare arguments map to True, so the
        dictionary values are not exclusively strings. Empty parts (for
        example from "a::b") are ignored.
    """
    parts = fragment_string.split(":")

    if fragment_string.startswith(_REMOTE_URL_PREFIXES):
        # Parts 0 and 1 are never arguments, so scanning starts at index 2.
        # When nothing looks like an argument the whole string is the URL.
        args_start = next(
            (
                idx
                for idx in range(2, len(parts))
                if "=" in parts[idx] or parts[idx] in _BARE_ARGUMENT_FLAGS
            ),
            len(parts),
        )
    else:
        # No recognised prefix: simple "url:arg:arg" format.
        args_start = 1

    url = ":".join(parts[:args_start])

    args = {}
    for token in parts[args_start:]:
        if not token:
            continue
        key, has_value, value = token.partition("=")
        args[key] = value if has_value else True

    return url, args
107
108
def build_repomix_command(repo_path: str, args: Dict[str, str]) -> List[str]:
    """
    Build repomix command with arguments

    Args:
        repo_path: Path to the repository
        args: Dictionary of arguments. A value of True means the argument
            was given bare (no "=value"); a string is the supplied value.

    Returns:
        List of command parts: "repomix", "--stdout", the translated
        arguments in the order they appear in ``args``, then ``repo_path``.
        Arguments that are not in the supported table are ignored.
    """
    cmd = ["repomix", "--stdout"]

    # Map of supported arguments to their command-line flags
    supported_args = {
        "compress": "--compress",
        "style": "--style",
        "include": "--include",
        "ignore": "--ignore",
        "remove-comments": "--remove-comments",
        "remove-empty-lines": "--remove-empty-lines",
        "output-show-line-numbers": "--output-show-line-numbers",
        "no-file-summary": "--no-file-summary",
        "no-directory-structure": "--no-directory-structure",
        "no-files": "--no-files",
        "header-text": "--header-text",
        "instruction-file-path": "--instruction-file-path",
        "include-empty-directories": "--include-empty-directories",
        "no-git-sort-by-changes": "--no-git-sort-by-changes",
        "include-diffs": "--include-diffs",
        "no-gitignore": "--no-gitignore",
        "no-default-patterns": "--no-default-patterns",
        "no-security-check": "--no-security-check",
        "token-count-encoding": "--token-count-encoding",
        "top-files-len": "--top-files-len",
        "verbose": "--verbose",
        "quiet": "--quiet",
    }

    # Options that repomix expects to be followed by a value.
    value_options = {
        "style",
        "include",
        "ignore",
        "header-text",
        "instruction-file-path",
        "token-count-encoding",
        "top-files-len",
    }

    for name, value in args.items():
        flag = supported_args.get(name)
        if flag is None:
            continue

        if value is True:
            if name in value_options:
                # A value option given bare would swallow the next token
                # (the repository path) as its value, so drop it, the same
                # way an empty "name=" value is dropped.
                continue
            cmd.append(flag)
        elif value:
            # Non-empty string value; empty strings are skipped.
            cmd.extend([flag, value])

    # The repository path is always the final positional argument.
    cmd.append(repo_path)

    return cmd
164
165
@llm.hookimpl
def register_fragment_loaders(register):
    """Register ``repomix_loader`` as the loader for the ``repomix`` prefix."""
    loader_prefix = "repomix"
    register(loader_prefix, repomix_loader)
169
170
def repomix_loader(argument: str) -> List[llm.Fragment]:
    """
    Load repository contents as fragments using Repomix

    The repository is shallow-cloned (``git clone --depth=1``) into a
    temporary directory, ``repomix --stdout`` is run on the clone, and the
    captured standard output becomes the content of a single fragment.
    The temporary directory is removed when the function returns or raises.

    Argument can be:
    - A git repository URL: https://git.sr.ht/~amolith/willow
    - URL with arguments: https://git.sr.ht/~amolith/willow:compress:include=*.py

    Examples:
        repomix:https://git.sr.ht/~amolith/willow
        repomix:ssh://git.sr.ht:~amolith/willow:compress
        repomix:git@github.com:user/repo.git:include=*.ts,*.js:ignore=*.md

    Returns:
        A one-element list holding a fragment whose source is
        ``repomix:<argument>``.

    Raises:
        ValueError: If the URL does not start with https://, ssh:// or
            git@, if the repomix executable is not on PATH, if git or
            repomix exits with a non-zero status, or if any other error
            occurs while processing the repository.
    """
    # Parse the fragment string to extract URL and arguments
    url, args = parse_fragment_string(argument)

    if not url.startswith(("https://", "ssh://", "git@")):
        raise ValueError(
            f"Repository URL must start with https://, ssh://, or git@ - got: {url}"
        )

    # Check if repomix is available
    if not shutil.which("repomix"):
        raise ValueError(
            "repomix command not found. Please install repomix first: "
            "https://github.com/yamadashy/repomix"
        )

    # Create a temporary directory for the cloned repository
    with tempfile.TemporaryDirectory() as temp_dir:
        repo_path = pathlib.Path(temp_dir) / "repo"

        try:
            # Clone the repository
            subprocess.run(
                ["git", "clone", "--depth=1", url, str(repo_path)],
                check=True,
                capture_output=True,
                text=True,
            )

            # Run repomix on the cloned repository
            result = subprocess.run(
                build_repomix_command(str(repo_path), args),
                check=True,
                capture_output=True,
                text=True,
            )

            # Create a single fragment with the repomix output
            return [
                llm.Fragment(
                    content=result.stdout,
                    source=f"repomix:{argument}",
                )
            ]

        except subprocess.CalledProcessError as e:
            # Identify the failing program by its executable name only.
            # Searching the whole command line for "git" would blame git
            # for repomix failures whenever a repomix argument contains
            # "git" (e.g. --no-gitignore or an include pattern).
            if e.cmd[0] == "git":
                raise ValueError(
                    f"Failed to clone repository {url}: {e.stderr}"
                ) from e
            raise ValueError(
                f"Failed to run repomix on {url}: {e.stderr}"
            ) from e
        except Exception as e:
            # Any other failure (e.g. git executable missing) is reported
            # as a ValueError, keeping the original cause attached.
            raise ValueError(
                f"Error processing repository {argument}: {e}"
            ) from e