1"""
This script analyzes all the highlights.scm files in our embedded languages and extensions.
It counts how many times each unique @{name} capture occurs and records the languages in which it is used.
4
This is useful when adding new languages: it helps avoid accidentally introducing new tags when appropriate ones already exist.
6
7Flags:
-v, --verbose: Include a detailed list of languages for each tag found in the highlights.scm files.
9"""
10
11from collections import defaultdict
12from pathlib import Path
13from typing import Any
14import argparse
15import re
16
# Matches capture names such as "@name" or "@a.b": an "@" followed by one or
# more ASCII letters, underscores, or dots. The negative lookahead skips names
# whose first character after "@" is "_". Digits and hyphens are not part of
# the character class, so a match ends at the first one encountered.
pattern = re.compile(r'@(?!_)[a-zA-Z_.]+')
18
def parse_arguments():
    """Parse the script's command-line flags.

    Returns:
        The ``argparse.Namespace`` built from the process arguments. Its
        ``verbose`` attribute is ``True`` when ``-v``/``--verbose`` is passed
        and ``False`` otherwise.
    """
    cli = argparse.ArgumentParser(
        description='Analyze highlight.scm files for unique instances and their languages.',
    )
    cli.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='Include a list of languages for each tag.',
    )
    options = cli.parse_args()
    return options
23
def find_highlight_files(root_dir):
    """Lazily yield every ``highlights.scm`` file found beneath ``root_dir``.

    Args:
        root_dir: Directory (string or path-like) to search recursively.

    Yields:
        ``pathlib.Path`` objects, one per file named ``highlights.scm``.
    """
    search_root = Path(root_dir)
    yield from search_root.rglob('highlights.scm')
27
def count_instances(files):
    """Tally capture-name occurrences across a collection of query files.

    Args:
        files: Iterable of ``pathlib.Path`` objects to scan. The name of each
            file's parent directory is used as that file's language.

    Returns:
        A ``defaultdict`` keyed by the matched capture name (including the
        leading ``@``). Each value is a dict with two entries: ``'count'``,
        the total number of matches across all files, and ``'languages'``,
        the set of parent-directory names of the files containing a match.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Keys are the matched strings themselves, hence ``str``.
    instances: defaultdict[str, dict[str, Any]] = defaultdict(
        lambda: {'count': 0, 'languages': set()}
    )
    for file_path in files:
        language = file_path.parent.name
        # Decode explicitly as UTF-8: the default for open() depends on the
        # platform locale, so results could otherwise vary between machines.
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        for match in pattern.findall(text):
            entry = instances[match]
            entry['count'] += 1
            entry['languages'].add(language)
    return instances
39
def print_instances(instances, verbose=False):
    """Print one line per tag, ordered by tag name.

    Args:
        instances: Mapping of tag name to a dict holding ``'count'`` and
            ``'languages'`` (an iterable of language names).
        verbose: When true, append the sorted, comma-separated language list
            in square brackets after the count.
    """
    for tag in sorted(instances):
        info = instances[tag]
        language_list = ', '.join(sorted(info['languages']))
        if not verbose:
            print(f"{tag} ({info['count']})")
            continue
        print(f"{tag} ({info['count']}) - [{language_list}]")
47
def main():
    """Print tag counts for the core languages, then tags found only in extensions.

    Core tags are printed under the ``Shared:`` heading. Tags that occur in
    extension files but in no core file are printed under ``Extension-only:``;
    that section is omitted when there are none.
    """
    verbose = parse_arguments().verbose

    # Both search roots are resolved relative to the directory one level
    # above the folder that contains this script.
    root = Path(__file__).parent.parent
    core_instances = count_instances(find_highlight_files(root / 'crates/languages/src'))
    extension_instances = count_instances(find_highlight_files(root / 'extensions/'))

    print('Shared:\n')
    print_instances(core_instances, verbose)

    # Keep only the extension tags that never appear in a core file.
    extension_only = {}
    for tag, details in extension_instances.items():
        if tag not in core_instances:
            extension_only[tag] = details

    if not extension_only:
        return

    print('\nExtension-only:\n')
    print_instances(extension_only, verbose)
66
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()