1#!/usr/bin/env python3
 2"""
This script analyzes all the highlights.scm files in our embedded languages and extensions.
It counts the occurrences of each unique @{name} capture and records the languages in which it is used.
 5
This helps avoid accidentally introducing a new tag for a new language when an appropriate tag already exists.
 7
 8Flags:
-v, --verbose: Include a detailed list of languages for each tag found in the highlights.scm files.
10"""
11
12from collections import defaultdict
13from pathlib import Path
14from typing import Any
15import argparse
16import re
17
# Matches capture names such as `@function` or `@string.escape`: an `@`
# followed by one or more ASCII letters, underscores, or dots. The negative
# lookahead `(?!_)` rejects names whose first character is an underscore.
# Any other character (digit, hyphen, whitespace, ...) ends the match.
pattern = re.compile(r'@(?!_)[a-zA-Z_.]+')
19
def parse_arguments():
    """Parse the command line and return the resulting namespace.

    The only option is ``-v`` / ``--verbose``, a boolean flag that is
    ``False`` unless given.
    """
    arg_parser = argparse.ArgumentParser(
        description='Analyze highlight.scm files for unique instances and their languages.',
    )
    arg_parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='Include a list of languages for each tag.',
    )
    namespace = arg_parser.parse_args()
    return namespace
24
def find_highlight_files(root_dir):
    """Lazily yield every ``highlights.scm`` path found beneath ``root_dir``.

    The search is recursive, so files at any depth are included. Paths are
    produced in whatever order ``Path.rglob`` returns them.
    """
    search_root = Path(root_dir)
    yield from search_root.rglob('highlights.scm')
28
def count_instances(files):
    """Tally every capture name found in the given query files.

    Args:
        files: Iterable of ``pathlib.Path`` objects to scan. The name of each
            file's parent directory is recorded as its language.

    Returns:
        A ``defaultdict`` mapping each matched name (e.g. ``@keyword``) to
        ``{'count': int, 'languages': set[str]}``. ``count`` is the total
        number of matches across all files; ``languages`` holds the
        parent-directory names of the files in which the name appeared.

    Raises:
        OSError: If a file cannot be opened or read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Keys are the matched strings themselves, hence ``str``.
    instances: defaultdict[str, dict[str, Any]] = defaultdict(lambda: {'count': 0, 'languages': set()})
    for file_path in files:
        language = file_path.parent.name
        # Decode explicitly as UTF-8 so results do not depend on the platform's
        # locale encoding (which may reject or mis-decode non-ASCII text).
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        for match in pattern.findall(text):
            entry = instances[match]
            entry['count'] += 1
            entry['languages'].add(language)
    return instances
40
def print_instances(instances, verbose=False):
    """Print one line per tag, ordered by tag name.

    Each line has the form ``<tag> (<count>)``. When ``verbose`` is true, the
    line is extended with `` - [<languages>]``, where the languages are
    sorted and comma-separated.
    """
    for tag in sorted(instances):
        info = instances[tag]
        language_list = ', '.join(sorted(info['languages']))
        line = f"{tag} ({info['count']})"
        if verbose:
            line += f" - [{language_list}]"
        print(line)
48
def main():
    """Scan the core and extension directories and print the tag report.

    Both directories are resolved relative to the grandparent directory of
    this script. All tags found under ``crates/languages/src`` are printed
    under the "Shared" heading. Tags found under ``extensions/`` that do not
    occur in the core set are printed under "Extension-only"; that section is
    omitted when there are none.
    """
    options = parse_arguments()

    root = Path(__file__).parent.parent
    core_path = root / 'crates/languages/src'
    extension_path = root / 'extensions/'

    core_instances = count_instances(find_highlight_files(core_path))
    extension_instances = count_instances(find_highlight_files(extension_path))

    # Keep only the extension tags that the core languages never use.
    extension_only = {}
    for tag, details in extension_instances.items():
        if tag in core_instances:
            continue
        extension_only[tag] = details

    print('Shared:\n')
    print_instances(core_instances, options.verbose)

    if not extension_only:
        return

    print('\nExtension-only:\n')
    print_instances(extension_only, options.verbose)
67
# Run the report only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()