analyze_highlights.py

 1#!/usr/bin/env python3
 2"""
 3This script analyzes all the highlights.scm files in our embedded languages and extensions.
 4It counts the number of unique instances of @{name} and the languages in which they are used.
 5
 6This is useful to help avoid accidentally introducing new tags when appropriate ones already exist when adding new languages.
 7
 8Flags:
 9-v, --verbose: Include a detailed list of languages for each tag found in the highlights.scm files.
10"""
11
12import argparse
13import re
14from collections import defaultdict
15from pathlib import Path
16from typing import Any
17
# Matches an '@' followed by one or more ASCII letters, underscores or dots
# (e.g. "@function.builtin"). The negative lookahead rejects names whose first
# character after '@' is an underscore, so "@_name" is not counted.
pattern = re.compile(r'@(?!_)[a-zA-Z_.]+')
19
def parse_arguments():
    """Parse this script's command-line flags.

    Returns:
        The argparse namespace. Its ``verbose`` attribute is True when
        ``-v``/``--verbose`` was given and False otherwise.
    """
    cli = argparse.ArgumentParser(
        description='Analyze highlights.scm files for unique instances and their languages.',
    )
    cli.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='Include a list of languages for each tag.',
    )
    namespace = cli.parse_args()
    return namespace
24
def find_highlight_files(root_dir):
    """Lazily yield every file named ``highlights.scm`` below ``root_dir``.

    The search is recursive, so files in nested subdirectories are included.
    Paths are yielded as ``pathlib.Path`` objects in the order ``rglob``
    produces them.
    """
    search_root = Path(root_dir)
    yield from search_root.rglob('highlights.scm')
28
def count_instances(files):
    """Count capture-name occurrences across the given highlight files.

    Args:
        files: Iterable of path objects pointing at highlights.scm files.
            The name of each file's parent directory is used as the
            language name.

    Returns:
        A defaultdict keyed by the matched capture name (the full text
        matched by the module-level ``pattern``, including the leading
        ``@``). Each value is a dict with two entries: ``'count'``, the
        total number of matches across all files, and ``'languages'``, the
        set of language names in whose files the name was found.
    """
    instances: defaultdict[str, dict[str, Any]] = defaultdict(lambda: {'count': 0, 'languages': set()})
    for file_path in files:
        language = file_path.parent.name
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding (the query files are presumably
        # UTF-8 -- confirm if any use another encoding).
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
        # The file is already closed here; matching only needs the text.
        for match in pattern.findall(text):
            entry = instances[match]
            entry['count'] += 1
            entry['languages'].add(language)
    return instances
40
def print_instances(instances, verbose=False):
    """Print one line per tag, ordered by tag name.

    Args:
        instances: Mapping from tag name to a dict holding ``'count'`` and
            ``'languages'`` (an iterable of language names).
        verbose: When true, append the sorted, comma-separated language list
            in square brackets after the count.
    """
    for tag in sorted(instances):
        info = instances[tag]
        language_list = ', '.join(sorted(info['languages']))
        line = f"{tag} ({info['count']})"
        if verbose:
            line += f" - [{language_list}]"
        print(line)
48
def main():
    """Scan the core and extension highlight files and print the tag report.

    The directory two levels above this script is used as the base. Tags
    found under ``crates/languages/src`` are printed first; tags found under
    ``extensions/`` that do not occur in the core set are printed afterwards
    in their own section, which is omitted when there are none.
    """
    options = parse_arguments()

    repo_root = Path(__file__).parent.parent
    core_instances = count_instances(
        find_highlight_files(repo_root / 'crates/languages/src')
    )
    extension_instances = count_instances(
        find_highlight_files(repo_root / 'extensions/')
    )

    # Keep only the tags that the core languages never use.
    extension_only = {}
    for tag, details in extension_instances.items():
        if tag not in core_instances:
            extension_only[tag] = details

    print('Shared:\n')
    print_instances(core_instances, options.verbose)

    if not extension_only:
        return

    print('\nExtension-only:\n')
    print_instances(extension_only, options.verbose)
67
# Run the analysis only when executed as a script, not when imported.
if __name__ == '__main__':
    main()