license_detection.rs

  1use std::{
  2    collections::BTreeSet,
  3    fmt::{Display, Formatter},
  4    ops::Range,
  5    path::PathBuf,
  6    sync::{Arc, LazyLock},
  7};
  8
  9use anyhow::{Result, anyhow};
 10use fs::Fs;
 11use futures::StreamExt as _;
 12use gpui::{App, AppContext as _, Entity, Subscription, Task};
 13use itertools::Itertools;
 14use postage::watch;
 15use project::Worktree;
 16use strum::VariantArray;
 17use util::{ResultExt as _, maybe, rel_path::RelPath};
 18use worktree::ChildEntriesOptions;
 19
 20/// Matches the most common license locations, with US and UK English spelling.
 21static LICENSE_FILE_NAME_REGEX: LazyLock<regex::bytes::Regex> = LazyLock::new(|| {
 22    regex::bytes::RegexBuilder::new(
 23        "^ \
 24        (?: license | licence)? \
 25        (?: [\\-._]? \
 26            (?: apache (?: [\\-._] (?: 2.0 | 2 ))? | \
 27                0? bsd (?: [\\-._] [0123])? (?: [\\-._] clause)? | \
 28                isc | \
 29                mit | \
 30                upl | \
 31                zlib))? \
 32        (?: [\\-._]? (?: license | licence))? \
 33        (?: \\.txt | \\.md)? \
 34        $",
 35    )
 36    .ignore_whitespace(true)
 37    .case_insensitive(true)
 38    .build()
 39    .unwrap()
 40});
 41
/// The open source licenses that license detection can recognize.
///
/// The derived ordering follows the declaration order of the variants, and
/// `OpenSourceLicense::VARIANTS` (from the `VariantArray` derive) is what `LICENSE_PATTERNS`
/// iterates over when collecting the patterns to try.
#[derive(Debug, Clone, Copy, Eq, Ord, PartialOrd, PartialEq, VariantArray)]
pub enum OpenSourceLicense {
    /// Identified as `apache-2.0`.
    Apache2_0,
    /// Identified as `0bsd`.
    BSDZero,
    /// Identified as `bsd`; the individual BSD clause variants are not distinguished.
    BSD,
    /// Identified as `isc`.
    ISC,
    /// Identified as `mit`.
    MIT,
    /// Identified as `upl-1.0`.
    UPL1_0,
    /// Identified as `zlib`.
    Zlib,
}
 52
 53impl Display for OpenSourceLicense {
 54    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
 55        write!(f, "{}", self.spdx_identifier())
 56    }
 57}
 58
 59impl OpenSourceLicense {
 60    /// These are SPDX identifiers for the licenses, except for BSD, where the variants are not
 61    /// distinguished.
 62    pub fn spdx_identifier(&self) -> &'static str {
 63        match self {
 64            OpenSourceLicense::Apache2_0 => "apache-2.0",
 65            OpenSourceLicense::BSDZero => "0bsd",
 66            OpenSourceLicense::BSD => "bsd",
 67            OpenSourceLicense::ISC => "isc",
 68            OpenSourceLicense::MIT => "mit",
 69            OpenSourceLicense::UPL1_0 => "upl-1.0",
 70            OpenSourceLicense::Zlib => "zlib",
 71        }
 72    }
 73
 74    pub fn patterns(&self) -> &'static [&'static str] {
 75        match self {
 76            OpenSourceLicense::Apache2_0 => &[
 77                include_str!("../license_patterns/apache-2.0-pattern"),
 78                include_str!("../license_patterns/apache-2.0-reference-pattern"),
 79            ],
 80            OpenSourceLicense::BSDZero => &[include_str!("../license_patterns/0bsd-pattern")],
 81            OpenSourceLicense::BSD => &[include_str!("../license_patterns/bsd-pattern")],
 82            OpenSourceLicense::ISC => &[include_str!("../license_patterns/isc-pattern")],
 83            OpenSourceLicense::MIT => &[include_str!("../license_patterns/mit-pattern")],
 84            OpenSourceLicense::UPL1_0 => &[include_str!("../license_patterns/upl-1.0-pattern")],
 85            OpenSourceLicense::Zlib => &[include_str!("../license_patterns/zlib-pattern")],
 86        }
 87    }
 88}
 89
 90// TODO: Consider using databake or similar to not parse at runtime.
 91static LICENSE_PATTERNS: LazyLock<LicensePatterns> = LazyLock::new(|| {
 92    let mut approximate_max_length = 0;
 93    let mut patterns = Vec::new();
 94    for license in OpenSourceLicense::VARIANTS {
 95        for pattern in license.patterns() {
 96            let (pattern, length) = parse_pattern(pattern).unwrap();
 97            patterns.push((*license, pattern));
 98            approximate_max_length = approximate_max_length.max(length);
 99        }
100    }
101    LicensePatterns {
102        patterns,
103        approximate_max_length,
104    }
105});
106
107fn detect_license(text: &str) -> Option<OpenSourceLicense> {
108    let text = canonicalize_license_text(text);
109    for (license, pattern) in LICENSE_PATTERNS.patterns.iter() {
110        log::trace!("Checking if license is {}", license);
111        if check_pattern(&pattern, &text) {
112            return Some(*license);
113        }
114    }
115
116    None
117}
118
/// The parsed form of all embedded license patterns, as stored in `LICENSE_PATTERNS`.
struct LicensePatterns {
    /// Each parsed pattern paired with the license it identifies. `detect_license` tries them in
    /// this order and returns the license of the first match.
    patterns: Vec<(OpenSourceLicense, Vec<PatternPart>)>,
    /// The largest length estimate reported by `parse_pattern` across all patterns. Files larger
    /// than this many bytes are skipped by `LicenseDetectionWatcher::is_path_eligible`.
    approximate_max_length: usize,
}
123
/// One segment of a license pattern: some literal (canonicalized) text, optionally preceded by a
/// bounded run of arbitrary characters. Produced by `parse_pattern` and consumed by
/// `check_pattern`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct PatternPart {
    /// Indicates that matching `text` is optional. Skipping `match_any_chars` is conditional on
    /// matching `text`.
    optional: bool,
    /// Indicates the number of characters that can be skipped before matching `text`.
    match_any_chars: Range<usize>,
    /// The text to match, may be empty.
    text: String,
}
134
135/// Lines that start with "-- " begin a `PatternPart`. `-- 1..10` specifies `match_any_chars:
136/// 1..10`. `-- 1..10 optional:` additionally specifies `optional: true`. It's a parse error for a
137/// line to start with `--` without matching this format.
138///
139/// Text that does not have `--` prefixes participate in the `text` field and are canonicalized by
140/// lowercasing, replacing all runs of whitespace with a single space, and otherwise only keeping
141/// ascii alphanumeric characters.
142fn parse_pattern(pattern_source: &str) -> Result<(Vec<PatternPart>, usize)> {
143    let mut pattern = Vec::new();
144    let mut part = PatternPart::default();
145    let mut approximate_max_length = 0;
146    for line in pattern_source.lines() {
147        if let Some(directive) = line.trim().strip_prefix("--") {
148            if part != PatternPart::default() {
149                pattern.push(part);
150                part = PatternPart::default();
151            }
152            let valid = maybe!({
153                let directive_chunks = directive.split_whitespace().collect::<Vec<_>>();
154                if !(1..=2).contains(&directive_chunks.len()) {
155                    return None;
156                }
157                if directive_chunks.len() == 2 {
158                    part.optional = true;
159                }
160                let range_chunks = directive_chunks[0].split("..").collect::<Vec<_>>();
161                if range_chunks.len() != 2 {
162                    return None;
163                }
164                part.match_any_chars.start = range_chunks[0].parse::<usize>().ok()?;
165                part.match_any_chars.end = range_chunks[1].parse::<usize>().ok()?;
166                if part.match_any_chars.start > part.match_any_chars.end {
167                    return None;
168                }
169                approximate_max_length += part.match_any_chars.end;
170                Some(())
171            });
172            if valid.is_none() {
173                return Err(anyhow!("Invalid pattern directive: {}", line));
174            }
175            continue;
176        }
177        approximate_max_length += line.len() + 1;
178        let line = canonicalize_license_text(line);
179        if line.is_empty() {
180            continue;
181        }
182        if !part.text.is_empty() {
183            part.text.push(' ');
184        }
185        part.text.push_str(&line);
186    }
187    if part != PatternPart::default() {
188        pattern.push(part);
189    }
190    Ok((pattern, approximate_max_length))
191}
192
/// Checks a pattern against text by iterating over the pattern parts in reverse order, and checking
/// matches with the end of a prefix of the input. Assumes that `canonicalize_license_text` has
/// already been applied to the input.
///
/// Returns `true` when every non-optional part with text was found within its allowed window and
/// the number of characters left before the earliest match falls within the remaining
/// `match_any_chars` range.
fn check_pattern(pattern: &[PatternPart], input: &str) -> bool {
    // Byte offset where the portion of `input` matched so far begins.
    let mut input_ix = input.len();
    // How many characters may be skipped immediately before `input_ix`.
    let mut match_any_chars = 0..0;
    for part in pattern.iter().rev() {
        if part.text.is_empty() {
            // A part without text only widens the amount of skippable characters.
            match_any_chars.start += part.match_any_chars.start;
            match_any_chars.end += part.match_any_chars.end;
            continue;
        }

        // The match must end at least `match_any_chars.start` characters before `input_ix`.
        let search_range_end = n_chars_before_offset(match_any_chars.start, input_ix, input);
        // The window reaches back far enough to cover the skippable span plus the text itself.
        // `part.text.len()` is a byte length used as a character count, so for non-ASCII text
        // the window is somewhat larger than strictly needed.
        let search_range_start = n_chars_before_offset(
            match_any_chars.len() + part.text.len(),
            search_range_end,
            input,
        );
        // Take the last occurrence within the window.
        let found_ix = input[search_range_start..search_range_end].rfind(&part.text);

        if let Some(found_ix) = found_ix {
            input_ix = search_range_start + found_ix;
            match_any_chars = part.match_any_chars.clone();
        } else if !part.optional {
            log::trace!(
                "Failed to match pattern\n`...{}`\nagainst input\n`...{}`",
                &part.text[n_chars_before_offset(128, part.text.len(), &part.text)..],
                &input[n_chars_before_offset(128, search_range_end, input)..search_range_end],
            );
            return false;
        }
        // An optional part that was not found is skipped, leaving the state untouched.
    }
    // Whatever precedes the earliest match must fit the remaining skippable range.
    is_char_count_within_range(&input[..input_ix], match_any_chars)
}
228
/// Returns the byte index reached by stepping `char_count` characters backwards from the byte
/// `offset` in `string`. Yields `offset` itself when `char_count` is zero, and `0` when fewer than
/// `char_count` characters precede `offset`.
fn n_chars_before_offset(char_count: usize, offset: usize, string: &str) -> usize {
    let Some(steps_back) = char_count.checked_sub(1) else {
        return offset;
    };
    match string[..offset].char_indices().rev().nth(steps_back) {
        Some((byte_ix, _)) => byte_ix,
        None => 0,
    }
}
238
/// Returns whether the number of characters in `string` lies within `char_count_range`.
///
/// A UTF-8 character occupies between one and four bytes, so the byte length often decides the
/// answer without counting characters; the characters are only counted when it does not. The
/// multiplications saturate so that very large range bounds cannot overflow.
fn is_char_count_within_range(string: &str, char_count_range: Range<usize>) -> bool {
    let byte_len = string.len();
    let Range { start, end } = char_count_range;

    // At least `byte_len / 4` and at most `byte_len` characters: certainly inside the range.
    if byte_len >= start.saturating_mul(4) && byte_len < end {
        return true;
    }
    // Too few bytes to reach `start`, or too many to stay below `end`: certainly outside.
    if byte_len < start || byte_len >= end.saturating_mul(4) {
        return false;
    }
    (start..end).contains(&string.chars().count())
}
248
249/// Canonicalizes license text by removing all non-alphanumeric characters, lowercasing, and turning
250/// runs of whitespace into a single space. Unicode alphanumeric characters are intentionally
251/// preserved since these should cause license mismatch when not within a portion of the license
252/// where arbitrary text is allowed.
253fn canonicalize_license_text(license: &str) -> String {
254    license
255        .chars()
256        .filter(|c| c.is_ascii_whitespace() || c.is_alphanumeric())
257        .map(|c| c.to_ascii_lowercase())
258        .collect::<String>()
259        .split_ascii_whitespace()
260        .join(" ")
261}
262
/// Tracks whether a worktree contains a recognized open source license file.
///
/// Only the `Local` variant performs detection; `is_project_open_source` always answers `false`
/// for the other two.
pub enum LicenseDetectionWatcher {
    /// A local worktree with more than a single file.
    Local {
        /// Holds the latest detection result; starts out as `false`.
        is_open_source_rx: watch::Receiver<bool>,
        /// The background task that checks queued license files; kept here so it is owned by
        /// the watcher.
        _is_open_source_task: Task<()>,
        /// The subscription to worktree events that queues changed license files.
        _worktree_subscription: Subscription,
    },
    /// The worktree consists of a single file, so no detection is performed.
    SingleFile,
    /// The worktree is not local, so no detection is performed.
    Remote,
}
272
impl LicenseDetectionWatcher {
    /// Creates a watcher for `worktree`.
    ///
    /// Returns `SingleFile` for single-file worktrees and `Remote` for worktrees that are not
    /// `Worktree::Local`. Otherwise queues every top-level file whose name matches
    /// `LICENSE_FILE_NAME_REGEX`, subscribes to worktree updates to queue matching files again
    /// when they change, and spawns a background task that checks the queued files.
    pub fn new(worktree: &Entity<Worktree>, cx: &mut App) -> Self {
        let worktree_ref = worktree.read(cx);
        if worktree_ref.is_single_file() {
            return Self::SingleFile;
        }

        // Paths (relative to the worktree root) waiting to be checked by the background task.
        let (files_to_check_tx, mut files_to_check_rx) = futures::channel::mpsc::unbounded();

        let Worktree::Local(local_worktree) = worktree_ref else {
            return Self::Remote;
        };
        let fs = local_worktree.fs().clone();

        // Initial scan: only files directly under the root, including ignored ones.
        let options = ChildEntriesOptions {
            include_files: true,
            include_dirs: false,
            include_ignored: true,
        };
        for top_file in local_worktree.child_entries_with_options(RelPath::empty(), options) {
            let path_bytes = top_file.path.as_unix_str().as_bytes();
            if top_file.is_created() && LICENSE_FILE_NAME_REGEX.is_match(path_bytes) {
                let rel_path = top_file.path.clone();
                files_to_check_tx.unbounded_send(rel_path).ok();
            }
        }

        // Re-queue license files whenever the worktree reports them as updated. The regex is
        // matched against the whole relative path, so nested files never match.
        let _worktree_subscription =
            cx.subscribe(worktree, move |_worktree, event, _cx| match event {
                worktree::Event::UpdatedEntries(updated_entries) => {
                    for updated_entry in updated_entries.iter() {
                        let rel_path = &updated_entry.0;
                        let path_bytes = rel_path.as_unix_str().as_bytes();
                        if LICENSE_FILE_NAME_REGEX.is_match(path_bytes) {
                            files_to_check_tx.unbounded_send(rel_path.clone()).ok();
                        }
                    }
                }
                worktree::Event::DeletedEntry(_)
                | worktree::Event::UpdatedGitRepositories(_)
                | worktree::Event::Deleted => {}
            });

        let worktree_snapshot = worktree.read(cx).snapshot();
        let (mut is_open_source_tx, is_open_source_rx) = watch::channel_with::<bool>(false);

        let _is_open_source_task = cx.background_spawn(async move {
            // Relative paths of the files currently known to contain an eligible license.
            let mut eligible_licenses = BTreeSet::new();
            while let Some(rel_path) = files_to_check_rx.next().await {
                let abs_path = worktree_snapshot.absolutize(&rel_path);
                let was_open_source = !eligible_licenses.is_empty();
                // A file that could not be checked (`None`) is treated as not eligible.
                if Self::is_path_eligible(&fs, abs_path).await.unwrap_or(false) {
                    eligible_licenses.insert(rel_path);
                } else {
                    eligible_licenses.remove(&rel_path);
                }
                // The project counts as open source while at least one eligible file remains;
                // the watch value is only written when that state flips.
                let is_open_source = !eligible_licenses.is_empty();
                if is_open_source != was_open_source {
                    *is_open_source_tx.borrow_mut() = is_open_source;
                }
            }
        });

        Self::Local {
            is_open_source_rx,
            _is_open_source_task,
            _worktree_subscription,
        }
    }

    /// Checks whether the file at `abs_path` contains a recognized license.
    ///
    /// Returns `Some(true)` / `Some(false)` according to `detect_license` once the file has been
    /// read. Returns `None` when the path cannot be canonicalized, its metadata is unavailable,
    /// the file is larger than `LICENSE_PATTERNS.approximate_max_length` bytes, or loading it
    /// fails.
    async fn is_path_eligible(fs: &Arc<dyn Fs>, abs_path: PathBuf) -> Option<bool> {
        log::debug!("checking if `{abs_path:?}` is an open source license");
        // resolve symlinks so that the file size from metadata is correct
        let Some(abs_path) = fs.canonicalize(&abs_path).await.ok() else {
            log::debug!(
                "`{abs_path:?}` license file probably deleted (error canonicalizing the path)"
            );
            return None;
        };
        // NOTE(review): the double `?` suggests `log_err()` yields an `Option` wrapping the
        // optional metadata, so both an error and missing metadata return `None` — confirm.
        let metadata = fs.metadata(&abs_path).await.log_err()??;
        // Skip files that are too large to match any pattern before reading them.
        if metadata.len > LICENSE_PATTERNS.approximate_max_length as u64 {
            log::debug!(
                "`{abs_path:?}` license file was skipped \
                because its size of {} bytes was larger than the max size of {} bytes",
                metadata.len,
                LICENSE_PATTERNS.approximate_max_length
            );
            return None;
        }
        let text = fs.load(&abs_path).await.log_err()?;
        let is_eligible = detect_license(&text).is_some();
        if is_eligible {
            log::debug!(
                "`{abs_path:?}` matches a license that is eligible for data collection (if enabled)"
            );
        } else {
            log::debug!(
                "`{abs_path:?}` does not match a license that is eligible for data collection"
            );
        }
        Some(is_eligible)
    }

    /// Answers false until we find out it's open source
    ///
    /// Always answers false for `SingleFile` and `Remote` watchers.
    pub fn is_project_open_source(&self) -> bool {
        match self {
            Self::Local {
                is_open_source_rx, ..
            } => *is_open_source_rx.borrow(),
            Self::SingleFile | Self::Remote => false,
        }
    }
}
386
// Tests for license text detection, the pattern matching helpers, the license file name regex,
// and the worktree watcher.
#[cfg(test)]
mod tests {
    use std::path::Path;

    use fs::FakeFs;
    use gpui::TestAppContext;
    use project::WorktreeId;
    use rand::Rng as _;
    use serde_json::json;
    use settings::SettingsStore;

    use super::*;

    // Example license texts shared by several tests.
    const APACHE_2_0_TXT: &str = include_str!("../license_examples/apache-2.0-ex0.txt");
    const ISC_TXT: &str = include_str!("../license_examples/isc.txt");
    const MIT_TXT: &str = include_str!("../license_examples/mit-ex0.txt");
    const UPL_1_0_TXT: &str = include_str!("../license_examples/upl-1.0.txt");
    const BSD_0_TXT: &str = include_str!("../license_examples/0bsd.txt");

    /// Asserts that `text` is detected as `license`, and that its length stays below the size
    /// limit derived from the patterns (larger files would be skipped by the watcher).
    #[track_caller]
    fn assert_matches_license(text: &str, license: OpenSourceLicense) {
        assert_eq!(detect_license(text), Some(license));
        assert!(text.len() < LICENSE_PATTERNS.approximate_max_length);
    }

    /*
    // Uncomment this and run with `cargo test -p zeta -- --no-capture &> licenses-output` to
    // traverse your entire home directory and run license detection on every file that has a
    // license-like name.
    #[test]
    fn test_check_all_licenses_in_home_dir() {
        let mut detected = Vec::new();
        let mut unrecognized = Vec::new();
        let mut walked_entries = 0;
        let homedir = std::env::home_dir().unwrap();
        for entry in walkdir::WalkDir::new(&homedir) {
            walked_entries += 1;
            if walked_entries % 10000 == 0 {
                println!(
                    "So far visited {} files in {}",
                    walked_entries,
                    homedir.display()
                );
            }
            let Ok(entry) = entry else {
                continue;
            };
            if !LICENSE_FILE_NAME_REGEX.is_match(entry.file_name().as_encoded_bytes()) {
                continue;
            }
            let Ok(contents) = std::fs::read_to_string(entry.path()) else {
                continue;
            };
            let path_string = entry.path().to_string_lossy().into_owned();
            let license = detect_license(&contents);
            match license {
                Some(license) => detected.push((license, path_string)),
                None => unrecognized.push(path_string),
            }
        }
        println!("\nDetected licenses:\n");
        detected.sort();
        for (license, path) in &detected {
            println!("{}: {}", license.spdx_identifier(), path);
        }
        println!("\nUnrecognized licenses:\n");
        for path in &unrecognized {
            println!("{}", path);
        }
        panic!(
            "{} licenses detected, {} unrecognized",
            detected.len(),
            unrecognized.len()
        );
        println!("This line has a warning to make sure this test is always commented out");
    }
    */

    #[test]
    fn test_apache_positive_detection() {
        assert_matches_license(APACHE_2_0_TXT, OpenSourceLicense::Apache2_0);
        assert_matches_license(
            include_str!("../license_examples/apache-2.0-ex1.txt"),
            OpenSourceLicense::Apache2_0,
        );
        assert_matches_license(
            include_str!("../license_examples/apache-2.0-ex2.txt"),
            OpenSourceLicense::Apache2_0,
        );
        assert_matches_license(
            include_str!("../license_examples/apache-2.0-ex3.txt"),
            OpenSourceLicense::Apache2_0,
        );
        assert_matches_license(
            include_str!("../license_examples/apache-2.0-ex4.txt"),
            OpenSourceLicense::Apache2_0,
        );
        assert_matches_license(
            include_str!("../../../LICENSE-APACHE"),
            OpenSourceLicense::Apache2_0,
        );
    }

    // Extra terms appended after the license text must prevent detection.
    #[test]
    fn test_apache_negative_detection() {
        assert_eq!(
            detect_license(&format!(
                "{APACHE_2_0_TXT}\n\nThe terms in this license are void if P=NP."
            )),
            None
        );
    }

    #[test]
    fn test_bsd_1_clause_positive_detection() {
        assert_matches_license(
            include_str!("../license_examples/bsd-1-clause.txt"),
            OpenSourceLicense::BSD,
        );
    }

    #[test]
    fn test_bsd_2_clause_positive_detection() {
        assert_matches_license(
            include_str!("../license_examples/bsd-2-clause-ex0.txt"),
            OpenSourceLicense::BSD,
        );
    }

    #[test]
    fn test_bsd_3_clause_positive_detection() {
        assert_matches_license(
            include_str!("../license_examples/bsd-3-clause-ex0.txt"),
            OpenSourceLicense::BSD,
        );
        assert_matches_license(
            include_str!("../license_examples/bsd-3-clause-ex1.txt"),
            OpenSourceLicense::BSD,
        );
        assert_matches_license(
            include_str!("../license_examples/bsd-3-clause-ex2.txt"),
            OpenSourceLicense::BSD,
        );
        assert_matches_license(
            include_str!("../license_examples/bsd-3-clause-ex3.txt"),
            OpenSourceLicense::BSD,
        );
        assert_matches_license(
            include_str!("../license_examples/bsd-3-clause-ex4.txt"),
            OpenSourceLicense::BSD,
        );
    }

    #[test]
    fn test_bsd_0_positive_detection() {
        assert_matches_license(BSD_0_TXT, OpenSourceLicense::BSDZero);
    }

    #[test]
    fn test_isc_positive_detection() {
        assert_matches_license(ISC_TXT, OpenSourceLicense::ISC);
    }

    // A dual-licensing notice appended after the license text must prevent detection.
    #[test]
    fn test_isc_negative_detection() {
        let license_text = format!(
            r#"{ISC_TXT}

            This project is dual licensed under the ISC License and the MIT License."#
        );

        assert_eq!(detect_license(&license_text), None);
    }

    #[test]
    fn test_mit_positive_detection() {
        assert_matches_license(MIT_TXT, OpenSourceLicense::MIT);
        assert_matches_license(
            include_str!("../license_examples/mit-ex1.txt"),
            OpenSourceLicense::MIT,
        );
        assert_matches_license(
            include_str!("../license_examples/mit-ex2.txt"),
            OpenSourceLicense::MIT,
        );
        assert_matches_license(
            include_str!("../license_examples/mit-ex3.txt"),
            OpenSourceLicense::MIT,
        );
    }

    // A dual-licensing notice appended after the license text must prevent detection.
    #[test]
    fn test_mit_negative_detection() {
        let license_text = format!(
            r#"{MIT_TXT}

            This project is dual licensed under the MIT License and the Apache License, Version 2.0."#
        );
        assert_eq!(detect_license(&license_text), None);
    }

    #[test]
    fn test_upl_positive_detection() {
        assert_matches_license(UPL_1_0_TXT, OpenSourceLicense::UPL1_0);
    }

    // A dual-licensing notice appended after the license text must prevent detection.
    #[test]
    fn test_upl_negative_detection() {
        let license_text = format!(
            r#"{UPL_1_0_TXT}

            This project is dual licensed under the UPL License and the MIT License."#
        );

        assert_eq!(detect_license(&license_text), None);
    }

    #[test]
    fn test_zlib_positive_detection() {
        assert_matches_license(
            include_str!("../license_examples/zlib-ex0.txt"),
            OpenSourceLicense::Zlib,
        );
    }

    // Strings of 512 random characters must never be detected as a license.
    #[test]
    fn random_strings_negative_detection() {
        for _i in 0..20 {
            let random_string = rand::rng()
                .sample_iter::<char, _>(rand::distr::StandardUniform)
                .take(512)
                .collect::<String>();
            assert_eq!(detect_license(&random_string), None);
        }
    }

    // Covers both an ASCII input and an input made of multi-byte characters.
    #[test]
    fn test_n_chars_before_offset() {
        assert_eq!(n_chars_before_offset(2, 4, "hello"), 2);

        let input = "ㄒ乇丂ㄒ";
        assert_eq!(n_chars_before_offset(2, input.len(), input), "ㄒ乇".len());
    }

    // Compares the byte-length shortcuts against a plain character count on random strings.
    #[test]
    fn test_is_char_count_within_range() {
        // TODO: make this into a proper property test.
        for _i in 0..20 {
            let mut rng = rand::rng();
            let random_char_count = rng.random_range(0..64);
            let random_string = rand::rng()
                .sample_iter::<char, _>(rand::distr::StandardUniform)
                .take(random_char_count)
                .collect::<String>();
            let min_chars = rng.random_range(0..10);
            let max_chars = rng.random_range(min_chars..32);
            let char_count_range = min_chars..max_chars;
            assert_eq!(
                is_char_count_within_range(&random_string, char_count_range.clone()),
                char_count_range.contains(&random_char_count),
            );
        }
    }

    #[test]
    fn test_license_file_name_regex() {
        // Test basic license file names
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENCE"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"license"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"licence"));

        // Test with extensions
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.txt"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.md"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENCE.txt"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENCE.md"));

        // Test with specific license types
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-APACHE"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-MIT"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.MIT"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE_MIT"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-ISC"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-UPL"));

        // Test with "license" coming after
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"APACHE-LICENSE"));

        // Test version numbers
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"APACHE-2"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"APACHE-2.0"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"BSD-1"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"BSD-2"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"BSD-3"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"BSD-3-CLAUSE"));

        // Test combinations
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-MIT.txt"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENCE.ISC.md"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"license_upl"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.APACHE.2.0"));

        // Test case insensitive
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"License"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"license-mit.TXT"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENCE_isc.MD"));

        // Test edge cases that should match
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"license.mit"));
        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"licence-upl.txt"));

        // Test non-matching patterns
        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"COPYING"));
        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.html"));
        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"MYLICENSE"));
        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"src/LICENSE"));
        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.old"));
        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-GPL"));
        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"LICENSEABC"));
    }

    #[test]
    fn test_canonicalize_license_text() {
        // Test newlines, blank lines, and leading/trailing whitespace
        let input = "  Paragraph 1\nwith multiple lines\n\n\n\nParagraph 2\nwith more lines\n  ";
        let expected = "paragraph 1 with multiple lines paragraph 2 with more lines";
        assert_eq!(canonicalize_license_text(input), expected);

        // Test tabs and mixed whitespace
        let input = "Word1\t\tWord2\n\n   Word3\r\n\r\n\r\nWord4   ";
        let expected = "word1 word2 word3 word4";
        assert_eq!(canonicalize_license_text(input), expected);
    }

    /// Installs a test `SettingsStore` as a global in the test app context.
    fn init_test(cx: &mut TestAppContext) {
        cx.update(|cx| {
            let settings_store = SettingsStore::test(cx);
            cx.set_global(settings_store);
        });
    }

    // A worktree rooted at a single file yields the `SingleFile` watcher, which never reports
    // the project as open source.
    #[gpui::test]
    async fn test_watcher_single_file(cx: &mut TestAppContext) {
        init_test(cx);

        let fs = FakeFs::new(cx.background_executor.clone());
        fs.insert_tree("/root", json!({ "main.rs": "fn main() {}" }))
            .await;

        let worktree = Worktree::local(
            Path::new("/root/main.rs"),
            true,
            fs.clone(),
            Default::default(),
            true,
            WorktreeId::from_proto(0),
            &mut cx.to_async(),
        )
        .await
        .unwrap();

        let watcher = cx.update(|cx| LicenseDetectionWatcher::new(&worktree, cx));
        assert!(matches!(watcher, LicenseDetectionWatcher::SingleFile));
        assert!(!watcher.is_project_open_source());
    }

    // The watcher follows license files as they are added and overwritten, and reports the
    // project as open source while at least one eligible license file remains.
    #[gpui::test]
    async fn test_watcher_updates_on_changes(cx: &mut TestAppContext) {
        init_test(cx);

        let fs = FakeFs::new(cx.background_executor.clone());
        fs.insert_tree("/root", json!({ "main.rs": "fn main() {}" }))
            .await;

        let worktree = Worktree::local(
            Path::new("/root"),
            true,
            fs.clone(),
            Default::default(),
            true,
            WorktreeId::from_proto(0),
            &mut cx.to_async(),
        )
        .await
        .unwrap();

        let watcher = cx.update(|cx| LicenseDetectionWatcher::new(&worktree, cx));
        assert!(matches!(watcher, LicenseDetectionWatcher::Local { .. }));
        assert!(!watcher.is_project_open_source());

        fs.write(Path::new("/root/LICENSE-MIT"), MIT_TXT.as_bytes())
            .await
            .unwrap();

        cx.background_executor.run_until_parked();
        assert!(watcher.is_project_open_source());

        fs.write(Path::new("/root/LICENSE-APACHE"), APACHE_2_0_TXT.as_bytes())
            .await
            .unwrap();

        cx.background_executor.run_until_parked();
        assert!(watcher.is_project_open_source());

        fs.write(Path::new("/root/LICENSE-MIT"), "Nevermind".as_bytes())
            .await
            .unwrap();

        // Still considered open source as LICENSE-APACHE is present
        cx.background_executor.run_until_parked();
        assert!(watcher.is_project_open_source());

        fs.write(
            Path::new("/root/LICENSE-APACHE"),
            "Also nevermind".as_bytes(),
        )
        .await
        .unwrap();

        cx.background_executor.run_until_parked();
        assert!(!watcher.is_project_open_source());
    }

    // A license file present when the watcher is created is picked up by the initial scan, and
    // deleting it makes the project stop counting as open source.
    #[gpui::test]
    async fn test_watcher_initially_opensource_and_then_deleted(cx: &mut TestAppContext) {
        init_test(cx);

        let fs = FakeFs::new(cx.background_executor.clone());
        fs.insert_tree(
            "/root",
            json!({ "main.rs": "fn main() {}", "LICENSE-MIT": MIT_TXT }),
        )
        .await;

        let worktree = Worktree::local(
            Path::new("/root"),
            true,
            fs.clone(),
            Default::default(),
            true,
            WorktreeId::from_proto(0),
            &mut cx.to_async(),
        )
        .await
        .unwrap();

        let watcher = cx.update(|cx| LicenseDetectionWatcher::new(&worktree, cx));
        assert!(matches!(watcher, LicenseDetectionWatcher::Local { .. }));

        cx.background_executor.run_until_parked();
        assert!(watcher.is_project_open_source());

        fs.remove_file(
            Path::new("/root/LICENSE-MIT"),
            fs::RemoveOptions {
                recursive: false,
                ignore_if_not_exists: false,
            },
        )
        .await
        .unwrap();

        cx.background_executor.run_until_parked();
        assert!(!watcher.is_project_open_source());
    }
}