license_detection.rs

  1use std::{
  2    collections::BTreeSet,
  3    fmt::{Display, Formatter},
  4    ops::Range,
  5    path::PathBuf,
  6    sync::{Arc, LazyLock},
  7};
  8
  9use anyhow::{Result, anyhow};
 10use fs::Fs;
 11use futures::StreamExt as _;
 12use gpui::{App, AppContext as _, Entity, Subscription, Task};
 13use itertools::Itertools;
 14use postage::watch;
 15use project::Worktree;
 16use strum::VariantArray;
 17use util::{ResultExt as _, maybe, rel_path::RelPath};
 18use worktree::ChildEntriesOptions;
 19
/// Matches the most common license file names, with US and UK English spelling.
///
/// The pattern is anchored at both ends, so it only matches a bare file name, not a path that
/// contains directories. Accepted shapes are `license`/`licence` optionally followed by a license
/// kind, or a license kind on its own; either may be followed by `license`/`licence` and then by
/// an optional `.txt` or `.md` extension. The recognized kinds are apache, (0)bsd, isc, mit, upl
/// and zlib.
///
/// The regex is built with `ignore_whitespace`, so the spaces and line breaks inside the pattern
/// string are layout only, and with `case_insensitive`.
///
/// NOTE(review): the `.` in `2.0` is not escaped, so it matches any character (for example
/// `apache-2x0`) — confirm whether that leniency is intended.
static LICENSE_FILE_NAME_REGEX: LazyLock<regex::bytes::Regex> = LazyLock::new(|| {
    regex::bytes::RegexBuilder::new(
        "^ \
        (?: \
            (?: license | licence) \
            (?: [\\-._]? \
                (?: apache (?: [\\-._] (?: 2.0 | 2 ))? | \
                    0? bsd (?: [\\-._] [0123])? (?: [\\-._] clause)? | \
                    isc | \
                    mit | \
                    upl | \
                    zlib))? \
          | \
            (?: apache (?: [\\-._] (?: 2.0 | 2 ))? | \
                0? bsd (?: [\\-._] [0123])? (?: [\\-._] clause)? | \
                isc | \
                mit | \
                upl | \
                zlib) \
        ) \
        (?: [\\-._]? (?: license | licence))? \
        (?: \\.txt | \\.md)? \
        $",
    )
    .ignore_whitespace(true)
    .case_insensitive(true)
    .build()
    .unwrap()
});
 50
/// The open source licenses that license detection can recognize.
///
/// Each variant has a lowercase identifier (see `spdx_identifier`) and one or more text patterns
/// (see `patterns`).
#[derive(Debug, Clone, Copy, Eq, Ord, PartialOrd, PartialEq, VariantArray)]
pub enum OpenSourceLicense {
    /// Identified as `apache-2.0`.
    Apache2_0,
    /// Identified as `0bsd`.
    BSDZero,
    /// Identified as `bsd`; the remaining BSD variants are not distinguished from each other.
    BSD,
    /// Identified as `isc`.
    ISC,
    /// Identified as `mit`.
    MIT,
    /// Identified as `upl-1.0`.
    UPL1_0,
    /// Identified as `zlib`.
    Zlib,
}
 61
 62impl Display for OpenSourceLicense {
 63    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
 64        write!(f, "{}", self.spdx_identifier())
 65    }
 66}
 67
 68impl OpenSourceLicense {
 69    /// These are SPDX identifiers for the licenses, except for BSD, where the variants are not
 70    /// distinguished.
 71    pub fn spdx_identifier(&self) -> &'static str {
 72        match self {
 73            OpenSourceLicense::Apache2_0 => "apache-2.0",
 74            OpenSourceLicense::BSDZero => "0bsd",
 75            OpenSourceLicense::BSD => "bsd",
 76            OpenSourceLicense::ISC => "isc",
 77            OpenSourceLicense::MIT => "mit",
 78            OpenSourceLicense::UPL1_0 => "upl-1.0",
 79            OpenSourceLicense::Zlib => "zlib",
 80        }
 81    }
 82
 83    pub fn patterns(&self) -> &'static [&'static str] {
 84        match self {
 85            OpenSourceLicense::Apache2_0 => &[
 86                include_str!("../license_patterns/apache-2.0-pattern"),
 87                include_str!("../license_patterns/apache-2.0-reference-pattern"),
 88            ],
 89            OpenSourceLicense::BSDZero => &[include_str!("../license_patterns/0bsd-pattern")],
 90            OpenSourceLicense::BSD => &[include_str!("../license_patterns/bsd-pattern")],
 91            OpenSourceLicense::ISC => &[include_str!("../license_patterns/isc-pattern")],
 92            OpenSourceLicense::MIT => &[include_str!("../license_patterns/mit-pattern")],
 93            OpenSourceLicense::UPL1_0 => &[include_str!("../license_patterns/upl-1.0-pattern")],
 94            OpenSourceLicense::Zlib => &[include_str!("../license_patterns/zlib-pattern")],
 95        }
 96    }
 97}
 98
 99// TODO: Consider using databake or similar to not parse at runtime.
100static LICENSE_PATTERNS: LazyLock<LicensePatterns> = LazyLock::new(|| {
101    let mut approximate_max_length = 0;
102    let mut patterns = Vec::new();
103    for license in OpenSourceLicense::VARIANTS {
104        for pattern in license.patterns() {
105            let (pattern, length) = parse_pattern(pattern).unwrap();
106            patterns.push((*license, pattern));
107            approximate_max_length = approximate_max_length.max(length);
108        }
109    }
110    LicensePatterns {
111        patterns,
112        approximate_max_length,
113    }
114});
115
116fn detect_license(text: &str) -> Option<OpenSourceLicense> {
117    let text = canonicalize_license_text(text);
118    for (license, pattern) in LICENSE_PATTERNS.patterns.iter() {
119        log::trace!("Checking if license is {}", license);
120        if check_pattern(&pattern, &text) {
121            return Some(*license);
122        }
123    }
124
125    None
126}
127
/// All parsed license patterns together with a size bound for candidate license files.
struct LicensePatterns {
    /// Every parsed pattern paired with the license it identifies, in the order they are tried.
    patterns: Vec<(OpenSourceLicense, Vec<PatternPart>)>,
    /// The largest approximate length among the patterns. License files whose size in bytes
    /// exceeds this value are skipped without being read.
    approximate_max_length: usize,
}
132
/// One step of a license pattern: an allowance of arbitrary characters followed by literal text.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct PatternPart {
    /// Indicates that matching `text` is optional. Skipping `match_any_chars` is conditional on
    /// matching `text`: when an optional part is not found, its skip allowance is not applied.
    optional: bool,
    /// Indicates the number of characters that can be skipped before matching `text`.
    match_any_chars: Range<usize>,
    /// The text to match, may be empty. Stored in the form produced by
    /// `canonicalize_license_text`.
    text: String,
}
143
144/// Lines that start with "-- " begin a `PatternPart`. `-- 1..10` specifies `match_any_chars:
145/// 1..10`. `-- 1..10 optional:` additionally specifies `optional: true`. It's a parse error for a
146/// line to start with `--` without matching this format.
147///
148/// Text that does not have `--` prefixes participate in the `text` field and are canonicalized by
149/// lowercasing, replacing all runs of whitespace with a single space, and otherwise only keeping
150/// ascii alphanumeric characters.
151fn parse_pattern(pattern_source: &str) -> Result<(Vec<PatternPart>, usize)> {
152    let mut pattern = Vec::new();
153    let mut part = PatternPart::default();
154    let mut approximate_max_length = 0;
155    for line in pattern_source.lines() {
156        if let Some(directive) = line.trim().strip_prefix("--") {
157            if part != PatternPart::default() {
158                pattern.push(part);
159                part = PatternPart::default();
160            }
161            let valid = maybe!({
162                let directive_chunks = directive.split_whitespace().collect::<Vec<_>>();
163                if !(1..=2).contains(&directive_chunks.len()) {
164                    return None;
165                }
166                if directive_chunks.len() == 2 {
167                    part.optional = true;
168                }
169                let range_chunks = directive_chunks[0].split("..").collect::<Vec<_>>();
170                if range_chunks.len() != 2 {
171                    return None;
172                }
173                part.match_any_chars.start = range_chunks[0].parse::<usize>().ok()?;
174                part.match_any_chars.end = range_chunks[1].parse::<usize>().ok()?;
175                if part.match_any_chars.start > part.match_any_chars.end {
176                    return None;
177                }
178                approximate_max_length += part.match_any_chars.end;
179                Some(())
180            });
181            if valid.is_none() {
182                return Err(anyhow!("Invalid pattern directive: {}", line));
183            }
184            continue;
185        }
186        approximate_max_length += line.len() + 1;
187        let line = canonicalize_license_text(line);
188        if line.is_empty() {
189            continue;
190        }
191        if !part.text.is_empty() {
192            part.text.push(' ');
193        }
194        part.text.push_str(&line);
195    }
196    if part != PatternPart::default() {
197        pattern.push(part);
198    }
199    Ok((pattern, approximate_max_length))
200}
201
202/// Checks a pattern against text by iterating over the pattern parts in reverse order, and checking
203/// matches with the end of a prefix of the input. Assumes that `canonicalize_license_text` has
204/// already been applied to the input.
205fn check_pattern(pattern: &[PatternPart], input: &str) -> bool {
206    let mut input_ix = input.len();
207    let mut match_any_chars = 0..0;
208    for part in pattern.iter().rev() {
209        if part.text.is_empty() {
210            match_any_chars.start += part.match_any_chars.start;
211            match_any_chars.end += part.match_any_chars.end;
212            continue;
213        }
214
215        let search_range_end = n_chars_before_offset(match_any_chars.start, input_ix, input);
216        let search_range_start = n_chars_before_offset(
217            match_any_chars.len() + part.text.len(),
218            search_range_end,
219            input,
220        );
221        let found_ix = input[search_range_start..search_range_end].rfind(&part.text);
222
223        if let Some(found_ix) = found_ix {
224            input_ix = search_range_start + found_ix;
225            match_any_chars = part.match_any_chars.clone();
226        } else if !part.optional {
227            log::trace!(
228                "Failed to match pattern\n`...{}`\nagainst input\n`...{}`",
229                &part.text[n_chars_before_offset(128, part.text.len(), &part.text)..],
230                &input[n_chars_before_offset(128, search_range_end, input)..search_range_end],
231            );
232            return false;
233        }
234    }
235    is_char_count_within_range(&input[..input_ix], match_any_chars)
236}
237
/// Returns the byte offset reached by stepping `char_count` characters backwards from the byte
/// `offset` in `string`. Stepping past the beginning of the string yields `0`.
///
/// Panics if `offset` is out of bounds or does not fall on a character boundary.
fn n_chars_before_offset(char_count: usize, offset: usize, string: &str) -> usize {
    match char_count.checked_sub(1) {
        // Zero characters: stay where we are.
        None => offset,
        Some(chars_to_skip) => string[..offset]
            .char_indices()
            .rev()
            .nth(chars_to_skip)
            .map(|(byte_ix, _)| byte_ix)
            .unwrap_or(0),
    }
}
247
/// Reports whether the number of characters in `string` lies within `char_count_range`
/// (`start` inclusive, `end` exclusive).
///
/// Counting characters requires scanning the string, so the byte length is consulted first: a
/// `char` encodes to between 1 and 4 UTF-8 bytes, which bounds the character count between
/// `len / 4` and `len`. The string is only scanned when those bounds cannot settle the answer.
fn is_char_count_within_range(string: &str, char_count_range: Range<usize>) -> bool {
    let byte_len = string.len();
    let Range { start, end } = char_count_range;

    // Saturate instead of overflowing for very large bounds. A saturated product can never be
    // reached by a real string length, so the affected fast path is simply skipped.
    let start_in_max_bytes = start.saturating_mul(4);
    let end_in_max_bytes = end.saturating_mul(4);

    // Even at 4 bytes per character there are at least `start` characters, and even at 1 byte
    // per character there are fewer than `end`.
    if byte_len >= start_in_max_bytes && byte_len < end {
        return true;
    }
    // Either too few bytes to hold `start` characters, or too many bytes for fewer than `end`.
    if byte_len < start || byte_len >= end_in_max_bytes {
        return false;
    }
    (start..end).contains(&string.chars().count())
}
257
/// Canonicalizes license text: characters that are neither alphanumeric nor ASCII whitespace are
/// dropped, ASCII letters are lowercased, and each run of ASCII whitespace becomes a single space
/// (leading and trailing whitespace disappears). Unicode alphanumeric characters are kept on
/// purpose, so that they cause a mismatch unless they fall in a portion of the license where
/// arbitrary text is allowed.
fn canonicalize_license_text(license: &str) -> String {
    let mut canonical = String::with_capacity(license.len());
    // Set when whitespace was seen after some output; the space is emitted lazily so that
    // trailing whitespace, and whitespace split by dropped characters, never doubles up.
    let mut space_pending = false;
    for c in license.chars() {
        if c.is_ascii_whitespace() {
            space_pending = !canonical.is_empty();
        } else if c.is_alphanumeric() {
            if space_pending {
                canonical.push(' ');
                space_pending = false;
            }
            canonical.push(c.to_ascii_lowercase());
        }
    }
    canonical
}
271
/// Tracks whether a worktree looks like an open source project, judging by the license files at
/// its root.
pub enum LicenseDetectionWatcher {
    /// A local worktree whose license files are being watched.
    Local {
        /// Latest answer published by the background task; starts out as `false`.
        is_open_source_rx: watch::Receiver<bool>,
        /// Background task that checks queued license files. Never read; presumably stored so
        /// the task lives as long as the watcher.
        _is_open_source_task: Task<()>,
        /// Subscription that queues license files again when worktree entries are updated.
        /// Never read; presumably stored so the subscription lives as long as the watcher.
        _worktree_subscription: Subscription,
    },
    /// The worktree consists of a single file; always reported as not open source.
    SingleFile,
    /// The worktree is not local; always reported as not open source.
    Remote,
}
281
282impl LicenseDetectionWatcher {
283    pub fn new(worktree: &Entity<Worktree>, cx: &mut App) -> Self {
284        let worktree_ref = worktree.read(cx);
285        if worktree_ref.is_single_file() {
286            return Self::SingleFile;
287        }
288
289        let (files_to_check_tx, mut files_to_check_rx) = futures::channel::mpsc::unbounded();
290
291        let Worktree::Local(local_worktree) = worktree_ref else {
292            return Self::Remote;
293        };
294        let fs = local_worktree.fs().clone();
295
296        let options = ChildEntriesOptions {
297            include_files: true,
298            include_dirs: false,
299            include_ignored: true,
300        };
301        for top_file in local_worktree.child_entries_with_options(RelPath::empty(), options) {
302            let path_bytes = top_file.path.as_unix_str().as_bytes();
303            if top_file.is_created() && LICENSE_FILE_NAME_REGEX.is_match(path_bytes) {
304                let rel_path = top_file.path.clone();
305                files_to_check_tx.unbounded_send(rel_path).ok();
306            }
307        }
308
309        let _worktree_subscription =
310            cx.subscribe(worktree, move |_worktree, event, _cx| match event {
311                worktree::Event::UpdatedEntries(updated_entries) => {
312                    for updated_entry in updated_entries.iter() {
313                        let rel_path = &updated_entry.0;
314                        let path_bytes = rel_path.as_unix_str().as_bytes();
315                        if LICENSE_FILE_NAME_REGEX.is_match(path_bytes) {
316                            files_to_check_tx.unbounded_send(rel_path.clone()).ok();
317                        }
318                    }
319                }
320                worktree::Event::DeletedEntry(_)
321                | worktree::Event::UpdatedGitRepositories(_)
322                | worktree::Event::UpdatedRootRepoCommonDir
323                | worktree::Event::Deleted => {}
324            });
325
326        let worktree_snapshot = worktree.read(cx).snapshot();
327        let (mut is_open_source_tx, is_open_source_rx) = watch::channel_with::<bool>(false);
328
329        let _is_open_source_task = cx.background_spawn(async move {
330            let mut eligible_licenses = BTreeSet::new();
331            while let Some(rel_path) = files_to_check_rx.next().await {
332                let abs_path = worktree_snapshot.absolutize(&rel_path);
333                let was_open_source = !eligible_licenses.is_empty();
334                if Self::is_path_eligible(&fs, abs_path).await.unwrap_or(false) {
335                    eligible_licenses.insert(rel_path);
336                } else {
337                    eligible_licenses.remove(&rel_path);
338                }
339                let is_open_source = !eligible_licenses.is_empty();
340                if is_open_source != was_open_source {
341                    *is_open_source_tx.borrow_mut() = is_open_source;
342                }
343            }
344        });
345
346        Self::Local {
347            is_open_source_rx,
348            _is_open_source_task,
349            _worktree_subscription,
350        }
351    }
352
353    async fn is_path_eligible(fs: &Arc<dyn Fs>, abs_path: PathBuf) -> Option<bool> {
354        log::debug!("checking if `{abs_path:?}` is an open source license");
355        // resolve symlinks so that the file size from metadata is correct
356        let Some(abs_path) = fs.canonicalize(&abs_path).await.ok() else {
357            log::debug!(
358                "`{abs_path:?}` license file probably deleted (error canonicalizing the path)"
359            );
360            return None;
361        };
362        let metadata = fs.metadata(&abs_path).await.log_err()??;
363        if metadata.is_dir {
364            return None;
365        }
366        if metadata.len > LICENSE_PATTERNS.approximate_max_length as u64 {
367            log::debug!(
368                "`{abs_path:?}` license file was skipped \
369                because its size of {} bytes was larger than the max size of {} bytes",
370                metadata.len,
371                LICENSE_PATTERNS.approximate_max_length
372            );
373            return None;
374        }
375        let text = fs.load(&abs_path).await.log_err()?;
376        let is_eligible = detect_license(&text).is_some();
377        if is_eligible {
378            log::debug!(
379                "`{abs_path:?}` matches a license that is eligible for data collection (if enabled)"
380            );
381        } else {
382            log::debug!(
383                "`{abs_path:?}` does not match a license that is eligible for data collection"
384            );
385        }
386        Some(is_eligible)
387    }
388
389    /// Answers false until we find out it's open source
390    pub fn is_project_open_source(&self) -> bool {
391        match self {
392            Self::Local {
393                is_open_source_rx, ..
394            } => *is_open_source_rx.borrow(),
395            Self::SingleFile | Self::Remote => false,
396        }
397    }
398}
399
400#[cfg(test)]
401mod tests {
402    use std::path::Path;
403
404    use fs::FakeFs;
405    use gpui::TestAppContext;
406    use project::WorktreeId;
407    use rand::Rng as _;
408    use serde_json::json;
409    use settings::SettingsStore;
410
411    use super::*;
412
413    const APACHE_2_0_TXT: &str = include_str!("../license_examples/apache-2.0-ex0.txt");
414    const ISC_TXT: &str = include_str!("../license_examples/isc.txt");
415    const MIT_TXT: &str = include_str!("../license_examples/mit-ex0.txt");
416    const UPL_1_0_TXT: &str = include_str!("../license_examples/upl-1.0.txt");
417    const BSD_0_TXT: &str = include_str!("../license_examples/0bsd.txt");
418
    /// Asserts that `text` is detected as `license`, and that it is shorter than
    /// `LICENSE_PATTERNS.approximate_max_length` so that the size check in `is_path_eligible`
    /// would not skip a file with this content.
    #[track_caller]
    fn assert_matches_license(text: &str, license: OpenSourceLicense) {
        assert_eq!(detect_license(text), Some(license));
        assert!(text.len() < LICENSE_PATTERNS.approximate_max_length);
    }
424
425    /*
426    // Uncomment this and run with `cargo test -p zeta -- --no-capture &> licenses-output` to
427    // traverse your entire home directory and run license detection on every file that has a
428    // license-like name.
429    #[test]
430    fn test_check_all_licenses_in_home_dir() {
431        let mut detected = Vec::new();
432        let mut unrecognized = Vec::new();
433        let mut walked_entries = 0;
434        let homedir = std::env::home_dir().unwrap();
435        for entry in walkdir::WalkDir::new(&homedir) {
436            walked_entries += 1;
437            if walked_entries % 10000 == 0 {
438                println!(
439                    "So far visited {} files in {}",
440                    walked_entries,
441                    homedir.display()
442                );
443            }
444            let Ok(entry) = entry else {
445                continue;
446            };
447            if !LICENSE_FILE_NAME_REGEX.is_match(entry.file_name().as_encoded_bytes()) {
448                continue;
449            }
450            let Ok(contents) = std::fs::read_to_string(entry.path()) else {
451                continue;
452            };
453            let path_string = entry.path().to_string_lossy().into_owned();
454            let license = detect_license(&contents);
455            match license {
456                Some(license) => detected.push((license, path_string)),
457                None => unrecognized.push(path_string),
458            }
459        }
460        println!("\nDetected licenses:\n");
461        detected.sort();
462        for (license, path) in &detected {
463            println!("{}: {}", license.spdx_identifier(), path);
464        }
465        println!("\nUnrecognized licenses:\n");
466        for path in &unrecognized {
467            println!("{}", path);
468        }
469        panic!(
470            "{} licenses detected, {} unrecognized",
471            detected.len(),
472            unrecognized.len()
473        );
474        println!("This line has a warning to make sure this test is always commented out");
475    }
476    */
477
478    #[test]
479    fn test_apache_positive_detection() {
480        assert_matches_license(APACHE_2_0_TXT, OpenSourceLicense::Apache2_0);
481        assert_matches_license(
482            include_str!("../license_examples/apache-2.0-ex1.txt"),
483            OpenSourceLicense::Apache2_0,
484        );
485        assert_matches_license(
486            include_str!("../license_examples/apache-2.0-ex2.txt"),
487            OpenSourceLicense::Apache2_0,
488        );
489        assert_matches_license(
490            include_str!("../license_examples/apache-2.0-ex3.txt"),
491            OpenSourceLicense::Apache2_0,
492        );
493        assert_matches_license(
494            include_str!("../license_examples/apache-2.0-ex4.txt"),
495            OpenSourceLicense::Apache2_0,
496        );
497        assert_matches_license(
498            include_str!("../../../LICENSE-APACHE"),
499            OpenSourceLicense::Apache2_0,
500        );
501    }
502
503    #[test]
504    fn test_apache_negative_detection() {
505        assert_eq!(
506            detect_license(&format!(
507                "{APACHE_2_0_TXT}\n\nThe terms in this license are void if P=NP."
508            )),
509            None
510        );
511    }
512
    /// The 1-clause BSD example is detected as the undifferentiated `BSD` license.
    #[test]
    fn test_bsd_1_clause_positive_detection() {
        assert_matches_license(
            include_str!("../license_examples/bsd-1-clause.txt"),
            OpenSourceLicense::BSD,
        );
    }
520
    /// The 2-clause BSD example is detected as the undifferentiated `BSD` license.
    #[test]
    fn test_bsd_2_clause_positive_detection() {
        assert_matches_license(
            include_str!("../license_examples/bsd-2-clause-ex0.txt"),
            OpenSourceLicense::BSD,
        );
    }
528
529    #[test]
530    fn test_bsd_3_clause_positive_detection() {
531        assert_matches_license(
532            include_str!("../license_examples/bsd-3-clause-ex0.txt"),
533            OpenSourceLicense::BSD,
534        );
535        assert_matches_license(
536            include_str!("../license_examples/bsd-3-clause-ex1.txt"),
537            OpenSourceLicense::BSD,
538        );
539        assert_matches_license(
540            include_str!("../license_examples/bsd-3-clause-ex2.txt"),
541            OpenSourceLicense::BSD,
542        );
543        assert_matches_license(
544            include_str!("../license_examples/bsd-3-clause-ex3.txt"),
545            OpenSourceLicense::BSD,
546        );
547        assert_matches_license(
548            include_str!("../license_examples/bsd-3-clause-ex4.txt"),
549            OpenSourceLicense::BSD,
550        );
551    }
552
    /// The 0BSD example is detected as `BSDZero` rather than as `BSD`.
    #[test]
    fn test_bsd_0_positive_detection() {
        assert_matches_license(BSD_0_TXT, OpenSourceLicense::BSDZero);
    }
557
    /// The ISC example is detected as `ISC`.
    #[test]
    fn test_isc_positive_detection() {
        assert_matches_license(ISC_TXT, OpenSourceLicense::ISC);
    }
562
    /// A dual-licensing sentence appended after the ISC text prevents detection.
    #[test]
    fn test_isc_negative_detection() {
        let license_text = format!(
            r#"{ISC_TXT}

            This project is dual licensed under the ISC License and the MIT License."#
        );

        assert_eq!(detect_license(&license_text), None);
    }
573
574    #[test]
575    fn test_mit_positive_detection() {
576        assert_matches_license(MIT_TXT, OpenSourceLicense::MIT);
577        assert_matches_license(
578            include_str!("../license_examples/mit-ex1.txt"),
579            OpenSourceLicense::MIT,
580        );
581        assert_matches_license(
582            include_str!("../license_examples/mit-ex2.txt"),
583            OpenSourceLicense::MIT,
584        );
585        assert_matches_license(
586            include_str!("../license_examples/mit-ex3.txt"),
587            OpenSourceLicense::MIT,
588        );
589    }
590
    /// A dual-licensing sentence appended after the MIT text prevents detection.
    #[test]
    fn test_mit_negative_detection() {
        let license_text = format!(
            r#"{MIT_TXT}

            This project is dual licensed under the MIT License and the Apache License, Version 2.0."#
        );
        assert_eq!(detect_license(&license_text), None);
    }
600
    /// The UPL 1.0 example is detected as `UPL1_0`.
    #[test]
    fn test_upl_positive_detection() {
        assert_matches_license(UPL_1_0_TXT, OpenSourceLicense::UPL1_0);
    }
605
    /// A dual-licensing sentence appended after the UPL text prevents detection.
    #[test]
    fn test_upl_negative_detection() {
        let license_text = format!(
            r#"{UPL_1_0_TXT}

            This project is dual licensed under the UPL License and the MIT License."#
        );

        assert_eq!(detect_license(&license_text), None);
    }
616
    /// The zlib example is detected as `Zlib`.
    #[test]
    fn test_zlib_positive_detection() {
        assert_matches_license(
            include_str!("../license_examples/zlib-ex0.txt"),
            OpenSourceLicense::Zlib,
        );
    }
624
625    #[test]
626    fn random_strings_negative_detection() {
627        for _i in 0..20 {
628            let random_string = rand::rng()
629                .sample_iter::<char, _>(rand::distr::StandardUniform)
630                .take(512)
631                .collect::<String>();
632            assert_eq!(detect_license(&random_string), None);
633        }
634    }
635
    /// Stepping back is measured in characters, not bytes.
    #[test]
    fn test_n_chars_before_offset() {
        // ASCII: two characters before byte 4 is byte 2.
        assert_eq!(n_chars_before_offset(2, 4, "hello"), 2);

        // Multi-byte characters: two characters back from the end leaves the first two intact.
        let input = "ㄒ乇丂ㄒ";
        assert_eq!(n_chars_before_offset(2, input.len(), input), "ㄒ乇".len());
    }
643
644    #[test]
645    fn test_is_char_count_within_range() {
646        // TODO: make this into a proper property test.
647        for _i in 0..20 {
648            let mut rng = rand::rng();
649            let random_char_count = rng.random_range(0..64);
650            let random_string = rand::rng()
651                .sample_iter::<char, _>(rand::distr::StandardUniform)
652                .take(random_char_count)
653                .collect::<String>();
654            let min_chars = rng.random_range(0..10);
655            let max_chars = rng.random_range(min_chars..32);
656            let char_count_range = min_chars..max_chars;
657            assert_eq!(
658                is_char_count_within_range(&random_string, char_count_range.clone()),
659                char_count_range.contains(&random_char_count),
660            );
661        }
662    }
663
664    #[test]
665    fn test_license_file_name_regex() {
666        // Test basic license file names
667        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE"));
668        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENCE"));
669        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"license"));
670        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"licence"));
671
672        // Test with extensions
673        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.txt"));
674        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.md"));
675        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENCE.txt"));
676        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENCE.md"));
677
678        // Test with specific license types
679        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-APACHE"));
680        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-MIT"));
681        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.MIT"));
682        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE_MIT"));
683        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-ISC"));
684        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-UPL"));
685
686        // Test with "license" coming after
687        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"APACHE-LICENSE"));
688
689        // Test version numbers
690        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"APACHE-2"));
691        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"APACHE-2.0"));
692        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"BSD-1"));
693        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"BSD-2"));
694        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"BSD-3"));
695        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"BSD-3-CLAUSE"));
696
697        // Test combinations
698        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-MIT.txt"));
699        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENCE.ISC.md"));
700        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"license_upl"));
701        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.APACHE.2.0"));
702
703        // Test case insensitive
704        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"License"));
705        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"license-mit.TXT"));
706        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"LICENCE_isc.MD"));
707
708        // Test edge cases that should match
709        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"license.mit"));
710        assert!(LICENSE_FILE_NAME_REGEX.is_match(b"licence-upl.txt"));
711
712        // Test non-matching patterns
713        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b""));
714        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"COPYING"));
715        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.html"));
716        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"MYLICENSE"));
717        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"src/LICENSE"));
718        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE.old"));
719        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"LICENSE-GPL"));
720        assert!(!LICENSE_FILE_NAME_REGEX.is_match(b"LICENSEABC"));
721    }
722
723    #[test]
724    fn test_canonicalize_license_text() {
725        let input = "  Paragraph 1\nwith multiple lines\n\n\n\nParagraph 2\nwith more lines\n  ";
726        let expected = "paragraph 1 with multiple lines paragraph 2 with more lines";
727        assert_eq!(canonicalize_license_text(input), expected);
728
729        // Test tabs and mixed whitespace
730        let input = "Word1\t\tWord2\n\n   Word3\r\n\r\n\r\nWord4   ";
731        let expected = "word1 word2 word3 word4";
732        assert_eq!(canonicalize_license_text(input), expected);
733    }
734
    /// Installs a test `SettingsStore` as a global; called at the start of each watcher test.
    fn init_test(cx: &mut TestAppContext) {
        cx.update(|cx| {
            let settings_store = SettingsStore::test(cx);
            cx.set_global(settings_store);
        });
    }
741
    /// A worktree rooted at a single file produces the `SingleFile` watcher, which never reports
    /// the project as open source.
    #[gpui::test]
    async fn test_watcher_single_file(cx: &mut TestAppContext) {
        init_test(cx);

        let fs = FakeFs::new(cx.background_executor.clone());
        fs.insert_tree("/root", json!({ "main.rs": "fn main() {}" }))
            .await;

        // The worktree root is the file itself rather than its directory.
        let worktree = Worktree::local(
            Path::new("/root/main.rs"),
            true,
            fs.clone(),
            Default::default(),
            true,
            WorktreeId::from_proto(0),
            &mut cx.to_async(),
        )
        .await
        .unwrap();

        let watcher = cx.update(|cx| LicenseDetectionWatcher::new(&worktree, cx));
        assert!(matches!(watcher, LicenseDetectionWatcher::SingleFile));
        assert!(!watcher.is_project_open_source());
    }
766
    /// The watcher follows license files as they are written and overwritten: the project counts
    /// as open source while at least one license file holds a recognized license.
    #[gpui::test]
    async fn test_watcher_updates_on_changes(cx: &mut TestAppContext) {
        init_test(cx);

        let fs = FakeFs::new(cx.background_executor.clone());
        fs.insert_tree("/root", json!({ "main.rs": "fn main() {}" }))
            .await;

        let worktree = Worktree::local(
            Path::new("/root"),
            true,
            fs.clone(),
            Default::default(),
            true,
            WorktreeId::from_proto(0),
            &mut cx.to_async(),
        )
        .await
        .unwrap();

        // No license file yet, so the project is not considered open source.
        let watcher = cx.update(|cx| LicenseDetectionWatcher::new(&worktree, cx));
        assert!(matches!(watcher, LicenseDetectionWatcher::Local { .. }));
        assert!(!watcher.is_project_open_source());

        // Adding a recognized license flips the answer.
        fs.write(Path::new("/root/LICENSE-MIT"), MIT_TXT.as_bytes())
            .await
            .unwrap();

        cx.background_executor.run_until_parked();
        assert!(watcher.is_project_open_source());

        // A second recognized license keeps it open source.
        fs.write(Path::new("/root/LICENSE-APACHE"), APACHE_2_0_TXT.as_bytes())
            .await
            .unwrap();

        cx.background_executor.run_until_parked();
        assert!(watcher.is_project_open_source());

        fs.write(Path::new("/root/LICENSE-MIT"), "Nevermind".as_bytes())
            .await
            .unwrap();

        // Still considered open source as LICENSE-APACHE is present
        cx.background_executor.run_until_parked();
        assert!(watcher.is_project_open_source());

        // Once the last recognized license is overwritten, the answer flips back.
        fs.write(
            Path::new("/root/LICENSE-APACHE"),
            "Also nevermind".as_bytes(),
        )
        .await
        .unwrap();

        cx.background_executor.run_until_parked();
        assert!(!watcher.is_project_open_source());
    }
823
    /// A license file that exists when the watcher is created is picked up, and deleting it makes
    /// the project stop counting as open source.
    #[gpui::test]
    async fn test_watcher_initially_opensource_and_then_deleted(cx: &mut TestAppContext) {
        init_test(cx);

        let fs = FakeFs::new(cx.background_executor.clone());
        fs.insert_tree(
            "/root",
            json!({ "main.rs": "fn main() {}", "LICENSE-MIT": MIT_TXT }),
        )
        .await;

        let worktree = Worktree::local(
            Path::new("/root"),
            true,
            fs.clone(),
            Default::default(),
            true,
            WorktreeId::from_proto(0),
            &mut cx.to_async(),
        )
        .await
        .unwrap();

        let watcher = cx.update(|cx| LicenseDetectionWatcher::new(&worktree, cx));
        assert!(matches!(watcher, LicenseDetectionWatcher::Local { .. }));

        // The pre-existing license is only reported after the background task has run.
        cx.background_executor.run_until_parked();
        assert!(watcher.is_project_open_source());

        fs.remove_file(
            Path::new("/root/LICENSE-MIT"),
            fs::RemoveOptions {
                recursive: false,
                ignore_if_not_exists: false,
            },
        )
        .await
        .unwrap();

        // With the only license file gone, the project is no longer open source.
        cx.background_executor.run_until_parked();
        assert!(!watcher.is_project_open_source());
    }
866}