invisibles.rs

  1use std::sync::LazyLock;
  2
  3use collections::HashMap;
  4
  5// Invisibility in a Unicode context is not well defined, so we have to guess.
  6//
// We highlight ASCII control codes (except tab, line feed and carriage return), and Unicode
// whitespace because it is likely to be confused with a normal space (U+0020).
  9//
 10// We also highlight the handful of blank non-space characters:
 11//   U+2800 BRAILLE PATTERN BLANK - Category: So
 12//   U+115F HANGUL CHOSEONG FILLER - Category: Lo
//   U+1160 HANGUL JUNGSEONG FILLER - Category: Lo
 14//   U+3164 HANGUL FILLER - Category: Lo
 15//   U+FFA0 HALFWIDTH HANGUL FILLER - Category: Lo
 16//   U+FFFC OBJECT REPLACEMENT CHARACTER - Category: So
 17//
 18// For the rest of Unicode, invisibility happens for two reasons:
 19// * A Format character (like a byte order mark or right-to-left override)
 20// * An invisible Nonspacing Mark character (like U+034F, or variation selectors)
 21//
 22// We don't consider unassigned codepoints invisible as the font renderer already shows
 23// a replacement character in that case (and there are a *lot* of them)
 24//
// Format characters are mostly fine to highlight; except:
 26// * U+E0020..=U+E007F are used in emoji flags. We don't highlight them right now, but we could if we tightened our heuristics.
 27// * U+200D is used to join characters. We highlight this but don't replace it. As our font system ignores mid-glyph highlights this mostly works to highlight unexpected uses.
 28//
// Nonspacing marks are handled like U+200D. This means that mid-glyph we ignore them, but this
// probably causes issues with end-of-glyph usage.
 31//
 32// ref: https://invisible-characters.com
 33// ref: https://www.compart.com/en/unicode/category/Cf
 34// ref: https://gist.github.com/ConradIrwin/f759e1fc29267143c4c7895aa495dca5?h=1
 35// ref: https://unicode.org/Public/emoji/13.0/emoji-test.txt
// ref: https://github.com/bits/UTF-8-Unicode-Test-Documents/blob/master/UTF-8_sequence_separated/utf8_sequence_0-0x10ffff_assigned_including-unprintable-asis.txt
 37pub fn is_invisible(c: char) -> bool {
 38    if c <= '\u{1f}' {
 39        c != '\t' && c != '\n' && c != '\r'
 40    } else if c >= '\u{7f}' {
 41        c <= '\u{9f}' || c.is_whitespace() || contains(c, &FORMAT) || contains(c, &OTHER)
 42    } else {
 43        false
 44    }
 45}
 46
 47pub(crate) fn replacement(c: char) -> Option<&'static str> {
 48    if !is_invisible(c) {
 49        return None;
 50    }
 51    if c <= '\x7f' {
 52        REPLACEMENTS.get(&c).copied()
 53    } else if contains(c, &PRESERVE) {
 54        None
 55    } else {
 56        Some(" ")
 57    }
 58}
 59
 60const REPLACEMENTS: LazyLock<HashMap<char, &'static str>> = LazyLock::new(|| {
 61    [
 62        ('\x00', ""),
 63        ('\x01', ""),
 64        ('\x02', ""),
 65        ('\x03', ""),
 66        ('\x04', ""),
 67        ('\x05', ""),
 68        ('\x06', ""),
 69        ('\x07', ""),
 70        ('\x08', ""),
 71        ('\x0B', ""),
 72        ('\x0C', ""),
 73        ('\x0D', ""),
 74        ('\x0E', ""),
 75        ('\x0F', ""),
 76        ('\x10', ""),
 77        ('\x11', ""),
 78        ('\x12', ""),
 79        ('\x13', ""),
 80        ('\x14', ""),
 81        ('\x15', ""),
 82        ('\x16', ""),
 83        ('\x17', ""),
 84        ('\x18', ""),
 85        ('\x19', ""),
 86        ('\x1A', ""),
 87        ('\x1B', ""),
 88        ('\x1C', ""),
 89        ('\x1D', ""),
 90        ('\x1E', ""),
 91        ('\x1F', ""),
 92        ('\u{007F}', ""),
 93    ]
 94    .into_iter()
 95    .collect()
 96});
 97
 98// generated using ucd-generate: ucd-generate general-category --include Format --chars ucd-16.0.0
 99pub const FORMAT: &'static [(char, char)] = &[
100    ('\u{ad}', '\u{ad}'),
101    ('\u{600}', '\u{605}'),
102    ('\u{61c}', '\u{61c}'),
103    ('\u{6dd}', '\u{6dd}'),
104    ('\u{70f}', '\u{70f}'),
105    ('\u{890}', '\u{891}'),
106    ('\u{8e2}', '\u{8e2}'),
107    ('\u{180e}', '\u{180e}'),
108    ('\u{200b}', '\u{200f}'),
109    ('\u{202a}', '\u{202e}'),
110    ('\u{2060}', '\u{2064}'),
111    ('\u{2066}', '\u{206f}'),
112    ('\u{feff}', '\u{feff}'),
113    ('\u{fff9}', '\u{fffb}'),
114    ('\u{110bd}', '\u{110bd}'),
115    ('\u{110cd}', '\u{110cd}'),
116    ('\u{13430}', '\u{1343f}'),
117    ('\u{1bca0}', '\u{1bca3}'),
118    ('\u{1d173}', '\u{1d17a}'),
119    ('\u{e0001}', '\u{e0001}'),
120    ('\u{e0020}', '\u{e007f}'),
121];
122
/// Inclusive `(first, last)` code point ranges of invisible characters that are not in the
/// `Format` category, listed in ascending order.
///
/// Hand-made, based on https://invisible-characters.com (excluding Cf).
///
/// NOTE(review): the variation-selector range stops at U+FE0D, so U+FE0E and U+FE0F are not
/// covered; presumably deliberate, but confirm.
pub const OTHER: &[(char, char)] = &[
    ('\u{034f}', '\u{034f}'),
    ('\u{115f}', '\u{1160}'),
    ('\u{17b4}', '\u{17b5}'),
    ('\u{180b}', '\u{180d}'),
    ('\u{2800}', '\u{2800}'),
    ('\u{3164}', '\u{3164}'),
    ('\u{fe00}', '\u{fe0d}'),
    ('\u{ffa0}', '\u{ffa0}'),
    ('\u{fffc}', '\u{fffc}'),
    ('\u{e0100}', '\u{e01ef}'),
];
136
/// A subset of FORMAT/OTHER that may appear within glyphs.
///
/// Kept in ascending code point order with no overlaps, like `FORMAT` and `OTHER`, so that a
/// lookup which relies on sorted ranges finds every entry. (U+200D was previously listed
/// ahead of U+17B4 and U+180B, which made those two ranges unreachable for such a lookup.)
const PRESERVE: &[(char, char)] = &[
    ('\u{034f}', '\u{034f}'),
    ('\u{17b4}', '\u{17b5}'),
    ('\u{180b}', '\u{180d}'),
    ('\u{200d}', '\u{200d}'),
    ('\u{e0061}', '\u{e007a}'),
    ('\u{e007f}', '\u{e007f}'),
];
146
/// Reports whether `c` falls inside any of the inclusive `(first, last)` ranges in `list`.
///
/// The result does not depend on the order of the ranges: every entry is checked, so a table
/// that is accidentally out of order cannot produce a false negative. An empty `list` yields
/// `false`.
fn contains(c: char, list: &[(char, char)]) -> bool {
    list.iter().any(|&(first, last)| (first..=last).contains(&c))
}