invisibles.rs

  1// Invisibility in a Unicode context is not well defined, so we have to guess.
  2//
  3// We highlight all ASCII control codes, and unicode whitespace because they are likely
  4// confused with an ASCII space in a programming context (U+0020).
  5//
  6// We also highlight the handful of blank non-space characters:
  7//   U+2800 BRAILLE PATTERN BLANK - Category: So
  8//   U+115F HANGUL CHOSEONG FILLER - Category: Lo
//   U+1160 HANGUL JUNGSEONG FILLER - Category: Lo
 10//   U+3164 HANGUL FILLER - Category: Lo
 11//   U+FFA0 HALFWIDTH HANGUL FILLER - Category: Lo
 12//   U+FFFC OBJECT REPLACEMENT CHARACTER - Category: So
 13//
 14// For the rest of Unicode, invisibility happens for two reasons:
 15// * A Format character (like a byte order mark or right-to-left override)
 16// * An invisible Nonspacing Mark character (like U+034F, or variation selectors)
 17//
 18// We don't consider unassigned codepoints invisible as the font renderer already shows
 19// a replacement character in that case (and there are a *lot* of them)
 20//
 21// Control characters are mostly fine to highlight; except:
 22// * U+E0020..=U+E007F are used in emoji flags. We don't highlight them right now, but we could if we tightened our heuristics.
 23// * U+200D is used to join characters. We highlight this but don't replace it. As our font system ignores mid-glyph highlights this mostly works to highlight unexpected uses.
 24//
 25// Nonspacing marks are handled like U+200D. This means that mid-glyph we ignore them, but
 26// probably causes issues with end-of-glyph usage.
 27//
 28// ref: https://invisible-characters.com
 29// ref: https://www.compart.com/en/unicode/category/Cf
 30// ref: https://gist.github.com/ConradIrwin/f759e1fc29267143c4c7895aa495dca5?h=1
 31// ref: https://unicode.org/Public/emoji/13.0/emoji-test.txt
// ref: https://github.com/bits/UTF-8-Unicode-Test-Documents/blob/master/UTF-8_sequence_separated/utf8_sequence_0-0x10ffff_assigned_including-unprintable-asis.txt
 33#[ztracing::instrument(skip_all)]
 34pub fn is_invisible(c: char) -> bool {
 35    if c <= '\u{1f}' {
 36        c != '\t' && c != '\n' && c != '\r'
 37    } else if c >= '\u{7f}' {
 38        c <= '\u{9f}'
 39            || (c.is_whitespace() && c != IDEOGRAPHIC_SPACE)
 40            || contains(c, FORMAT)
 41            || contains(c, OTHER)
 42    } else {
 43        false
 44    }
 45}
 46// ASCII control characters have fancy unicode glyphs, everything else
 47// is replaced by a space - unless it is used in combining characters in
 48// which case we need to leave it in the string.
 49pub fn replacement(c: char) -> Option<&'static str> {
 50    if c <= '\x1f' {
 51        Some(C0_SYMBOLS[c as usize])
 52    } else if c == '\x7f' {
 53        Some(DEL)
 54    } else if contains(c, PRESERVE) {
 55        None
 56    } else {
 57        Some(FIXED_WIDTH_SPACE)
 58    }
 59}
 60
 61const FIXED_WIDTH_SPACE: &str = "\u{2007}";
 62
 63// IDEOGRAPHIC SPACE is common alongside Chinese and other wide character sets.
 64// We don't highlight this for now (as it already shows up wide in the editor),
 65// but could if we tracked state in the classifier.
 66const IDEOGRAPHIC_SPACE: char = '\u{3000}';
 67
 68const C0_SYMBOLS: &[&str] = &[
 69    "␀", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "␉", "␊", "␋", "␌", "␍", "␎", "␏", "␐", "␑", "␒",
 70    "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "␛", "␜", "␝", "␞", "␟",
 71];
 72const DEL: &str = "␡";
 73
 74// generated using ucd-generate: ucd-generate general-category --include Format --chars ucd-16.0.0
 75pub const FORMAT: &[(char, char)] = &[
 76    ('\u{ad}', '\u{ad}'),
 77    ('\u{600}', '\u{605}'),
 78    ('\u{61c}', '\u{61c}'),
 79    ('\u{6dd}', '\u{6dd}'),
 80    ('\u{70f}', '\u{70f}'),
 81    ('\u{890}', '\u{891}'),
 82    ('\u{8e2}', '\u{8e2}'),
 83    ('\u{180e}', '\u{180e}'),
 84    ('\u{200b}', '\u{200f}'),
 85    ('\u{202a}', '\u{202e}'),
 86    ('\u{2060}', '\u{2064}'),
 87    ('\u{2066}', '\u{206f}'),
 88    ('\u{feff}', '\u{feff}'),
 89    ('\u{fff9}', '\u{fffb}'),
 90    ('\u{110bd}', '\u{110bd}'),
 91    ('\u{110cd}', '\u{110cd}'),
 92    ('\u{13430}', '\u{1343f}'),
 93    ('\u{1bca0}', '\u{1bca3}'),
 94    ('\u{1d173}', '\u{1d17a}'),
 95    ('\u{e0001}', '\u{e0001}'),
 96    ('\u{e0020}', '\u{e007f}'),
 97];
 98
 99// hand-made base on https://invisible-characters.com (Excluding Cf)
100pub const OTHER: &[(char, char)] = &[
101    ('\u{034f}', '\u{034f}'),
102    ('\u{115F}', '\u{1160}'),
103    ('\u{17b4}', '\u{17b5}'),
104    ('\u{180b}', '\u{180d}'),
105    ('\u{2800}', '\u{2800}'),
106    ('\u{3164}', '\u{3164}'),
107    ('\u{fe00}', '\u{fe0d}'),
108    ('\u{ffa0}', '\u{ffa0}'),
109    ('\u{fffc}', '\u{fffc}'),
110    ('\u{e0100}', '\u{e01ef}'),
111];
112
/// The subset of `FORMAT`/`OTHER` that may appear within glyphs (joiners, variation
/// selectors, emoji tag sequences). These characters are left in the string instead of
/// being replaced.
///
/// The ranges MUST stay in ascending order: `contains` returns `false` as soon as it
/// reaches a range that starts after the character it is looking for. With U+200D listed
/// before U+17B4 and U+180B, those later ranges were unreachable and the characters in
/// them were never treated as preserved.
const PRESERVE: &[(char, char)] = &[
    ('\u{034f}', '\u{034f}'),
    ('\u{17b4}', '\u{17b5}'),
    ('\u{180b}', '\u{180d}'),
    ('\u{200d}', '\u{200d}'),
    ('\u{e0061}', '\u{e007a}'),
    ('\u{e007f}', '\u{e007f}'),
];
122
/// Reports whether `c` lies inside one of the inclusive `(first, last)` ranges in `list`.
///
/// `list` is expected to be sorted in ascending order: the scan stops and answers `false`
/// at the first range that starts after `c`, so a matching range placed after such a
/// range is never reached.
fn contains(c: char, list: &[(char, char)]) -> bool {
    list.iter()
        .find_map(|&(first, last)| {
            if c < first {
                // Every later range of a sorted list starts even further away.
                Some(false)
            } else if c <= last {
                Some(true)
            } else {
                // `c` is past this range; keep scanning.
                None
            }
        })
        .unwrap_or(false)
}