invisibles.rs

  1// Invisibility in a Unicode context is not well defined, so we have to guess.
  2//
  3// We highlight all ASCII control codes, and unicode whitespace because they are likely
  4// confused with an ASCII space in a programming context (U+0020).
  5//
  6// We also highlight the handful of blank non-space characters:
  7//   U+2800 BRAILLE PATTERN BLANK - Category: So
  8//   U+115F HANGUL CHOSEONG FILLER - Category: Lo
//   U+1160 HANGUL JUNGSEONG FILLER - Category: Lo
 10//   U+3164 HANGUL FILLER - Category: Lo
 11//   U+FFA0 HALFWIDTH HANGUL FILLER - Category: Lo
 12//   U+FFFC OBJECT REPLACEMENT CHARACTER - Category: So
 13//
 14// For the rest of Unicode, invisibility happens for two reasons:
 15// * A Format character (like a byte order mark or right-to-left override)
 16// * An invisible Nonspacing Mark character (like U+034F, or variation selectors)
 17//
 18// We don't consider unassigned codepoints invisible as the font renderer already shows
 19// a replacement character in that case (and there are a *lot* of them)
 20//
// Format characters are mostly fine to highlight, except:
 22// * U+E0020..=U+E007F are used in emoji flags. We don't highlight them right now, but we could if we tightened our heuristics.
 23// * U+200D is used to join characters. We highlight this but don't replace it. As our font system ignores mid-glyph highlights this mostly works to highlight unexpected uses.
 24//
 25// Nonspacing marks are handled like U+200D. This means that mid-glyph we ignore them, but
 26// probably causes issues with end-of-glyph usage.
 27//
 28// ref: https://invisible-characters.com
 29// ref: https://www.compart.com/en/unicode/category/Cf
 30// ref: https://gist.github.com/ConradIrwin/f759e1fc29267143c4c7895aa495dca5?h=1
 31// ref: https://unicode.org/Public/emoji/13.0/emoji-test.txt
 32// https://github.com/bits/UTF-8-Unicode-Test-Documents/blob/master/UTF-8_sequence_separated/utf8_sequence_0-0x10ffff_assigned_including-unprintable-asis.txt
 33pub fn is_invisible(c: char) -> bool {
 34    if c <= '\u{1f}' {
 35        c != '\t' && c != '\n' && c != '\r'
 36    } else if c >= '\u{7f}' {
 37        c <= '\u{9f}'
 38            || (c.is_whitespace() && c != IDEOGRAPHIC_SPACE)
 39            || contains(c, FORMAT)
 40            || contains(c, OTHER)
 41    } else {
 42        false
 43    }
 44}
 45// ASCII control characters have fancy unicode glyphs, everything else
 46// is replaced by a space - unless it is used in combining characters in
 47// which case we need to leave it in the string.
 48pub fn replacement(c: char) -> Option<&'static str> {
 49    if c <= '\x1f' {
 50        Some(C0_SYMBOLS[c as usize])
 51    } else if c == '\x7f' {
 52        Some(DEL)
 53    } else if contains(c, PRESERVE) {
 54        None
 55    } else {
 56        Some(FIXED_WIDTH_SPACE)
 57    }
 58}
 59
// U+2007 FIGURE SPACE. `replacement` returns this for every character that
// has no dedicated symbol and is not listed in PRESERVE.
const FIXED_WIDTH_SPACE: &str = "\u{2007}";

// IDEOGRAPHIC SPACE is common alongside Chinese and other wide character sets.
// We don't highlight this for now (as it already shows up wide in the editor),
// but could if we tracked state in the classifier.
// `is_invisible` excludes it explicitly from its whitespace check.
const IDEOGRAPHIC_SPACE: char = '\u{3000}';
 66
 67const C0_SYMBOLS: &[&str] = &[
 68    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
 69    "", "", "", "", "", "", "", "", "", "", "", "", "",
 70];
 71const DEL: &str = "";
 72
// generated using ucd-generate: ucd-generate general-category --include Format --chars ucd-16.0.0
//
// Each entry is an inclusive (start, end) code point range. The entries are
// in ascending order, which `contains` relies on for its early exit.
pub const FORMAT: &[(char, char)] = &[
    ('\u{ad}', '\u{ad}'),
    ('\u{600}', '\u{605}'),
    ('\u{61c}', '\u{61c}'),
    ('\u{6dd}', '\u{6dd}'),
    ('\u{70f}', '\u{70f}'),
    ('\u{890}', '\u{891}'),
    ('\u{8e2}', '\u{8e2}'),
    ('\u{180e}', '\u{180e}'),
    ('\u{200b}', '\u{200f}'),
    ('\u{202a}', '\u{202e}'),
    ('\u{2060}', '\u{2064}'),
    ('\u{2066}', '\u{206f}'),
    ('\u{feff}', '\u{feff}'),
    ('\u{fff9}', '\u{fffb}'),
    ('\u{110bd}', '\u{110bd}'),
    ('\u{110cd}', '\u{110cd}'),
    ('\u{13430}', '\u{1343f}'),
    ('\u{1bca0}', '\u{1bca3}'),
    ('\u{1d173}', '\u{1d17a}'),
    ('\u{e0001}', '\u{e0001}'),
    ('\u{e0020}', '\u{e007f}'),
];
 97
// hand-made, based on https://invisible-characters.com (excluding Cf)
//
// Each entry is an inclusive (start, end) code point range. The entries are
// in ascending order, which `contains` relies on for its early exit.
pub const OTHER: &[(char, char)] = &[
    ('\u{034f}', '\u{034f}'),
    ('\u{115F}', '\u{1160}'),
    ('\u{17b4}', '\u{17b5}'),
    ('\u{180b}', '\u{180d}'),
    ('\u{2800}', '\u{2800}'),
    ('\u{3164}', '\u{3164}'),
    ('\u{fe00}', '\u{fe0d}'),
    ('\u{ffa0}', '\u{ffa0}'),
    ('\u{fffc}', '\u{fffc}'),
    ('\u{e0100}', '\u{e01ef}'),
];
111
// a subset of FORMAT/OTHER that may appear within glyphs
//
// Each entry is an inclusive (start, end) code point range. The entries MUST
// stay in ascending order: `contains` stops at the first range that starts
// past the character it is looking for, so an out-of-order entry is never
// matched. (U+200D used to be listed before U+17B4, which hid
// U+17B4..=U+17B5 and U+180B..=U+180D from the lookup.)
const PRESERVE: &[(char, char)] = &[
    ('\u{034f}', '\u{034f}'),
    ('\u{17b4}', '\u{17b5}'),
    ('\u{180b}', '\u{180d}'),
    ('\u{200d}', '\u{200d}'),
    ('\u{e0061}', '\u{e007a}'),
    ('\u{e007f}', '\u{e007f}'),
];
121
// Reports whether `c` lies inside one of the inclusive (start, end) ranges
// of `list`.
//
// `list` is expected to be sorted in ascending order. The scan stops at the
// first range that either starts past `c` (no match) or ends at or after `c`
// (match), so ranges placed after a larger one are never examined.
fn contains(c: char, list: &[(char, char)]) -> bool {
    let deciding_range = list.iter().find(|&&(start, end)| c < start || c <= end);
    match deciding_range {
        // The range was picked either because it begins past `c` or because
        // it reaches `c`; only the latter case is a hit.
        Some(&(start, _)) => start <= c,
        None => false,
    }
}
131    false
132}