1// Invisibility in a Unicode context is not well defined, so we have to guess.
2//
3// We highlight all ASCII control codes, and unicode whitespace because they are likely
4// confused with an ASCII space in a programming context (U+0020).
5//
6// We also highlight the handful of blank non-space characters:
7// U+2800 BRAILLE PATTERN BLANK - Category: So
8// U+115F HANGUL CHOSEONG FILLER - Category: Lo
// U+1160 HANGUL JUNGSEONG FILLER - Category: Lo
10// U+3164 HANGUL FILLER - Category: Lo
11// U+FFA0 HALFWIDTH HANGUL FILLER - Category: Lo
12// U+FFFC OBJECT REPLACEMENT CHARACTER - Category: So
13//
14// For the rest of Unicode, invisibility happens for two reasons:
15// * A Format character (like a byte order mark or right-to-left override)
16// * An invisible Nonspacing Mark character (like U+034F, or variation selectors)
17//
18// We don't consider unassigned codepoints invisible as the font renderer already shows
19// a replacement character in that case (and there are a *lot* of them)
20//
// Format characters are mostly fine to highlight; except:
22// * U+E0020..=U+E007F are used in emoji flags. We don't highlight them right now, but we could if we tightened our heuristics.
23// * U+200D is used to join characters. We highlight this but don't replace it. As our font system ignores mid-glyph highlights this mostly works to highlight unexpected uses.
24//
25// Nonspacing marks are handled like U+200D. This means that mid-glyph we ignore them, but
26// probably causes issues with end-of-glyph usage.
27//
28// ref: https://invisible-characters.com
29// ref: https://www.compart.com/en/unicode/category/Cf
30// ref: https://gist.github.com/ConradIrwin/f759e1fc29267143c4c7895aa495dca5?h=1
31// ref: https://unicode.org/Public/emoji/13.0/emoji-test.txt
32// https://github.com/bits/UTF-8-Unicode-Test-Documents/blob/master/UTF-8_sequence_separated/utf8_sequence_0-0x10ffff_assigned_including-unprintable-asis.txt
33#[ztracing::instrument(skip_all)]
34pub fn is_invisible(c: char) -> bool {
35 if c <= '\u{1f}' {
36 c != '\t' && c != '\n' && c != '\r'
37 } else if c >= '\u{7f}' {
38 c <= '\u{9f}'
39 || (c.is_whitespace() && c != IDEOGRAPHIC_SPACE)
40 || contains(c, FORMAT)
41 || contains(c, OTHER)
42 } else {
43 false
44 }
45}
46// ASCII control characters have fancy unicode glyphs, everything else
47// is replaced by a space - unless it is used in combining characters in
48// which case we need to leave it in the string.
49pub fn replacement(c: char) -> Option<&'static str> {
50 if c <= '\x1f' {
51 Some(C0_SYMBOLS[c as usize])
52 } else if c == '\x7f' {
53 Some(DEL)
54 } else if contains(c, PRESERVE) {
55 None
56 } else {
57 Some(FIXED_WIDTH_SPACE)
58 }
59}
60
// Fallback returned by `replacement` for characters that have no dedicated
// replacement glyph and are not preserved (U+2007).
const FIXED_WIDTH_SPACE: &str = "\u{2007}";

// IDEOGRAPHIC SPACE is common alongside Chinese and other wide character sets.
// We don't highlight this for now (as it already shows up wide in the editor),
// but could if we tracked state in the classifier.
const IDEOGRAPHIC_SPACE: char = '\u{3000}';

// Replacement glyphs for U+0000..=U+001F. `replacement` indexes this table
// directly with the code point value, so it must keep exactly 32 entries in
// code point order.
const C0_SYMBOLS: &[&str] = &[
    "␀", "␁", "␂", "␃", "␄", "␅", "␆", "␇", "␈", "␉", "␊", "␋", "␌", "␍", "␎", "␏", "␐", "␑", "␒",
    "␓", "␔", "␕", "␖", "␗", "␘", "␙", "␚", "␛", "␜", "␝", "␞", "␟",
];
// Replacement glyph for U+007F.
const DEL: &str = "␡";
73
// generated using ucd-generate: ucd-generate general-category --include Format --chars ucd-16.0.0
//
// Inclusive (start, end) code point ranges. The entries must stay in ascending
// order: `contains` stops scanning at the first range whose start is greater
// than the character being looked up.
pub const FORMAT: &[(char, char)] = &[
    ('\u{ad}', '\u{ad}'),
    ('\u{600}', '\u{605}'),
    ('\u{61c}', '\u{61c}'),
    ('\u{6dd}', '\u{6dd}'),
    ('\u{70f}', '\u{70f}'),
    ('\u{890}', '\u{891}'),
    ('\u{8e2}', '\u{8e2}'),
    ('\u{180e}', '\u{180e}'),
    ('\u{200b}', '\u{200f}'),
    ('\u{202a}', '\u{202e}'),
    ('\u{2060}', '\u{2064}'),
    ('\u{2066}', '\u{206f}'),
    ('\u{feff}', '\u{feff}'),
    ('\u{fff9}', '\u{fffb}'),
    ('\u{110bd}', '\u{110bd}'),
    ('\u{110cd}', '\u{110cd}'),
    ('\u{13430}', '\u{1343f}'),
    ('\u{1bca0}', '\u{1bca3}'),
    ('\u{1d173}', '\u{1d17a}'),
    ('\u{e0001}', '\u{e0001}'),
    ('\u{e0020}', '\u{e007f}'),
];
98
// Hand-made, based on https://invisible-characters.com (excluding Cf, which is
// covered by FORMAT).
//
// Inclusive (start, end) code point ranges. The entries must stay in ascending
// order: `contains` stops scanning at the first range whose start is greater
// than the character being looked up.
pub const OTHER: &[(char, char)] = &[
    ('\u{034f}', '\u{034f}'),
    ('\u{115F}', '\u{1160}'),
    ('\u{17b4}', '\u{17b5}'),
    ('\u{180b}', '\u{180d}'),
    ('\u{2800}', '\u{2800}'),
    ('\u{3164}', '\u{3164}'),
    ('\u{fe00}', '\u{fe0d}'),
    ('\u{ffa0}', '\u{ffa0}'),
    ('\u{fffc}', '\u{fffc}'),
    ('\u{e0100}', '\u{e01ef}'),
];
112
// A subset of FORMAT/OTHER that may appear within glyphs; `replacement` returns
// `None` for these so they are left in the string.
//
// Inclusive (start, end) code point ranges, kept in ascending order. The order
// matters: `contains` gives up at the first range whose start is greater than
// the character being looked up. With U+200D listed ahead of U+17B4 and U+180B,
// those later ranges could never match.
const PRESERVE: &[(char, char)] = &[
    ('\u{034f}', '\u{034f}'),
    ('\u{17b4}', '\u{17b5}'),
    ('\u{180b}', '\u{180d}'),
    ('\u{200d}', '\u{200d}'),
    ('\u{e0061}', '\u{e007a}'),
    ('\u{e007f}', '\u{e007f}'),
];
122
// Reports whether `c` lies inside one of the inclusive (start, end) ranges in
// `list`.
//
// The scan stops at the first range that either begins after `c` or ends at or
// after `c`; `c` is contained only if that range also begins at or before `c`.
// Because of this early stop, `list` must be sorted in ascending order —
// ranges placed after a larger one are never examined for smaller characters.
fn contains(c: char, list: &[(char, char)]) -> bool {
    list.iter()
        .find(|&&(lo, hi)| c < lo || c <= hi)
        .map_or(false, |&(lo, _)| lo <= c)
}