1// Invisibility in a Unicode context is not well defined, so we have to guess.
2//
3// We highlight all ASCII control codes, and unicode whitespace because they are likely
4// confused with an ASCII space in a programming context (U+0020).
5//
6// We also highlight the handful of blank non-space characters:
7// U+2800 BRAILLE PATTERN BLANK - Category: So
8// U+115F HANGUL CHOSEONG FILLER - Category: Lo
// U+1160 HANGUL JUNGSEONG FILLER - Category: Lo
10// U+3164 HANGUL FILLER - Category: Lo
11// U+FFA0 HALFWIDTH HANGUL FILLER - Category: Lo
12// U+FFFC OBJECT REPLACEMENT CHARACTER - Category: So
13//
14// For the rest of Unicode, invisibility happens for two reasons:
15// * A Format character (like a byte order mark or right-to-left override)
16// * An invisible Nonspacing Mark character (like U+034F, or variation selectors)
17//
18// We don't consider unassigned codepoints invisible as the font renderer already shows
19// a replacement character in that case (and there are a *lot* of them)
20//
21// Control characters are mostly fine to highlight; except:
22// * U+E0020..=U+E007F are used in emoji flags. We don't highlight them right now, but we could if we tightened our heuristics.
23// * U+200D is used to join characters. We highlight this but don't replace it. As our font system ignores mid-glyph highlights this mostly works to highlight unexpected uses.
24//
25// Nonspacing marks are handled like U+200D. This means that mid-glyph we ignore them, but
26// probably causes issues with end-of-glyph usage.
27//
28// ref: https://invisible-characters.com
29// ref: https://www.compart.com/en/unicode/category/Cf
30// ref: https://gist.github.com/ConradIrwin/f759e1fc29267143c4c7895aa495dca5?h=1
31// ref: https://unicode.org/Public/emoji/13.0/emoji-test.txt
32// https://github.com/bits/UTF-8-Unicode-Test-Documents/blob/master/UTF-8_sequence_separated/utf8_sequence_0-0x10ffff_assigned_including-unprintable-asis.txt
33pub fn is_invisible(c: char) -> bool {
34 if c <= '\u{1f}' {
35 c != '\t' && c != '\n' && c != '\r'
36 } else if c >= '\u{7f}' {
37 c <= '\u{9f}'
38 || (c.is_whitespace() && c != IDEOGRAPHIC_SPACE)
39 || contains(c, FORMAT)
40 || contains(c, OTHER)
41 } else {
42 false
43 }
44}
45// ASCII control characters have fancy unicode glyphs, everything else
46// is replaced by a space - unless it is used in combining characters in
47// which case we need to leave it in the string.
48pub fn replacement(c: char) -> Option<&'static str> {
49 if c <= '\x1f' {
50 Some(C0_SYMBOLS[c as usize])
51 } else if c == '\x7f' {
52 Some(DEL)
53 } else if contains(c, PRESERVE) {
54 None
55 } else {
56 Some(FIXED_WIDTH_SPACE)
57 }
58}
59
/// U+2007 FIGURE SPACE: the stand-in drawn for invisible characters that have
/// no dedicated symbol.
const FIXED_WIDTH_SPACE: &str = "\u{2007}";

/// U+3000 IDEOGRAPHIC SPACE is common alongside Chinese and other wide
/// character sets. It is not highlighted for now (it already shows up wide in
/// the editor), but could be if state were tracked in the classifier.
const IDEOGRAPHIC_SPACE: char = '\u{3000}';

/// Symbols for the 32 C0 control codes; the entry at index `n` is the symbol
/// for U+0000 + `n`.
const C0_SYMBOLS: &[&str] = &[
    // 0x00..=0x07
    "␀", "␁", "␂", "␃", "␄", "␅", "␆", "␇",
    // 0x08..=0x0f
    "␈", "␉", "␊", "␋", "␌", "␍", "␎", "␏",
    // 0x10..=0x17
    "␐", "␑", "␒", "␓", "␔", "␕", "␖", "␗",
    // 0x18..=0x1f
    "␘", "␙", "␚", "␛", "␜", "␝", "␞", "␟",
];

/// Symbol for U+007F DELETE.
const DEL: &str = "␡";
72
/// Inclusive code point ranges of the Unicode `Format` (Cf) general category,
/// sorted by ascending start.
///
/// Generated using ucd-generate:
/// `ucd-generate general-category --include Format --chars ucd-16.0.0`
/// Regenerate rather than editing the values by hand; the trailing notes are
/// annotations only.
pub const FORMAT: &[(char, char)] = &[
    ('\u{ad}', '\u{ad}'),       // soft hyphen
    ('\u{600}', '\u{605}'),     // Arabic number signs
    ('\u{61c}', '\u{61c}'),     // Arabic letter mark
    ('\u{6dd}', '\u{6dd}'),     // Arabic end of ayah
    ('\u{70f}', '\u{70f}'),     // Syriac abbreviation mark
    ('\u{890}', '\u{891}'),     // Arabic pound / piastre mark above
    ('\u{8e2}', '\u{8e2}'),     // Arabic disputed end of ayah
    ('\u{180e}', '\u{180e}'),   // Mongolian vowel separator
    ('\u{200b}', '\u{200f}'),   // zero width space / joiners, LRM, RLM
    ('\u{202a}', '\u{202e}'),   // bidi embeddings and overrides
    ('\u{2060}', '\u{2064}'),   // word joiner, invisible math operators
    ('\u{2066}', '\u{206f}'),   // bidi isolates, deprecated format controls
    ('\u{feff}', '\u{feff}'),   // zero width no-break space (byte order mark)
    ('\u{fff9}', '\u{fffb}'),   // interlinear annotation controls
    ('\u{110bd}', '\u{110bd}'), // Kaithi number sign
    ('\u{110cd}', '\u{110cd}'), // Kaithi number sign above
    ('\u{13430}', '\u{1343f}'), // Egyptian hieroglyph format controls
    ('\u{1bca0}', '\u{1bca3}'), // shorthand format controls
    ('\u{1d173}', '\u{1d17a}'), // musical symbol beam / tie / slur / phrase
    ('\u{e0001}', '\u{e0001}'), // language tag
    ('\u{e0020}', '\u{e007f}'), // tag characters
];
97
/// Inclusive code point ranges of invisible characters outside the `Format`
/// (Cf) category, sorted by ascending start.
///
/// Hand-made, based on https://invisible-characters.com (excluding Cf).
// NOTE(review): U+FE0E and U+FE0F (VARIATION SELECTOR-15/-16) are not covered
// by the U+FE00 range below — presumably intentional; confirm before widening.
pub const OTHER: &[(char, char)] = &[
    ('\u{034f}', '\u{034f}'),   // combining grapheme joiner
    ('\u{115f}', '\u{1160}'),   // Hangul choseong / jungseong filler
    ('\u{17b4}', '\u{17b5}'),   // Khmer inherent vowels
    ('\u{180b}', '\u{180d}'),   // Mongolian free variation selectors 1-3
    ('\u{2800}', '\u{2800}'),   // braille pattern blank
    ('\u{3164}', '\u{3164}'),   // Hangul filler
    ('\u{fe00}', '\u{fe0d}'),   // variation selectors 1-14
    ('\u{ffa0}', '\u{ffa0}'),   // halfwidth Hangul filler
    ('\u{fffc}', '\u{fffc}'),   // object replacement character
    ('\u{e0100}', '\u{e01ef}'), // variation selectors 17-256
];
111
/// A subset of FORMAT/OTHER that may appear within glyphs; `replacement`
/// returns `None` for these so they are left in the string.
///
/// The ranges MUST stay sorted by ascending start: `contains` stops scanning
/// at the first range that starts above the character it is looking for.
/// Listing U+200D ahead of U+17B4 and U+180B (as this table previously did)
/// made the Khmer and Mongolian entries unreachable.
const PRESERVE: &[(char, char)] = &[
    ('\u{034f}', '\u{034f}'),   // combining grapheme joiner
    ('\u{17b4}', '\u{17b5}'),   // Khmer inherent vowels
    ('\u{180b}', '\u{180d}'),   // Mongolian free variation selectors 1-3
    ('\u{200d}', '\u{200d}'),   // zero width joiner
    ('\u{e0061}', '\u{e007a}'), // tag small letters a-z (emoji flag sequences)
    ('\u{e007f}', '\u{e007f}'), // cancel tag
];
121
/// Returns `true` when `c` lies inside one of the inclusive `(start, end)`
/// ranges in `list`.
///
/// Precondition: `list` is sorted by ascending `start`. The scan settles on
/// the first range that either starts above `c` or ends at/after `c`, and
/// never looks at later ranges — so an out-of-order entry can be missed.
fn contains(c: char, list: &[(char, char)]) -> bool {
    list.iter()
        // First range that decides the answer one way or the other.
        .find(|&&(start, end)| c < start || c <= end)
        // It is a hit only if `c` is not below that range's start.
        .map_or(false, |&(start, _)| start <= c)
}