1use std::sync::LazyLock;
2
3use collections::HashMap;
4
5// Invisibility in a Unicode context is not well defined, so we have to guess.
6//
7// We highlight all ASCII control codes, and unicode whitespace because they are likely
8// confused with a normal space (U+0020).
9//
10// We also highlight the handful of blank non-space characters:
11// U+2800 BRAILLE PATTERN BLANK - Category: So
12// U+115F HANGUL CHOSEONG FILLER - Category: Lo
// U+1160 HANGUL JUNGSEONG FILLER - Category: Lo
14// U+3164 HANGUL FILLER - Category: Lo
15// U+FFA0 HALFWIDTH HANGUL FILLER - Category: Lo
16// U+FFFC OBJECT REPLACEMENT CHARACTER - Category: So
17//
18// For the rest of Unicode, invisibility happens for two reasons:
19// * A Format character (like a byte order mark or right-to-left override)
20// * An invisible Nonspacing Mark character (like U+034F, or variation selectors)
21//
22// We don't consider unassigned codepoints invisible as the font renderer already shows
23// a replacement character in that case (and there are a *lot* of them)
24//
// Format characters are mostly fine to highlight, except:
26// * U+E0020..=U+E007F are used in emoji flags. We don't highlight them right now, but we could if we tightened our heuristics.
27// * U+200D is used to join characters. We highlight this but don't replace it. As our font system ignores mid-glyph highlights this mostly works to highlight unexpected uses.
28//
// Nonspacing marks are handled like U+200D. This means that we ignore them mid-glyph, but
// this probably causes issues with end-of-glyph usage.
31//
32// ref: https://invisible-characters.com
33// ref: https://www.compart.com/en/unicode/category/Cf
34// ref: https://gist.github.com/ConradIrwin/f759e1fc29267143c4c7895aa495dca5?h=1
35// ref: https://unicode.org/Public/emoji/13.0/emoji-test.txt
36// https://github.com/bits/UTF-8-Unicode-Test-Documents/blob/master/UTF-8_sequence_separated/utf8_sequence_0-0x10ffff_assigned_including-unprintable-asis.txt
37pub fn is_invisible(c: char) -> bool {
38 if c <= '\u{1f}' {
39 c != '\t' && c != '\n' && c != '\r'
40 } else if c >= '\u{7f}' {
41 c <= '\u{9f}' || c.is_whitespace() || contains(c, &FORMAT) || contains(c, &OTHER)
42 } else {
43 false
44 }
45}
46
47pub(crate) fn replacement(c: char) -> Option<&'static str> {
48 if !is_invisible(c) {
49 return None;
50 }
51 if c <= '\x7f' {
52 REPLACEMENTS.get(&c).copied()
53 } else if contains(c, &PRESERVE) {
54 None
55 } else {
56 Some(" ")
57 }
58}
59
/// Replacement glyphs for ASCII control codes, keyed by the control character.
///
/// Covers U+0000..=U+0008, U+000B..=U+001F and U+007F; tab (U+0009) and line
/// feed (U+000A) have no entry.
///
/// This is a `static` rather than a `const`: a `const` is instantiated anew at
/// every use site, so a `const` `LazyLock` would rebuild the whole map on each
/// lookup instead of initializing it once and sharing it.
static REPLACEMENTS: LazyLock<HashMap<char, &'static str>> = LazyLock::new(|| {
    [
        ('\x00', "␀"),
        ('\x01', "␁"),
        ('\x02', "␂"),
        ('\x03', "␃"),
        ('\x04', "␄"),
        ('\x05', "␅"),
        ('\x06', "␆"),
        ('\x07', "␇"),
        ('\x08', "␈"),
        ('\x0B', "␋"),
        ('\x0C', "␌"),
        ('\x0D', "␍"),
        ('\x0E', "␎"),
        ('\x0F', "␏"),
        ('\x10', "␐"),
        ('\x11', "␑"),
        ('\x12', "␒"),
        ('\x13', "␓"),
        ('\x14', "␔"),
        ('\x15', "␕"),
        ('\x16', "␖"),
        ('\x17', "␗"),
        ('\x18', "␘"),
        ('\x19', "␙"),
        ('\x1A', "␚"),
        ('\x1B', "␛"),
        ('\x1C', "␜"),
        ('\x1D', "␝"),
        ('\x1E', "␞"),
        ('\x1F', "␟"),
        ('\u{007F}', "␡"),
    ]
    .into_iter()
    .collect()
});
97
// generated using ucd-generate: ucd-generate general-category --include Format --chars ucd-16.0.0
/// Inclusive `(first, last)` code point ranges of the Unicode `Format` general
/// category, listed in ascending order.
pub const FORMAT: &[(char, char)] = &[
    ('\u{ad}', '\u{ad}'),
    ('\u{600}', '\u{605}'),
    ('\u{61c}', '\u{61c}'),
    ('\u{6dd}', '\u{6dd}'),
    ('\u{70f}', '\u{70f}'),
    ('\u{890}', '\u{891}'),
    ('\u{8e2}', '\u{8e2}'),
    ('\u{180e}', '\u{180e}'),
    ('\u{200b}', '\u{200f}'),
    ('\u{202a}', '\u{202e}'),
    ('\u{2060}', '\u{2064}'),
    ('\u{2066}', '\u{206f}'),
    ('\u{feff}', '\u{feff}'),
    ('\u{fff9}', '\u{fffb}'),
    ('\u{110bd}', '\u{110bd}'),
    ('\u{110cd}', '\u{110cd}'),
    ('\u{13430}', '\u{1343f}'),
    ('\u{1bca0}', '\u{1bca3}'),
    ('\u{1d173}', '\u{1d17a}'),
    ('\u{e0001}', '\u{e0001}'),
    ('\u{e0020}', '\u{e007f}'),
];
122
// hand-made, based on https://invisible-characters.com (excluding Cf)
/// Inclusive `(first, last)` code point ranges of additional characters that
/// are treated as invisible, listed in ascending order.
pub const OTHER: &[(char, char)] = &[
    ('\u{034f}', '\u{034f}'),
    ('\u{115F}', '\u{1160}'),
    ('\u{17b4}', '\u{17b5}'),
    ('\u{180b}', '\u{180d}'),
    ('\u{2800}', '\u{2800}'),
    ('\u{3164}', '\u{3164}'),
    ('\u{fe00}', '\u{fe0d}'),
    ('\u{ffa0}', '\u{ffa0}'),
    ('\u{fffc}', '\u{fffc}'),
    ('\u{e0100}', '\u{e01ef}'),
];
136
// a subset of FORMAT/OTHER that may appear within glyphs
/// Inclusive `(first, last)` code point ranges of invisible characters that
/// are kept in the text rather than substituted.
///
/// Entries are listed in ascending order, like the other range tables.
const PRESERVE: &[(char, char)] = &[
    ('\u{034f}', '\u{034f}'),
    ('\u{17b4}', '\u{17b5}'),
    ('\u{180b}', '\u{180d}'),
    ('\u{200d}', '\u{200d}'),
    ('\u{e0061}', '\u{e007a}'),
    ('\u{e007f}', '\u{e007f}'),
];
146
/// Returns `true` when `c` lies inside any of the inclusive `(start, end)`
/// ranges in `list`.
///
/// Every range is examined, so the result does not depend on the order of the
/// entries; an empty `list` yields `false`.
fn contains(c: char, list: &[(char, char)]) -> bool {
    list.iter().any(|&(start, end)| (start..=end).contains(&c))
}