lib_sanitizer_test.go

  1package htmlsanitizer
  2
  3import (
  4	"strings"
  5	"testing"
  6)
  7
  8func TestLibSanitizerRemovesUnsafeHTML(t *testing.T) {
  9	sanitizer := NewLibSanitizer()
 10	input := []byte(`
 11		<p onclick="alert(1)">Hello</p>
 12		<script>alert(1)</script>
 13		<style>body { background-image: url("javascript:alert(1)") }</style>
 14		<a href="javascript:alert(1)">bad link</a>
 15		<a href="https://example.com">good link</a>
 16		<img src="file:///tmp/bad.png" alt="bad image">
 17		<img src="cid:test@example.com" alt="cid image">
 18		<img src="data:text/html,<script>alert(1)</script>" alt="bad data">
 19		<img src="data:image/png;base64,iVBORw0KGgo=" alt="data image">
 20	`)
 21
 22	got := string(sanitizer.SanitizeBytes(input))
 23
 24	for _, forbidden := range []string{
 25		"onclick",
 26		"<script",
 27		"<style",
 28		"javascript:",
 29		"file:///tmp/bad.png",
 30		"data:text/html",
 31	} {
 32		if strings.Contains(got, forbidden) {
 33			t.Fatalf("sanitized HTML contains %q:\n%s", forbidden, got)
 34		}
 35	}
 36
 37	for _, want := range []string{
 38		`href="https://example.com"`,
 39		`src="cid:test@example.com"`,
 40		`src="data:image/png;base64,iVBORw0KGgo="`,
 41	} {
 42		if !strings.Contains(got, want) {
 43			t.Fatalf("sanitized HTML does not contain %q:\n%s", want, got)
 44		}
 45	}
 46}
 47
 48func TestLibSanitizerDoesNotAllowDataOrCIDLinks(t *testing.T) {
 49	sanitizer := NewLibSanitizer()
 50	input := []byte(`
 51		<a href="data:image/png;base64,iVBORw0KGgo=">data link</a>
 52		<a href="cid:test@example.com">cid link</a>
 53		<a href="ftp://example.com/file.txt">ftp link</a>
 54		<a href="file:///tmp/bad.txt">file link</a>
 55		<a href="vbscript:msgbox(1)">vbscript link</a>
 56		<a href="//example.com/protocol-relative">protocol relative link</a>
 57		<a href="/relative/path">relative link</a>
 58		<a href=":not-a-url">broken link</a>
 59	`)
 60
 61	got := string(sanitizer.SanitizeBytes(input))
 62
 63	for _, forbidden := range []string{
 64		"href=\"data:image",
 65		"href=\"cid:",
 66		"href=\"ftp:",
 67		"href=\"file:",
 68		"href=\"vbscript:",
 69		"href=\"//example.com",
 70		"href=\"/relative",
 71		"href=\":not-a-url",
 72	} {
 73		if strings.Contains(got, forbidden) {
 74			t.Fatalf("sanitized HTML contains %q:\n%s", forbidden, got)
 75		}
 76	}
 77
 78	for _, wantText := range []string{
 79		"data link",
 80		"cid link",
 81		"ftp link",
 82		"file link",
 83		"vbscript link",
 84		"protocol relative link",
 85		"relative link",
 86		"broken link",
 87	} {
 88		if !strings.Contains(got, wantText) {
 89			t.Fatalf("sanitized HTML should keep link text %q:\n%s", wantText, got)
 90		}
 91	}
 92}
 93
 94func TestLibSanitizerAllowsSafeLinks(t *testing.T) {
 95	sanitizer := NewLibSanitizer()
 96	input := []byte(`
 97		<a href="http://example.com/path?x=1">http link</a>
 98		<a href="https://example.com/path?x=1">https link</a>
 99		<a href="HTTPS://example.com/path?x=1">uppercase https link</a>
100		<a href="mailto:security@example.com">mailto link</a>
101		<a href="MAILTO:security@example.com">uppercase mailto link</a>
102		<a href="tel:+15551234567">tel link</a>
103	`)
104
105	got := string(sanitizer.SanitizeBytes(input))
106
107	for _, want := range []string{
108		`href="http://example.com/path?x=1"`,
109		`href="https://example.com/path?x=1"`,
110		`href="https://example.com/path?x=1"`,
111		`href="mailto:security@example.com"`,
112		`href="mailto:security@example.com"`,
113		`href="tel:+15551234567"`,
114	} {
115		if !strings.Contains(got, want) {
116			t.Fatalf("sanitized HTML does not contain %q:\n%s", want, got)
117		}
118	}
119}
120
121func TestLibSanitizerFiltersImageSources(t *testing.T) {
122	sanitizer := NewLibSanitizer()
123	input := []byte(`
124		<img src="http://example.com/image.png" alt="http image">
125		<img src="https://example.com/image.png" alt="https image">
126		<img src="cid:test@example.com" alt="cid image">
127		<img src="data:image/png;base64,iVBORw0KGgo=" alt="data image">
128		<img src="javascript:alert(1)" alt="javascript image">
129		<img src="file:///tmp/bad.png" alt="file image">
130		<img src="data:text/html,<script>alert(1)</script>" alt="html data image">
131		<img src="/relative.png" alt="relative image">
132	`)
133
134	got := string(sanitizer.SanitizeBytes(input))
135
136	for _, want := range []string{
137		`src="http://example.com/image.png"`,
138		`src="https://example.com/image.png"`,
139		`src="cid:test@example.com"`,
140		`src="data:image/png;base64,iVBORw0KGgo="`,
141	} {
142		if !strings.Contains(got, want) {
143			t.Fatalf("sanitized HTML does not contain %q:\n%s", want, got)
144		}
145	}
146
147	for _, forbidden := range []string{
148		"src=\"javascript:",
149		"src=\"file:",
150		"src=\"data:text/html",
151		"src=\"/relative.png",
152	} {
153		if strings.Contains(got, forbidden) {
154			t.Fatalf("sanitized HTML contains %q:\n%s", forbidden, got)
155		}
156	}
157}
158
159func TestLibSanitizerRemovesUnknownElementsButKeepsText(t *testing.T) {
160	sanitizer := NewLibSanitizer()
161	input := []byte(`
162		<form action="https://example.com"><input name="token" value="secret">form text</form>
163		<iframe src="https://example.com">iframe text</iframe>
164		<object data="https://example.com">object text</object>
165		<p>safe text</p>
166	`)
167
168	got := string(sanitizer.SanitizeBytes(input))
169
170	for _, forbidden := range []string{
171		"<form",
172		"<input",
173		"<iframe",
174		"<object",
175		"action=",
176		"value=\"secret\"",
177		"src=\"https://example.com\"",
178		"data=\"https://example.com\"",
179	} {
180		if strings.Contains(got, forbidden) {
181			t.Fatalf("sanitized HTML contains %q:\n%s", forbidden, got)
182		}
183	}
184
185	for _, wantText := range []string{
186		"form text",
187		"safe text",
188	} {
189		if !strings.Contains(got, wantText) {
190			t.Fatalf("sanitized HTML should keep text %q:\n%s", wantText, got)
191		}
192	}
193}
194
195func TestLibSanitizerRemovesUnsafeGlobalAttributes(t *testing.T) {
196	sanitizer := NewLibSanitizer()
197	input := []byte(`
198		<p style="color: red" class="promo" data-secret="token" id="message">styled text</p>
199		<blockquote cite="https://example.com" onclick="alert(1)">quote text</blockquote>
200	`)
201
202	got := string(sanitizer.SanitizeBytes(input))
203
204	for _, forbidden := range []string{
205		"style=",
206		"class=",
207		"data-secret",
208		"id=",
209		"onclick=",
210	} {
211		if strings.Contains(got, forbidden) {
212			t.Fatalf("sanitized HTML contains %q:\n%s", forbidden, got)
213		}
214	}
215
216	for _, want := range []string{
217		"styled text",
218		`cite="https://example.com"`,
219		"quote text",
220	} {
221		if !strings.Contains(got, want) {
222			t.Fatalf("sanitized HTML does not contain %q:\n%s", want, got)
223		}
224	}
225}
226
227func TestLibSanitizerRejectsCIDWithQueryOrFragment(t *testing.T) {
228	sanitizer := NewLibSanitizer()
229	input := []byte(`
230		<img src="cid:test@example.com?x=1" alt="cid query">
231		<img src="cid:test@example.com#frag" alt="cid fragment">
232		<img src="cid:test@example.com" alt="cid ok">
233	`)
234
235	got := string(sanitizer.SanitizeBytes(input))
236
237	for _, forbidden := range []string{
238		`src="cid:test@example.com?x=1"`,
239		`src="cid:test@example.com#frag"`,
240	} {
241		if strings.Contains(got, forbidden) {
242			t.Fatalf("sanitized HTML contains %q:\n%s", forbidden, got)
243		}
244	}
245
246	if !strings.Contains(got, `src="cid:test@example.com"`) {
247		t.Fatalf("sanitized HTML should keep clean cid source:\n%s", got)
248	}
249}
250
251func TestLibSanitizerRejectsInvalidDataImages(t *testing.T) {
252	sanitizer := NewLibSanitizer()
253	input := []byte(`
254		<img src="data:image/png;base64,not base64!" alt="invalid base64">
255		<img src="data:image/svg+xml;base64,PHN2Zy8+" alt="svg data">
256		<img src="data:image/png;base64,iVBORw0KGgo=" alt="png data">
257		<img src="data:image/png;base64,iVBORw0KGgo" alt="raw png data">
258	`)
259
260	got := string(sanitizer.SanitizeBytes(input))
261
262	for _, forbidden := range []string{
263		"not base64",
264		"data:image/svg+xml",
265	} {
266		if strings.Contains(got, forbidden) {
267			t.Fatalf("sanitized HTML contains %q:\n%s", forbidden, got)
268		}
269	}
270
271	if !strings.Contains(got, `src="data:image/png;base64,iVBORw0KGgo="`) {
272		t.Fatalf("sanitized HTML should keep valid png data URI:\n%s", got)
273	}
274	if !strings.Contains(got, `src="data:image/png;base64,iVBORw0KGgo"`) {
275		t.Fatalf("sanitized HTML should keep valid unpadded png data URI:\n%s", got)
276	}
277}