1use html5ever::{
2 Attribute, ParseOpts, QualName, parse_document,
3 tendril::{Tendril, TendrilSink, fmt::UTF8},
4};
5use markup5ever_rcdom::{Node, NodeData, RcDom};
6use std::{cell::RefCell, io, rc::Rc, str};
7
/// Switches controlling what the minifier emits.
///
/// `Default` leaves every switch off.
#[derive(Default)]
pub(crate) struct MinifierOptions {
    /// When `false`, `<!doctype html>` is written before the document.
    pub omit_doctype: bool,
    /// When `true`, comment nodes are written to the output; otherwise they
    /// are dropped.
    pub preserve_comments: bool,
    /// When `true`, text nodes and attribute values are trimmed and runs of
    /// ASCII whitespace are collapsed.
    pub collapse_whitespace: bool,
}
14
/// Serializes a parsed HTML document to a writer in minified form.
pub(crate) struct Minifier<'a, W: io::Write> {
    /// Destination for the minified output.
    w: &'a mut W,
    /// Switches selecting which minification steps are applied.
    options: MinifierOptions,
    /// Whether the most recently written text ended in whitespace; consulted
    /// when deciding how to trim and collapse the next text node.
    preceding_whitespace: bool,
}
20
21impl<'a, W> Minifier<'a, W>
22where
23 W: io::Write,
24{
25 /// Creates a new `Minifier` instance.
26 #[inline]
27 pub fn new(w: &'a mut W, options: MinifierOptions) -> Self {
28 Self {
29 w,
30 options,
31 preceding_whitespace: false,
32 }
33 }
34
35 /// Minifies the given reader input.
36 ///
37 /// # Errors
38 ///
39 /// Will return `Err` if unable to write to the output writer.
40 #[inline]
41 pub fn minify<R: io::Read>(&mut self, mut r: &mut R) -> io::Result<()> {
42 let dom = parse_document(RcDom::default(), ParseOpts::default())
43 .from_utf8()
44 .read_from(&mut r)?;
45
46 if !self.options.omit_doctype {
47 self.w.write_all(b"<!doctype html>")?;
48 }
49
50 self.minify_node(&None, &dom.document)
51 }
52
53 fn minify_node<'b>(&mut self, ctx: &'b Option<Context>, node: &'b Node) -> io::Result<()> {
54 match &node.data {
55 NodeData::Text { contents } => {
56 // Check if whitespace collapsing disabled
57 let contents = contents.borrow();
58 let contents = contents.as_ref();
59
60 if !self.options.collapse_whitespace {
61 return self.w.write_all(contents.as_bytes());
62 }
63
64 // Check if parent is whitespace preserving element or contains code (<script>, <style>)
65 let (skip_collapse_whitespace, contains_code) =
66 ctx.as_ref().map_or((false, false), |ctx| {
67 if let NodeData::Element { name, .. } = &ctx.parent.data {
68 let name = name.local.as_ref();
69
70 (preserve_whitespace(name), contains_code(name))
71 } else {
72 (false, false)
73 }
74 });
75
76 if skip_collapse_whitespace {
77 return self.w.write_all(contents.as_bytes());
78 }
79
80 if contains_code {
81 return self
82 .w
83 .write_all(contents.trim_matches(is_ascii_whitespace).as_bytes());
84 }
85
86 // Early exit if empty to forego expensive trim logic
87 if contents.is_empty() {
88 return io::Result::Ok(());
89 }
90
91 let (trim_left, trim_right) = ctx
92 .as_ref()
93 .map_or((true, true), |ctx| ctx.trim(self.preceding_whitespace));
94 let contents = match (trim_left, trim_right) {
95 (true, true) => contents.trim_matches(is_ascii_whitespace),
96 (true, false) => contents.trim_start_matches(is_ascii_whitespace),
97 (false, true) => contents.trim_end_matches(is_ascii_whitespace),
98 _ => contents,
99 };
100
101 // Second empty check after trimming whitespace
102 if !contents.is_empty() {
103 // replace \n, \r to ' '
104 let contents = contents
105 .bytes()
106 .map(|c| if matches!(c, b'\n' | b'\r') { b' ' } else { c })
107 .collect::<Vec<u8>>();
108
109 self.write_collapse_whitespace(&contents, reserved_entity, None)?;
110
111 self.preceding_whitespace = !trim_right
112 && contents
113 .iter()
114 .last()
115 .map_or(false, u8::is_ascii_whitespace);
116 }
117
118 Ok(())
119 }
120
121 NodeData::Comment { contents } if self.options.preserve_comments => {
122 self.w.write_all(b"<!--")?;
123 self.w.write_all(contents.as_bytes())?;
124 self.w.write_all(b"-->")
125 }
126
127 NodeData::Document => self.minify_children(ctx, node),
128
129 NodeData::Element { name, attrs, .. } => {
130 let attrs = attrs.borrow();
131 let tag = name.local.as_ref();
132
133 if is_self_closing(tag) {
134 return self.write_start_tag(name, &attrs);
135 }
136
137 let (omit_start_tag, omit_end_tag) =
138 self.omit_tags(ctx, node, tag, attrs.is_empty());
139
140 if !omit_start_tag {
141 self.write_start_tag(name, &attrs)?;
142 }
143
144 self.minify_children(ctx, node)?;
145
146 if !omit_end_tag {
147 self.write_end_tag(name)?;
148 }
149
150 Ok(())
151 }
152
153 _ => Ok(()),
154 }
155 }
156
157 fn next_is_comment<'b, I>(&self, v: I) -> bool
158 where
159 I: IntoIterator<Item = &'b Rc<Node>>,
160 {
161 v.into_iter()
162 .find_map(|node| match &node.data {
163 NodeData::Text { contents } => {
164 if self.options.collapse_whitespace && is_whitespace(contents) {
165 // Blocks of whitespace are skipped
166 None
167 } else {
168 Some(false)
169 }
170 }
171 NodeData::Comment { .. } => Some(self.options.preserve_comments),
172 _ => Some(false),
173 })
174 .unwrap_or(false)
175 }
176
177 fn is_whitespace(&self, s: &RefCell<Tendril<UTF8>>) -> Option<bool> {
178 if self.options.collapse_whitespace && is_whitespace(s) {
179 None
180 } else {
181 Some(
182 !s.borrow()
183 .as_bytes()
184 .iter()
185 .next()
186 .map_or(false, u8::is_ascii_whitespace),
187 )
188 }
189 }
190
191 /// Determines if start and end tags can be omitted.
192 /// Whitespace rules are ignored if `collapse_whitespace` is enabled.
193 #[allow(clippy::too_many_lines)]
194 fn omit_tags(
195 &self,
196 ctx: &Option<Context>,
197 node: &Node,
198 name: &str,
199 empty_attributes: bool,
200 ) -> (bool, bool) {
201 ctx.as_ref().map_or((false, false), |ctx| match name {
202 "html" => {
203 // The end tag may be omitted if the <html> element is not immediately followed by a comment.
204 let omit_end = ctx.right.map_or(true, |right| !self.next_is_comment(right));
205 // The start tag may be omitted if the first thing inside the <html> element is not a comment.
206 let omit_start =
207 empty_attributes && omit_end && !self.next_is_comment(&*node.children.borrow());
208
209 (omit_start, omit_end)
210 }
211 "head" => {
212 // The end tag may be omitted if the first thing following the <head> element is not a space character or a comment.
213 let omit_end = ctx.right.map_or(true, |right| {
214 right
215 .iter()
216 .find_map(|node| match &node.data {
217 NodeData::Text { contents } => self.is_whitespace(contents),
218 NodeData::Comment { .. } => {
219 if self.options.preserve_comments {
220 Some(false)
221 } else {
222 None
223 }
224 }
225 _ => Some(true),
226 })
227 .unwrap_or(true)
228 });
229 // The start tag may be omitted if the first thing inside the <head> element is an element.
230 let omit_start = empty_attributes
231 && omit_end
232 && node
233 .children
234 .borrow()
235 .iter()
236 .find_map(|node| match &node.data {
237 NodeData::Text { contents } => self.is_whitespace(contents),
238 NodeData::Element { .. } => Some(true),
239 NodeData::Comment { .. } => {
240 if self.options.preserve_comments {
241 Some(false)
242 } else {
243 None
244 }
245 }
246 _ => Some(false),
247 })
248 .unwrap_or(true);
249
250 (omit_start, omit_end)
251 }
252 "body" => {
253 // The start tag may be omitted if the first thing inside it is not a space character, comment, <script> element or <style> element.
254 let omit_start = empty_attributes
255 && node
256 .children
257 .borrow()
258 .iter()
259 .find_map(|node| match &node.data {
260 NodeData::Text { contents } => self.is_whitespace(contents),
261 NodeData::Element { name, .. } => {
262 Some(!matches!(name.local.as_ref(), "script" | "style"))
263 }
264 NodeData::Comment { .. } => {
265 if self.options.preserve_comments {
266 Some(false)
267 } else {
268 None
269 }
270 }
271 _ => Some(true),
272 })
273 .unwrap_or(true);
274 // The end tag may be omitted if the <body> element has contents or has a start tag, and is not immediately followed by a comment.
275 let omit_end = ctx.right.map_or(true, |right| !self.next_is_comment(right));
276
277 (omit_start && omit_end, omit_end)
278 }
279 "p" => {
280 let omit_end = ctx.next_element().map_or(true, |node| {
281 if let NodeData::Element { name, .. } = &node.data {
282 matches!(
283 name.local.as_ref().to_ascii_lowercase().as_str(),
284 "address"
285 | "article"
286 | "aside"
287 | "blockquote"
288 | "div"
289 | "dl"
290 | "fieldset"
291 | "footer"
292 | "form"
293 | "h1"
294 | "h2"
295 | "h3"
296 | "h4"
297 | "h5"
298 | "h6"
299 | "header"
300 | "hr"
301 | "menu"
302 | "nav"
303 | "ol"
304 | "p"
305 | "pre"
306 | "section"
307 | "table"
308 | "ul"
309 )
310 } else {
311 false
312 }
313 });
314
315 (false, omit_end)
316 }
317 // TODO: comprehensive handling of optional end element rules
318 _ => (false, optional_end_tag(name)),
319 })
320 }
321
322 #[allow(clippy::needless_pass_by_value)]
323 fn minify_children(&mut self, ctx: &Option<Context>, node: &Node) -> io::Result<()> {
324 let children = node.children.borrow();
325 let l = children.len();
326
327 children.iter().enumerate().try_for_each(|(i, child)| {
328 if self.preceding_whitespace && is_block_element(child) {
329 self.preceding_whitespace = false;
330 }
331
332 self.minify_node(
333 &Some(Context {
334 parent: node,
335 parent_context: ctx.as_ref(),
336 left: if i > 0 { Some(&children[..i]) } else { None },
337 right: if i + 1 < l {
338 Some(&children[i + 1..])
339 } else {
340 None
341 },
342 }),
343 child,
344 )
345 })
346 }
347
348 fn write_qualified_name(&mut self, name: &QualName) -> io::Result<()> {
349 if let Some(prefix) = &name.prefix {
350 self.w
351 .write_all(prefix.as_ref().to_ascii_lowercase().as_bytes())?;
352 self.w.write_all(b":")?;
353 }
354
355 self.w
356 .write_all(name.local.as_ref().to_ascii_lowercase().as_bytes())
357 }
358
359 fn write_start_tag(&mut self, name: &QualName, attrs: &[Attribute]) -> io::Result<()> {
360 self.w.write_all(b"<")?;
361 self.write_qualified_name(name)?;
362
363 attrs
364 .iter()
365 .try_for_each(|attr| self.write_attribute(attr))?;
366
367 self.w.write_all(b">")
368 }
369
370 fn write_end_tag(&mut self, name: &QualName) -> io::Result<()> {
371 self.w.write_all(b"</")?;
372 self.write_qualified_name(name)?;
373 self.w.write_all(b">")
374 }
375
376 fn write_attribute(&mut self, attr: &Attribute) -> io::Result<()> {
377 self.w.write_all(b" ")?;
378 self.write_qualified_name(&attr.name)?;
379
380 let value = attr.value.as_ref();
381 let value = if self.options.collapse_whitespace {
382 value.trim_matches(is_ascii_whitespace)
383 } else {
384 value
385 };
386
387 if value.is_empty() {
388 return io::Result::Ok(());
389 }
390
391 self.w.write_all(b"=")?;
392
393 let b = value.as_bytes();
394 let (unquoted, double, _) =
395 b.iter()
396 .fold((true, false, false), |(unquoted, double, single), &c| {
397 let (double, single) = (double || c == b'"', single || c == b'\'');
398 let unquoted =
399 unquoted && !double && !single && c != b'=' && !c.is_ascii_whitespace();
400
401 (unquoted, double, single)
402 });
403
404 if unquoted {
405 self.w.write_all(b)
406 } else if double {
407 self.write_attribute_value(b, b"'", reserved_entity_with_apos)
408 } else {
409 self.write_attribute_value(b, b"\"", reserved_entity)
410 }
411 }
412
413 fn write_attribute_value<T: AsRef<[u8]>>(
414 &mut self,
415 v: T,
416 quote: &[u8],
417 f: EntityFn,
418 ) -> io::Result<()> {
419 self.w.write_all(quote)?;
420
421 let b = v.as_ref();
422
423 if self.options.collapse_whitespace {
424 self.write_collapse_whitespace(b, f, Some(false))
425 } else {
426 self.w.write_all(b)
427 }?;
428
429 self.w.write_all(quote)
430 }
431
432 /// Efficiently writes blocks of content, e.g. a string with no collapsed
433 /// whitespace would result in a single write.
434 fn write_collapse_whitespace(
435 &mut self,
436 b: &[u8],
437 f: EntityFn,
438 preceding_whitespace: Option<bool>,
439 ) -> io::Result<()> {
440 b.iter()
441 .enumerate()
442 .try_fold(
443 (0, preceding_whitespace.unwrap_or(self.preceding_whitespace)),
444 |(pos, preceding_whitespace), (i, &c)| {
445 let is_whitespace = c.is_ascii_whitespace();
446
447 Ok(if is_whitespace && preceding_whitespace {
448 if i != pos {
449 self.write(&b[pos..i], f)?;
450 }
451
452 // ASCII whitespace = 1 byte
453 (i + 1, true)
454 } else {
455 (pos, is_whitespace)
456 })
457 },
458 )
459 .and_then(|(pos, _)| {
460 if pos < b.len() {
461 self.write(&b[pos..], f)?;
462 }
463
464 Ok(())
465 })
466 }
467
468 fn write(&mut self, b: &[u8], f: EntityFn) -> io::Result<()> {
469 b.iter()
470 .enumerate()
471 .try_fold(0, |pos, (i, &c)| {
472 Ok(if let Some(entity) = f(c) {
473 self.w.write_all(&b[pos..i])?;
474 self.w.write_all(entity)?;
475
476 // Reserved characters are 1 byte
477 i + 1
478 } else {
479 pos
480 })
481 })
482 .and_then(|pos| {
483 if pos < b.len() {
484 self.w.write_all(&b[pos..])?;
485 }
486
487 Ok(())
488 })
489 }
490}
491
/// Position of a node within the tree while it is being minified.
struct Context<'a> {
    /// The node's parent.
    parent: &'a Node,
    /// Context of the parent itself; `None` when the parent has no context.
    parent_context: Option<&'a Context<'a>>,
    /// Siblings preceding the node, or `None` when it is the first child.
    left: Option<&'a [Rc<Node>]>,
    /// Siblings following the node, or `None` when it is the last child.
    right: Option<&'a [Rc<Node>]>,
}
498
499impl<'a> Context<'a> {
500 /// Determine whether to trim whitespace.
501 /// Uses naive HTML5 whitespace collapsing rules.
502 fn trim(&self, preceding_whitespace: bool) -> (bool, bool) {
503 (preceding_whitespace || self.trim_left(), self.trim_right())
504 }
505
506 fn trim_left(&self) -> bool {
507 self.left.map_or_else(
508 || is_block_element(self.parent) || self.parent_trim_left(),
509 |siblings| {
510 siblings
511 .iter()
512 .rev()
513 .find_map(Self::is_block_element)
514 .unwrap_or_else(|| self.parent_trim_left())
515 },
516 )
517 }
518
519 fn parent_trim_left(&self) -> bool {
520 self.parent_context.map_or(true, Context::trim_left)
521 }
522
523 fn trim_right(&self) -> bool {
524 self.right.map_or(true, |siblings| {
525 siblings
526 .iter()
527 .find_map(Self::is_block_element)
528 .unwrap_or(true)
529 })
530 }
531
532 fn next_element(&self) -> Option<&Rc<Node>> {
533 self.right.and_then(|siblings| {
534 siblings
535 .iter()
536 .find(|node| matches!(node.data, NodeData::Element { .. }))
537 })
538 }
539
540 fn is_block_element(node: &Rc<Node>) -> Option<bool> {
541 if let NodeData::Element { name, .. } = &node.data {
542 Some(is_block_element_name(name.local.as_ref()))
543 } else {
544 None
545 }
546 }
547}
548
/// Maps a byte to the byte sequence that must be written in its place, or
/// `None` when the byte can be written unchanged.
type EntityFn = fn(u8) -> Option<&'static [u8]>;
550
/// Returns the character reference that must replace `v` in text and in
/// double-quoted attribute values, or `None` when `v` needs no escaping.
///
/// `<`, `>` and `&` are reserved: written literally they would be read back
/// as markup or as the start of a character reference.
const fn reserved_entity(v: u8) -> Option<&'static [u8]> {
    match v {
        b'<' => Some(b"&lt;"),
        b'>' => Some(b"&gt;"),
        b'&' => Some(b"&amp;"),
        _ => None,
    }
}
559
560const fn reserved_entity_with_apos(v: u8) -> Option<&'static [u8]> {
561 if v == b'\'' {
562 Some(b"'")
563 } else {
564 reserved_entity(v)
565 }
566}
567
568fn is_whitespace(s: &RefCell<Tendril<UTF8>>) -> bool {
569 s.borrow().as_bytes().iter().all(u8::is_ascii_whitespace)
570}
571
/// Returns `true` when `name` is one of the element names the minifier
/// treats as block-level for whitespace trimming. The comparison is exact
/// (case-sensitive).
fn is_block_element_name(name: &str) -> bool {
    const BLOCK_ELEMENTS: [&str; 46] = [
        "address",
        "article",
        "aside",
        "blockquote",
        "body",
        "br",
        "details",
        "dialog",
        "dd",
        "div",
        "dl",
        "dt",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "head",
        "header",
        "hgroup",
        "hr",
        "html",
        "li",
        "link",
        "main",
        "meta",
        "nav",
        "ol",
        "option",
        "p",
        "pre",
        "script",
        "section",
        "source",
        "table",
        "td",
        "th",
        "title",
        "tr",
        "ul",
    ];

    BLOCK_ELEMENTS.iter().any(|&block| block == name)
}
623
624fn is_block_element(node: &Node) -> bool {
625 match &node.data {
626 NodeData::Element { name, .. } => is_block_element_name(name.local.as_ref()),
627 NodeData::Document => true,
628 _ => false,
629 }
630}
631
/// Returns `true` when `c` is ASCII whitespace: space, tab, line feed, form
/// feed or carriage return.
///
/// A free function so it can be passed directly to the `str::trim_*`
/// family of methods.
#[allow(clippy::missing_const_for_fn)]
fn is_ascii_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\n' | '\x0C' | '\r')
}
636
/// Returns `true` for elements whose text must keep its whitespace as is
/// (`pre` and `textarea`).
fn preserve_whitespace(name: &str) -> bool {
    name == "pre" || name == "textarea"
}
640
/// Returns `true` for elements whose text is code rather than prose
/// (`script` and `style`).
fn contains_code(name: &str) -> bool {
    name == "script" || name == "style"
}
644
/// Returns `true` for element names that are written as a start tag only,
/// without children or an end tag. The comparison is exact (case-sensitive).
fn is_self_closing(name: &str) -> bool {
    const SELF_CLOSING: [&str; 17] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr", "command", "keygen", "menuitem",
    ];

    SELF_CLOSING.iter().any(|&tag| tag == name)
}
667
/// Returns `true` for element names whose end tag the minifier leaves out
/// unconditionally. The comparison is exact (case-sensitive).
fn optional_end_tag(name: &str) -> bool {
    const OPTIONAL_END_TAGS: [&str; 15] = [
        "basefont", "colgroup", "dd", "dt", "frame", "isindex", "li", "option", "p", "tbody",
        "td", "tfoot", "th", "thead", "tr",
    ];

    OPTIONAL_END_TAGS.iter().any(|&tag| tag == name)
}
688
#[cfg(test)]
mod tests {
    use super::*;
    use std::str;

    /// Exercises `write_collapse_whitespace` directly. Each case is
    /// `(input, expected output, preceding_whitespace)`.
    #[test]
    fn test_write_collapse_whitespace() {
        for &(input, expected, preceding_whitespace) in &[
            ("", "", false),
            (" ", " ", false),
            (" ", " ", false),
            (" ", "", true),
            (" x y ", " x y ", false),
            (" x y ", "x y ", true),
            (" x \n \t \n y ", " x y ", false),
            (" x \n \t \n y ", "x y ", true),
        ] {
            let mut w = Vec::new();
            let mut minifier = Minifier::new(&mut w, MinifierOptions::default());
            minifier.preceding_whitespace = preceding_whitespace;
            minifier
                .write_collapse_whitespace(
                    input.as_bytes(),
                    reserved_entity,
                    Some(preceding_whitespace),
                )
                .unwrap();

            let s = str::from_utf8(&w).unwrap();

            assert_eq!(expected, s);
        }
    }

    /// Runs whole documents through `minify` with the doctype omitted and
    /// checks which optional tags survive. Each case is
    /// `(input, expected output, collapse_whitespace, preserve_comments)`.
    #[test]
    fn test_omit_tags() {
        for &(input, expected, collapse_whitespace, preserve_comments) in &[
            // <html>
            ("<html>", "", true, false),
            // Comments ignored
            ("<html><!-- -->", "", true, false),
            // Comments preserved
            ("<html> <!-- --> ", "<html><!-- -->", true, true),
            ("<html><!-- --></html>", "<html><!-- -->", true, true),
            (
                "<html><!-- --></html><!-- -->",
                "<html><!-- --></html><!-- -->",
                true,
                true,
            ),
            (
                "<html> <!-- --> </html> <!-- --> ",
                "<html><!-- --></html><!-- -->",
                true,
                true,
            ),
            (
                "<html> <!-- --> </html> <!-- --> ",
                // <body> is implicitly added to the DOM
                "<html><!-- --><body> </html><!-- -->",
                false,
                true,
            ),
            // <head>
            (
                "<html> <head> <title>A</title> </head> <body><p> B </p> </body>",
                "<title>A</title><p>B",
                true,
                false,
            ),
            (
                "<html> <head> <title>A</title> </head> <body><p> B </p> </body>",
                "<head> <title>A</title> </head> <p> B ",
                false,
                false,
            ),
            (
                "<html> <head><!-- --> <title>A</title> </head> <body><p> B </p> </body>",
                "<head><!-- --><title>A</title><p>B",
                true,
                true,
            ),
            // <body>
            ("<body>", "", true, false),
            (
                "<body> <script>let x = 1;</script> ",
                "<body><script>let x = 1;</script>",
                true,
                false,
            ),
            (
                "<body> <style>body{margin:1em}</style>",
                "<body><style>body{margin:1em}</style>",
                true,
                false,
            ),
            ("<body> <p>A", "<p>A", true, false),
            ("<body id=main> <p>A", "<body id=main><p>A", true, false),
            // Retain whitespace, whitespace before <p>
            (
                " <body> <p>A ",
                "<body> <p>A ",
                false,
                false,
            ),
            // Retain whitespace, touching <p>
            ("<body><p>A</body>", "<p>A", false, false),
            // Comments ignored
            ("<body><p>A</body><!-- -->", "<p>A", false, false),
            // Comments preserved
            (
                "<body><p>A</body><!-- -->",
                "<body><p>A</body><!-- -->",
                false,
                true,
            ),
            // Retain end tag if touching inline element
            (
                "<p>Some text</p><button></button>",
                "<p>Some text</p><button></button>",
                false,
                false,
            ),
        ] {
            let mut w = Vec::new();
            let mut minifier = Minifier::new(
                &mut w,
                MinifierOptions {
                    omit_doctype: true,
                    preserve_comments,
                    collapse_whitespace,
                },
            );
            minifier.minify(&mut input.as_bytes()).unwrap();

            let s = str::from_utf8(&w).unwrap();

            assert_eq!(expected, s);
        }
    }
}