1use html5ever::{
  2    Attribute, ParseOpts, QualName, parse_document,
  3    tendril::{Tendril, TendrilSink, fmt::UTF8},
  4};
  5use markup5ever_rcdom::{Node, NodeData, RcDom};
  6use std::{cell::RefCell, io, rc::Rc, str};
  7
  8#[derive(Default)]
  9pub(crate) struct MinifierOptions {
 10    pub omit_doctype: bool,
 11    pub preserve_comments: bool,
 12    pub collapse_whitespace: bool,
 13}
 14
 15pub(crate) struct Minifier<'a, W: io::Write> {
 16    w: &'a mut W,
 17    options: MinifierOptions,
 18    preceding_whitespace: bool,
 19}
 20
 21impl<'a, W> Minifier<'a, W>
 22where
 23    W: io::Write,
 24{
 25    /// Creates a new `Minifier` instance.
 26    #[inline]
 27    pub const fn new(w: &'a mut W, options: MinifierOptions) -> Self {
 28        Self {
 29            w,
 30            options,
 31            preceding_whitespace: false,
 32        }
 33    }
 34
 35    /// Minifies the given reader input.
 36    ///
 37    /// # Errors
 38    ///
 39    /// Will return `Err` if unable to write to the output writer.
 40    #[inline]
 41    pub fn minify<R: io::Read>(&mut self, mut r: &mut R) -> io::Result<()> {
 42        let dom = parse_document(RcDom::default(), ParseOpts::default())
 43            .from_utf8()
 44            .read_from(&mut r)?;
 45
 46        if !self.options.omit_doctype {
 47            self.w.write_all(b"<!doctype html>")?;
 48        }
 49
 50        self.minify_node(&None, &dom.document)
 51    }
 52
 53    fn minify_node<'b>(&mut self, ctx: &'b Option<Context>, node: &'b Node) -> io::Result<()> {
 54        match &node.data {
 55            NodeData::Text { contents } => {
 56                // Check if whitespace collapsing disabled
 57                let contents = contents.borrow();
 58                let contents = contents.as_ref();
 59
 60                if !self.options.collapse_whitespace {
 61                    return self.w.write_all(contents.as_bytes());
 62                }
 63
 64                // Check if parent is whitespace preserving element or contains code (<script>, <style>)
 65                let (skip_collapse_whitespace, contains_code) =
 66                    ctx.as_ref().map_or((false, false), |ctx| {
 67                        if let NodeData::Element { name, .. } = &ctx.parent.data {
 68                            let name = name.local.as_ref();
 69
 70                            (preserve_whitespace(name), contains_code(name))
 71                        } else {
 72                            (false, false)
 73                        }
 74                    });
 75
 76                if skip_collapse_whitespace {
 77                    return self.w.write_all(contents.as_bytes());
 78                }
 79
 80                if contains_code {
 81                    return self
 82                        .w
 83                        .write_all(contents.trim_matches(is_ascii_whitespace).as_bytes());
 84                }
 85
 86                // Early exit if empty to forego expensive trim logic
 87                if contents.is_empty() {
 88                    return io::Result::Ok(());
 89                }
 90
 91                let (trim_left, trim_right) = ctx
 92                    .as_ref()
 93                    .map_or((true, true), |ctx| ctx.trim(self.preceding_whitespace));
 94                let contents = match (trim_left, trim_right) {
 95                    (true, true) => contents.trim_matches(is_ascii_whitespace),
 96                    (true, false) => contents.trim_start_matches(is_ascii_whitespace),
 97                    (false, true) => contents.trim_end_matches(is_ascii_whitespace),
 98                    _ => contents,
 99                };
100
101                // Second empty check after trimming whitespace
102                if !contents.is_empty() {
103                    // replace \n, \r to ' '
104                    let contents = contents
105                        .bytes()
106                        .map(|c| if matches!(c, b'\n' | b'\r') { b' ' } else { c })
107                        .collect::<Vec<u8>>();
108
109                    self.write_collapse_whitespace(&contents, reserved_entity, None)?;
110
111                    self.preceding_whitespace = !trim_right
112                        && contents
113                            .iter()
114                            .last()
115                            .map_or(false, u8::is_ascii_whitespace);
116                }
117
118                Ok(())
119            }
120
121            NodeData::Comment { contents } if self.options.preserve_comments => {
122                self.w.write_all(b"<!--")?;
123                self.w.write_all(contents.as_bytes())?;
124                self.w.write_all(b"-->")
125            }
126
127            NodeData::Document => self.minify_children(ctx, node),
128
129            NodeData::Element { name, attrs, .. } => {
130                let attrs = attrs.borrow();
131                let tag = name.local.as_ref();
132
133                if is_self_closing(tag) {
134                    return self.write_start_tag(name, &attrs);
135                }
136
137                let (omit_start_tag, omit_end_tag) =
138                    self.omit_tags(ctx, node, tag, attrs.is_empty());
139
140                if !omit_start_tag {
141                    self.write_start_tag(name, &attrs)?;
142                }
143
144                self.minify_children(ctx, node)?;
145
146                if !omit_end_tag {
147                    self.write_end_tag(name)?;
148                }
149
150                Ok(())
151            }
152
153            _ => Ok(()),
154        }
155    }
156
157    fn next_is_comment<'b, I>(&self, v: I) -> bool
158    where
159        I: IntoIterator<Item = &'b Rc<Node>>,
160    {
161        v.into_iter()
162            .find_map(|node| match &node.data {
163                NodeData::Text { contents } => {
164                    if self.options.collapse_whitespace && is_whitespace(contents) {
165                        // Blocks of whitespace are skipped
166                        None
167                    } else {
168                        Some(false)
169                    }
170                }
171                NodeData::Comment { .. } => Some(self.options.preserve_comments),
172                _ => Some(false),
173            })
174            .unwrap_or(false)
175    }
176
177    fn is_whitespace(&self, s: &RefCell<Tendril<UTF8>>) -> Option<bool> {
178        if self.options.collapse_whitespace && is_whitespace(s) {
179            None
180        } else {
181            Some(
182                !s.borrow()
183                    .as_bytes()
184                    .iter()
185                    .next()
186                    .map_or(false, u8::is_ascii_whitespace),
187            )
188        }
189    }
190
191    /// Determines if start and end tags can be omitted.
192    /// Whitespace rules are ignored if `collapse_whitespace` is enabled.
193    #[allow(clippy::too_many_lines)]
194    fn omit_tags(
195        &self,
196        ctx: &Option<Context>,
197        node: &Node,
198        name: &str,
199        empty_attributes: bool,
200    ) -> (bool, bool) {
201        ctx.as_ref().map_or((false, false), |ctx| match name {
202            "html" => {
203                // The end tag may be omitted if the <html> element is not immediately followed by a comment.
204                let omit_end = ctx.right.map_or(true, |right| !self.next_is_comment(right));
205                // The start tag may be omitted if the first thing inside the <html> element is not a comment.
206                let omit_start =
207                    empty_attributes && omit_end && !self.next_is_comment(&*node.children.borrow());
208
209                (omit_start, omit_end)
210            }
211            "head" => {
212                // The end tag may be omitted if the first thing following the <head> element is not a space character or a comment.
213                let omit_end = ctx.right.map_or(true, |right| {
214                    right
215                        .iter()
216                        .find_map(|node| match &node.data {
217                            NodeData::Text { contents } => self.is_whitespace(contents),
218                            NodeData::Comment { .. } => {
219                                if self.options.preserve_comments {
220                                    Some(false)
221                                } else {
222                                    None
223                                }
224                            }
225                            _ => Some(true),
226                        })
227                        .unwrap_or(true)
228                });
229                // The start tag may be omitted if the first thing inside the <head> element is an element.
230                let omit_start = empty_attributes
231                    && omit_end
232                    && node
233                        .children
234                        .borrow()
235                        .iter()
236                        .find_map(|node| match &node.data {
237                            NodeData::Text { contents } => self.is_whitespace(contents),
238                            NodeData::Element { .. } => Some(true),
239                            NodeData::Comment { .. } => {
240                                if self.options.preserve_comments {
241                                    Some(false)
242                                } else {
243                                    None
244                                }
245                            }
246                            _ => Some(false),
247                        })
248                        .unwrap_or(true);
249
250                (omit_start, omit_end)
251            }
252            "body" => {
253                // The start tag may be omitted if the first thing inside it is not a space character, comment, <script> element or <style> element.
254                let omit_start = empty_attributes
255                    && node
256                        .children
257                        .borrow()
258                        .iter()
259                        .find_map(|node| match &node.data {
260                            NodeData::Text { contents } => self.is_whitespace(contents),
261                            NodeData::Element { name, .. } => {
262                                Some(!matches!(name.local.as_ref(), "script" | "style"))
263                            }
264                            NodeData::Comment { .. } => {
265                                if self.options.preserve_comments {
266                                    Some(false)
267                                } else {
268                                    None
269                                }
270                            }
271                            _ => Some(true),
272                        })
273                        .unwrap_or(true);
274                // The end tag may be omitted if the <body> element has contents or has a start tag, and is not immediately followed by a comment.
275                let omit_end = ctx.right.map_or(true, |right| !self.next_is_comment(right));
276
277                (omit_start && omit_end, omit_end)
278            }
279            "p" => {
280                let omit_end = ctx.next_element().map_or(true, |node| {
281                    if let NodeData::Element { name, .. } = &node.data {
282                        matches!(
283                            name.local.as_ref().to_ascii_lowercase().as_str(),
284                            "address"
285                                | "article"
286                                | "aside"
287                                | "blockquote"
288                                | "div"
289                                | "dl"
290                                | "fieldset"
291                                | "footer"
292                                | "form"
293                                | "h1"
294                                | "h2"
295                                | "h3"
296                                | "h4"
297                                | "h5"
298                                | "h6"
299                                | "header"
300                                | "hr"
301                                | "menu"
302                                | "nav"
303                                | "ol"
304                                | "p"
305                                | "pre"
306                                | "section"
307                                | "table"
308                                | "ul"
309                        )
310                    } else {
311                        false
312                    }
313                });
314
315                (false, omit_end)
316            }
317            // TODO: comprehensive handling of optional end element rules
318            _ => (false, optional_end_tag(name)),
319        })
320    }
321
322    #[allow(clippy::needless_pass_by_value)]
323    fn minify_children(&mut self, ctx: &Option<Context>, node: &Node) -> io::Result<()> {
324        let children = node.children.borrow();
325        let l = children.len();
326
327        children.iter().enumerate().try_for_each(|(i, child)| {
328            if self.preceding_whitespace && is_block_element(child) {
329                self.preceding_whitespace = false;
330            }
331
332            self.minify_node(
333                &Some(Context {
334                    parent: node,
335                    parent_context: ctx.as_ref(),
336                    left: if i > 0 { Some(&children[..i]) } else { None },
337                    right: if i + 1 < l {
338                        Some(&children[i + 1..])
339                    } else {
340                        None
341                    },
342                }),
343                child,
344            )
345        })
346    }
347
348    fn write_qualified_name(&mut self, name: &QualName) -> io::Result<()> {
349        if let Some(prefix) = &name.prefix {
350            self.w
351                .write_all(prefix.as_ref().to_ascii_lowercase().as_bytes())?;
352            self.w.write_all(b":")?;
353        }
354
355        self.w
356            .write_all(name.local.as_ref().to_ascii_lowercase().as_bytes())
357    }
358
359    fn write_start_tag(&mut self, name: &QualName, attrs: &[Attribute]) -> io::Result<()> {
360        self.w.write_all(b"<")?;
361        self.write_qualified_name(name)?;
362
363        attrs
364            .iter()
365            .try_for_each(|attr| self.write_attribute(attr))?;
366
367        self.w.write_all(b">")
368    }
369
370    fn write_end_tag(&mut self, name: &QualName) -> io::Result<()> {
371        self.w.write_all(b"</")?;
372        self.write_qualified_name(name)?;
373        self.w.write_all(b">")
374    }
375
376    fn write_attribute(&mut self, attr: &Attribute) -> io::Result<()> {
377        self.w.write_all(b" ")?;
378        self.write_qualified_name(&attr.name)?;
379
380        let value = attr.value.as_ref();
381        let value = if self.options.collapse_whitespace {
382            value.trim_matches(is_ascii_whitespace)
383        } else {
384            value
385        };
386
387        if value.is_empty() {
388            return io::Result::Ok(());
389        }
390
391        self.w.write_all(b"=")?;
392
393        let b = value.as_bytes();
394        let (unquoted, double, _) =
395            b.iter()
396                .fold((true, false, false), |(unquoted, double, single), &c| {
397                    let (double, single) = (double || c == b'"', single || c == b'\'');
398                    let unquoted =
399                        unquoted && !double && !single && c != b'=' && !c.is_ascii_whitespace();
400
401                    (unquoted, double, single)
402                });
403
404        if unquoted {
405            self.w.write_all(b)
406        } else if double {
407            self.write_attribute_value(b, b"'", reserved_entity_with_apos)
408        } else {
409            self.write_attribute_value(b, b"\"", reserved_entity)
410        }
411    }
412
413    fn write_attribute_value<T: AsRef<[u8]>>(
414        &mut self,
415        v: T,
416        quote: &[u8],
417        f: EntityFn,
418    ) -> io::Result<()> {
419        self.w.write_all(quote)?;
420
421        let b = v.as_ref();
422
423        if self.options.collapse_whitespace {
424            self.write_collapse_whitespace(b, f, Some(false))
425        } else {
426            self.w.write_all(b)
427        }?;
428
429        self.w.write_all(quote)
430    }
431
432    /// Efficiently writes blocks of content, e.g. a string with no collapsed
433    /// whitespace would result in a single write.
434    fn write_collapse_whitespace(
435        &mut self,
436        b: &[u8],
437        f: EntityFn,
438        preceding_whitespace: Option<bool>,
439    ) -> io::Result<()> {
440        b.iter()
441            .enumerate()
442            .try_fold(
443                (0, preceding_whitespace.unwrap_or(self.preceding_whitespace)),
444                |(pos, preceding_whitespace), (i, &c)| {
445                    let is_whitespace = c.is_ascii_whitespace();
446
447                    Ok(if is_whitespace && preceding_whitespace {
448                        if i != pos {
449                            self.write(&b[pos..i], f)?;
450                        }
451
452                        // ASCII whitespace = 1 byte
453                        (i + 1, true)
454                    } else {
455                        (pos, is_whitespace)
456                    })
457                },
458            )
459            .and_then(|(pos, _)| {
460                if pos < b.len() {
461                    self.write(&b[pos..], f)?;
462                }
463
464                Ok(())
465            })
466    }
467
468    fn write(&mut self, b: &[u8], f: EntityFn) -> io::Result<()> {
469        b.iter()
470            .enumerate()
471            .try_fold(0, |pos, (i, &c)| {
472                Ok(if let Some(entity) = f(c) {
473                    self.w.write_all(&b[pos..i])?;
474                    self.w.write_all(entity)?;
475
476                    // Reserved characters are 1 byte
477                    i + 1
478                } else {
479                    pos
480                })
481            })
482            .and_then(|pos| {
483                if pos < b.len() {
484                    self.w.write_all(&b[pos..])?;
485                }
486
487                Ok(())
488            })
489    }
490}
491
492struct Context<'a> {
493    parent: &'a Node,
494    parent_context: Option<&'a Context<'a>>,
495    left: Option<&'a [Rc<Node>]>,
496    right: Option<&'a [Rc<Node>]>,
497}
498
499impl<'a> Context<'a> {
500    /// Determine whether to trim whitespace.
501    /// Uses naive HTML5 whitespace collapsing rules.
502    fn trim(&self, preceding_whitespace: bool) -> (bool, bool) {
503        (preceding_whitespace || self.trim_left(), self.trim_right())
504    }
505
506    fn trim_left(&self) -> bool {
507        self.left.map_or_else(
508            || is_block_element(self.parent) || self.parent_trim_left(),
509            |siblings| {
510                siblings
511                    .iter()
512                    .rev()
513                    .find_map(Self::is_block_element)
514                    .unwrap_or_else(|| self.parent_trim_left())
515            },
516        )
517    }
518
519    fn parent_trim_left(&self) -> bool {
520        self.parent_context.map_or(true, Context::trim_left)
521    }
522
523    fn trim_right(&self) -> bool {
524        self.right.map_or(true, |siblings| {
525            siblings
526                .iter()
527                .find_map(Self::is_block_element)
528                .unwrap_or(true)
529        })
530    }
531
532    fn next_element(&self) -> Option<&Rc<Node>> {
533        self.right.and_then(|siblings| {
534            siblings
535                .iter()
536                .find(|node| matches!(node.data, NodeData::Element { .. }))
537        })
538    }
539
540    fn is_block_element(node: &Rc<Node>) -> Option<bool> {
541        if let NodeData::Element { name, .. } = &node.data {
542            Some(is_block_element_name(name.local.as_ref()))
543        } else {
544            None
545        }
546    }
547}
548
/// Maps a byte to the character reference that must replace it in the
/// output, or `None` when the byte can be written as-is.
type EntityFn = fn(u8) -> Option<&'static [u8]>;

/// Character references for the bytes that are reserved in HTML text and
/// double-quoted attribute values: `<`, `>` and `&`.
const fn reserved_entity(v: u8) -> Option<&'static [u8]> {
    match v {
        b'<' => Some(b"&lt;"),
        b'>' => Some(b"&gt;"),
        b'&' => Some(b"&amp;"),
        _ => None,
    }
}

/// Like [`reserved_entity`], but additionally escapes the apostrophe so the
/// result can be placed inside a single-quoted attribute value.
const fn reserved_entity_with_apos(v: u8) -> Option<&'static [u8]> {
    if v == b'\'' {
        Some(b"&#39;")
    } else {
        reserved_entity(v)
    }
}
567
568fn is_whitespace(s: &RefCell<Tendril<UTF8>>) -> bool {
569    s.borrow().as_bytes().iter().all(u8::is_ascii_whitespace)
570}
571
/// Returns `true` when `name` is one of the element names this minifier
/// treats as block-level for whitespace trimming. The comparison is exact
/// (case-sensitive).
fn is_block_element_name(name: &str) -> bool {
    const BLOCK_ELEMENTS: &[&str] = &[
        "address",
        "article",
        "aside",
        "blockquote",
        "body",
        "br",
        "details",
        "dialog",
        "dd",
        "div",
        "dl",
        "dt",
        "fieldset",
        "figcaption",
        "figure",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "head",
        "header",
        "hgroup",
        "hr",
        "html",
        "li",
        "link",
        "main",
        "meta",
        "nav",
        "ol",
        "option",
        "p",
        "pre",
        "script",
        "section",
        "source",
        "table",
        "td",
        "th",
        "title",
        "tr",
        "ul",
    ];

    BLOCK_ELEMENTS.contains(&name)
}
623
624fn is_block_element(node: &Node) -> bool {
625    match &node.data {
626        NodeData::Element { name, .. } => is_block_element_name(name.local.as_ref()),
627        NodeData::Document => true,
628        _ => false,
629    }
630}
631
/// Returns `true` for the ASCII whitespace characters: space, tab, line
/// feed, form feed and carriage return.
#[allow(clippy::missing_const_for_fn)]
fn is_ascii_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\n' | '\x0C' | '\r')
}
636
/// Returns `true` for the elements whose text keeps its whitespace: `pre`
/// and `textarea`.
fn preserve_whitespace(name: &str) -> bool {
    name == "pre" || name == "textarea"
}
640
/// Returns `true` for the elements whose text is code rather than markup:
/// `script` and `style`.
fn contains_code(name: &str) -> bool {
    match name {
        "script" | "style" => true,
        _ => false,
    }
}
644
/// Returns `true` for void elements, which are written as a start tag only:
/// they have no end tag and their children are never serialised.
///
/// `command` and `menuitem` are deliberately absent. They are not void
/// elements, so treating them as such would drop any children they have.
fn is_self_closing(name: &str) -> bool {
    matches!(
        name,
        "area"
            | "base"
            | "br"
            | "col"
            | "embed"
            | "hr"
            | "img"
            | "input"
            | "link"
            | "meta"
            | "param"
            | "source"
            | "track"
            | "wbr"
            | "keygen"
    )
}
667
/// Returns `true` when `name` is an element whose end tag this minifier
/// leaves out unconditionally. The comparison is exact (case-sensitive).
fn optional_end_tag(name: &str) -> bool {
    const OPTIONAL_END_TAGS: &[&str] = &[
        "basefont", "colgroup", "dd", "dt", "frame", "isindex", "li", "option", "p", "tbody",
        "td", "tfoot", "th", "thead", "tr",
    ];

    OPTIONAL_END_TAGS.contains(&name)
}
688
#[cfg(test)]
mod tests {
    use super::*;
    use std::str;

    /// Minifies `input` with the doctype omitted and returns the output.
    fn minify_to_string(input: &str, collapse_whitespace: bool, preserve_comments: bool) -> String {
        let mut out = Vec::new();
        let options = MinifierOptions {
            omit_doctype: true,
            preserve_comments,
            collapse_whitespace,
        };

        Minifier::new(&mut out, options)
            .minify(&mut input.as_bytes())
            .unwrap();

        str::from_utf8(&out).unwrap().to_owned()
    }

    #[test]
    fn test_write_collapse_whitespace() {
        // (input, expected output, whitespace already written before input)
        let cases: &[(&str, &str, bool)] = &[
            ("", "", false),
            ("  ", " ", false),
            ("   ", " ", false),
            ("   ", "", true),
            (" x      y  ", " x y ", false),
            (" x      y  ", "x y ", true),
            (" x   \n  \t \n   y  ", " x y ", false),
            (" x   \n  \t \n   y  ", "x y ", true),
        ];

        for &(input, expected, preceding_whitespace) in cases {
            let mut out = Vec::new();

            {
                let mut minifier = Minifier::new(&mut out, MinifierOptions::default());
                minifier.preceding_whitespace = preceding_whitespace;
                minifier
                    .write_collapse_whitespace(
                        input.as_bytes(),
                        reserved_entity,
                        Some(preceding_whitespace),
                    )
                    .unwrap();
            }

            assert_eq!(expected, str::from_utf8(&out).unwrap());
        }
    }

    #[test]
    fn test_omit_tags() {
        // (input, expected output, collapse_whitespace, preserve_comments)
        let cases: &[(&str, &str, bool, bool)] = &[
            // <html>
            ("<html>", "", true, false),
            // Comments ignored
            ("<html><!-- -->", "", true, false),
            // Comments preserved
            ("<html>     <!-- -->    ", "<html><!-- -->", true, true),
            ("<html><!-- --></html>", "<html><!-- -->", true, true),
            (
                "<html><!-- --></html><!-- -->",
                "<html><!-- --></html><!-- -->",
                true,
                true,
            ),
            (
                "<html>    <!-- -->    </html>    <!-- -->    ",
                "<html><!-- --></html><!-- -->",
                true,
                true,
            ),
            (
                "<html>    <!-- -->    </html>    <!-- -->    ",
                // <body> is implicitly added to the DOM
                "<html><!-- --><body>        </html><!-- -->",
                false,
                true,
            ),
            // <head>
            (
                "<html>   <head>   <title>A</title>     </head>   <body><p>     B  </p> </body>",
                "<title>A</title><p>B",
                true,
                false,
            ),
            (
                "<html>   <head>   <title>A</title>     </head>   <body><p>     B  </p> </body>",
                "<head>   <title>A</title>     </head>   <p>     B   ",
                false,
                false,
            ),
            (
                "<html>   <head><!-- -->   <title>A</title>     </head>   <body><p>     B  </p> </body>",
                "<head><!-- --><title>A</title><p>B",
                true,
                true,
            ),
            // <body>
            ("<body>", "", true, false),
            (
                "<body>    <script>let x = 1;</script>   ",
                "<body><script>let x = 1;</script>",
                true,
                false,
            ),
            (
                "<body>        <style>body{margin:1em}</style>",
                "<body><style>body{margin:1em}</style>",
                true,
                false,
            ),
            ("<body>    <p>A", "<p>A", true, false),
            ("<body id=main>    <p>A", "<body id=main><p>A", true, false),
            // Retain whitespace, whitespace before <p>
            (
                "    <body>    <p>A      ",
                "<body>    <p>A      ",
                false,
                false,
            ),
            // Retain whitespace, touching <p>
            ("<body><p>A</body>", "<p>A", false, false),
            // Comments ignored
            ("<body><p>A</body><!-- -->", "<p>A", false, false),
            // Comments preserved
            (
                "<body><p>A</body><!-- -->",
                "<body><p>A</body><!-- -->",
                false,
                true,
            ),
            // Retain end tag if touching inline element
            (
                "<p>Some text</p><button></button>",
                "<p>Some text</p><button></button>",
                false,
                false,
            ),
        ];

        for &(input, expected, collapse_whitespace, preserve_comments) in cases {
            let actual = minify_to_string(input, collapse_whitespace, preserve_comments);

            assert_eq!(expected, actual);
        }
    }
}