1//go:build !cgo
2
3package clib
4
5import (
6 "bytes"
7 "regexp"
8 "strings"
9
10 "github.com/PuerkitoBio/goquery"
11)
12
13// HTMLToElements parses HTML and returns structured elements (pure Go fallback).
14func HTMLToElements(html string) ([]HTMLElement, bool) {
15 if len(html) == 0 {
16 return nil, true
17 }
18
19 doc, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(html)))
20 if err != nil {
21 return nil, false
22 }
23
24 doc.Find("style, script").Remove()
25
26 var elements []HTMLElement
27
28 // Process h1 elements
29 doc.Find("h1").Each(func(i int, s *goquery.Selection) {
30 elements = append(elements, HTMLElement{Type: HElemH1, Text: s.Text()})
31 s.ReplaceWithHtml("\n\n")
32 })
33
34 // Process h2 elements
35 doc.Find("h2").Each(func(i int, s *goquery.Selection) {
36 elements = append(elements, HTMLElement{Type: HElemH2, Text: s.Text()})
37 s.ReplaceWithHtml("\n\n")
38 })
39
40 // Add newlines after block elements
41 doc.Find("p, div").Each(func(i int, s *goquery.Selection) {
42 s.After("\n\n")
43 })
44
45 // Replace <br> with newlines
46 doc.Find("br").Each(func(i int, s *goquery.Selection) {
47 s.ReplaceWithHtml("\n")
48 })
49
50 // Process blockquotes
51 onWroteRegex := regexp.MustCompile(`On\s+(.+?),\s+(.+?)\s+wrote:`)
52 doc.Find("blockquote").Each(func(i int, s *goquery.Selection) {
53 cite, _ := s.Attr("cite")
54 quoteText := strings.TrimSpace(s.Text())
55
56 var prevText string
57 if prev := s.Prev(); prev.Length() > 0 {
58 prevText = strings.TrimSpace(prev.Text())
59 if onWroteRegex.MatchString(prevText) {
60 s.Prev().Remove()
61 }
62 }
63
64 elem := HTMLElement{
65 Type: HElemBlockquote,
66 Text: quoteText,
67 }
68 if cite != "" {
69 elem.Attr1 = cite
70 }
71 if prevText != "" {
72 elem.Attr2 = prevText
73 }
74 elements = append(elements, elem)
75 s.ReplaceWithHtml("\n")
76 })
77
78 // Process links
79 doc.Find("a").Each(func(i int, s *goquery.Selection) {
80 href, exists := s.Attr("href")
81 if !exists {
82 return
83 }
84 elements = append(elements, HTMLElement{
85 Type: HElemLink,
86 Text: s.Text(),
87 Attr1: href,
88 })
89 s.ReplaceWithHtml("")
90 })
91
92 // Process images
93 doc.Find("img").Each(func(i int, s *goquery.Selection) {
94 src, exists := s.Attr("src")
95 if !exists {
96 return
97 }
98 alt, _ := s.Attr("alt")
99 if alt == "" {
100 alt = "Does not contain alt text"
101 }
102 elements = append(elements, HTMLElement{
103 Type: HElemImage,
104 Attr1: src,
105 Attr2: alt,
106 })
107 s.ReplaceWithHtml("")
108 })
109
110 // Get remaining text
111 text := doc.Text()
112 if strings.TrimSpace(text) != "" {
113 elements = append(elements, HTMLElement{Type: HElemText, Text: text})
114 }
115
116 return elements, true
117}