htmlconv_nocgo.go

  1//go:build !cgo
  2
  3package clib
  4
  5import (
  6	"bytes"
  7	"regexp"
  8	"strings"
  9
 10	"github.com/PuerkitoBio/goquery"
 11)
 12
 13// HTMLToElements parses HTML and returns structured elements (pure Go fallback).
 14func HTMLToElements(html string) ([]HTMLElement, bool) {
 15	if len(html) == 0 {
 16		return nil, true
 17	}
 18
 19	doc, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(html)))
 20	if err != nil {
 21		return nil, false
 22	}
 23
 24	doc.Find("style, script").Remove()
 25
 26	var elements []HTMLElement
 27
 28	// Process h1 elements
 29	doc.Find("h1").Each(func(i int, s *goquery.Selection) {
 30		elements = append(elements, HTMLElement{Type: HElemH1, Text: s.Text()})
 31		s.ReplaceWithHtml("\n\n")
 32	})
 33
 34	// Process h2 elements
 35	doc.Find("h2").Each(func(i int, s *goquery.Selection) {
 36		elements = append(elements, HTMLElement{Type: HElemH2, Text: s.Text()})
 37		s.ReplaceWithHtml("\n\n")
 38	})
 39
 40	// Add newlines after block elements
 41	doc.Find("p, div").Each(func(i int, s *goquery.Selection) {
 42		s.After("\n\n")
 43	})
 44
 45	// Replace <br> with newlines
 46	doc.Find("br").Each(func(i int, s *goquery.Selection) {
 47		s.ReplaceWithHtml("\n")
 48	})
 49
 50	// Process blockquotes
 51	onWroteRegex := regexp.MustCompile(`On\s+(.+?),\s+(.+?)\s+wrote:`)
 52	doc.Find("blockquote").Each(func(i int, s *goquery.Selection) {
 53		cite, _ := s.Attr("cite")
 54		quoteText := strings.TrimSpace(s.Text())
 55
 56		var prevText string
 57		if prev := s.Prev(); prev.Length() > 0 {
 58			prevText = strings.TrimSpace(prev.Text())
 59			if onWroteRegex.MatchString(prevText) {
 60				s.Prev().Remove()
 61			}
 62		}
 63
 64		elem := HTMLElement{
 65			Type: HElemBlockquote,
 66			Text: quoteText,
 67		}
 68		if cite != "" {
 69			elem.Attr1 = cite
 70		}
 71		if prevText != "" {
 72			elem.Attr2 = prevText
 73		}
 74		elements = append(elements, elem)
 75		s.ReplaceWithHtml("\n")
 76	})
 77
 78	// Process links
 79	doc.Find("a").Each(func(i int, s *goquery.Selection) {
 80		href, exists := s.Attr("href")
 81		if !exists {
 82			return
 83		}
 84		elements = append(elements, HTMLElement{
 85			Type:  HElemLink,
 86			Text:  s.Text(),
 87			Attr1: href,
 88		})
 89		s.ReplaceWithHtml("")
 90	})
 91
 92	// Process images
 93	doc.Find("img").Each(func(i int, s *goquery.Selection) {
 94		src, exists := s.Attr("src")
 95		if !exists {
 96			return
 97		}
 98		alt, _ := s.Attr("alt")
 99		if alt == "" {
100			alt = "Does not contain alt text"
101		}
102		elements = append(elements, HTMLElement{
103			Type:  HElemImage,
104			Attr1: src,
105			Attr2: alt,
106		})
107		s.ReplaceWithHtml("")
108	})
109
110	// Get remaining text
111	text := doc.Text()
112	if strings.TrimSpace(text) != "" {
113		elements = append(elements, HTMLElement{Type: HElemText, Text: text})
114	}
115
116	return elements, true
117}