1#include "htmlconv.h"
2#include <stdlib.h>
3#include <string.h>
4#include <ctype.h>
5#include <stdio.h>
6
7// --- Dynamic buffer ---
8
// Growable byte buffer used to accumulate text while parsing.
typedef struct {
    char* data;   // Heap storage grown with realloc; NULL while the buffer is empty/unallocated
    size_t len;   // Number of bytes currently stored in data
    size_t cap;   // Number of bytes allocated for data
} Buffer;
14
15static void buf_init(Buffer* b) {
16 b->data = NULL;
17 b->len = 0;
18 b->cap = 0;
19}
20
21static void buf_ensure(Buffer* b, size_t extra) {
22 size_t needed = b->len + extra;
23 if (needed <= b->cap) return;
24 size_t newcap = b->cap ? b->cap * 2 : 256;
25 while (newcap < needed) newcap *= 2;
26 b->data = (char*)realloc(b->data, newcap);
27 b->cap = newcap;
28}
29
30static void buf_append(Buffer* b, const char* s, size_t n) {
31 if (n == 0) return;
32 buf_ensure(b, n);
33 memcpy(b->data + b->len, s, n);
34 b->len += n;
35}
36
37static void buf_append_char(Buffer* b, char c) {
38 buf_ensure(b, 1);
39 b->data[b->len++] = c;
40}
41
42static char* buf_finish(Buffer* b) {
43 buf_append_char(b, '\0');
44 return b->data;
45}
46
47static void buf_free(Buffer* b) {
48 free(b->data);
49 b->data = NULL;
50 b->len = 0;
51 b->cap = 0;
52}
53
54// Append a text character with HTML whitespace collapsing.
55// In non-pre mode, collapses runs of whitespace to a single space.
56static void buf_append_html_char(Buffer* b, char c, int in_pre) {
57 if (in_pre) {
58 buf_append_char(b, c);
59 return;
60 }
61 if (c == '\n' || c == '\r' || c == '\t') c = ' ';
62 if (c == ' ') {
63 // Skip if buffer already ends with space or is empty
64 if (b->len == 0 || b->data[b->len - 1] == ' ' || b->data[b->len - 1] == '\n') return;
65 }
66 buf_append_char(b, c);
67}
68
69// --- Result helpers ---
70
71static void result_init(HTMLConvertResult* r) {
72 r->elements = NULL;
73 r->count = 0;
74 r->cap = 0;
75 r->ok = 0;
76}
77
78static HTMLElement* result_add(HTMLConvertResult* r) {
79 if (r->count >= r->cap) {
80 int newcap = r->cap ? r->cap * 2 : 32;
81 r->elements = (HTMLElement*)realloc(r->elements, sizeof(HTMLElement) * newcap);
82 r->cap = newcap;
83 }
84 HTMLElement* e = &r->elements[r->count++];
85 e->type = HELEM_TEXT;
86 e->text = NULL;
87 e->attr1 = NULL;
88 e->attr2 = NULL;
89 return e;
90}
91
92// Flush accumulated text buffer as a TEXT element.
93static void flush_text(HTMLConvertResult* r, Buffer* buf) {
94 if (buf->len == 0) return;
95 HTMLElement* e = result_add(r);
96 e->type = HELEM_TEXT;
97 e->text = buf_finish(buf);
98 buf_init(buf);
99}
100
101// --- HTML entity decoding ---
102
103static size_t decode_entity(const char* s, size_t len, Buffer* out) {
104 // s points to '&', returns number of chars consumed
105 if (len < 2) { buf_append_char(out, '&'); return 1; }
106
107 // Find the ';'
108 size_t end = 1;
109 while (end < len && end < 12 && s[end] != ';') end++;
110 if (end >= len || s[end] != ';') { buf_append_char(out, '&'); return 1; }
111
112 size_t ent_len = end - 1; // length of entity name (between & and ;)
113 const char* name = s + 1;
114
115 // Numeric entities
116 if (ent_len >= 2 && name[0] == '#') {
117 unsigned long cp = 0;
118 if (name[1] == 'x' || name[1] == 'X') {
119 for (size_t i = 2; i < ent_len; i++) {
120 char c = name[i];
121 if (c >= '0' && c <= '9') cp = cp * 16 + (c - '0');
122 else if (c >= 'a' && c <= 'f') cp = cp * 16 + 10 + (c - 'a');
123 else if (c >= 'A' && c <= 'F') cp = cp * 16 + 10 + (c - 'A');
124 else break;
125 }
126 } else {
127 for (size_t i = 1; i < ent_len; i++) {
128 if (name[i] >= '0' && name[i] <= '9') cp = cp * 10 + (name[i] - '0');
129 else break;
130 }
131 }
132 // Encode as UTF-8
133 if (cp < 0x80) {
134 buf_append_char(out, (char)cp);
135 } else if (cp < 0x800) {
136 buf_append_char(out, (char)(0xC0 | (cp >> 6)));
137 buf_append_char(out, (char)(0x80 | (cp & 0x3F)));
138 } else if (cp < 0x10000) {
139 buf_append_char(out, (char)(0xE0 | (cp >> 12)));
140 buf_append_char(out, (char)(0x80 | ((cp >> 6) & 0x3F)));
141 buf_append_char(out, (char)(0x80 | (cp & 0x3F)));
142 } else if (cp < 0x110000) {
143 buf_append_char(out, (char)(0xF0 | (cp >> 18)));
144 buf_append_char(out, (char)(0x80 | ((cp >> 12) & 0x3F)));
145 buf_append_char(out, (char)(0x80 | ((cp >> 6) & 0x3F)));
146 buf_append_char(out, (char)(0x80 | (cp & 0x3F)));
147 }
148 return end + 1;
149 }
150
151 // Named entities (common ones)
152 struct { const char* name; const char* value; } entities[] = {
153 {"lt", "<"}, {"gt", ">"}, {"amp", "&"}, {"quot", "\""},
154 {"apos", "'"}, {"nbsp", " "}, {"ndash", "\xe2\x80\x93"},
155 {"mdash", "\xe2\x80\x94"}, {"laquo", "\xc2\xab"},
156 {"raquo", "\xc2\xbb"}, {"copy", "\xc2\xa9"},
157 {"reg", "\xc2\xae"}, {"trade", "\xe2\x84\xa2"},
158 {"hellip", "\xe2\x80\xa6"}, {"bull", "\xe2\x80\xa2"},
159 {"rsquo", "\xe2\x80\x99"}, {"lsquo", "\xe2\x80\x98"},
160 {"rdquo", "\xe2\x80\x9d"}, {"ldquo", "\xe2\x80\x9c"},
161 {NULL, NULL}
162 };
163
164 for (int i = 0; entities[i].name; i++) {
165 if (ent_len == strlen(entities[i].name) &&
166 strncmp(name, entities[i].name, ent_len) == 0) {
167 buf_append(out, entities[i].value, strlen(entities[i].value));
168 return end + 1;
169 }
170 }
171
172 // Unknown entity - pass through
173 buf_append(out, s, end + 1);
174 return end + 1;
175}
176
177// --- Tag parsing ---
178
// One parsed tag: its name, open/close/self-closing flags, and the few
// attribute values the converter uses. All strings are NUL-terminated;
// values that do not fit their array are truncated by the parser.
typedef struct {
    char name[64];         // Tag name as written in the input (case preserved)
    int name_len;          // Number of characters stored in name
    int is_closing;        // Non-zero for </tag>
    int is_self_closing;   // Non-zero when the tag ends with '/'
    // Attributes (we parse href, src, alt, cite)
    char href[2048];
    char src[2048];
    char alt[512];
    char cite[2048];
} Tag;
190
// Case-insensitive tag-name comparison.
// Returns 1 when the first alen characters of a equal the whole of the
// NUL-terminated string b (ignoring ASCII case), otherwise 0.
static int tag_eq(const char* a, int alen, const char* b) {
    if ((int)strlen(b) != alen) return 0;

    const char* stop = a + alen;
    while (a < stop) {
        int ca = tolower((unsigned char)*a++);
        int cb = tolower((unsigned char)*b++);
        if (ca != cb) return 0;
    }
    return 1;
}
200
// Parse one attribute value starting at p (just after the '=').
//
// Quoted values ("..." or '...') run to the matching quote, which is
// consumed. Unquoted values run to the first whitespace, '>' or "/>". A '/'
// on its own is part of the value, so URLs such as href=http://host/path are
// kept whole.
//
// At most out_size - 1 bytes are stored in out; longer values are truncated
// but still consumed in full. out is always NUL-terminated when out_size > 0,
// including when there is no input left.
//
// Returns a pointer just past the parsed value.
static const char* parse_attr_value(const char* p, const char* end, char* out, size_t out_size) {
    size_t n = 0;
    char quote = 0;

    if (p < end && (*p == '"' || *p == '\'')) {
        quote = *p++;
    }

    while (p < end) {
        char c = *p;
        if (quote) {
            if (c == quote) { p++; break; }
        } else {
            if (isspace((unsigned char)c) || c == '>') break;
            // Only the "/>" self-closing marker ends an unquoted value.
            if (c == '/' && p + 1 < end && p[1] == '>') break;
        }
        if (out_size > 0 && n + 1 < out_size) out[n++] = c;
        p++;
    }

    if (out_size > 0) out[n] = '\0';
    return p;
}
222
223// Parse a tag starting after '<'. Returns pointer past '>'.
224static const char* parse_tag(const char* p, const char* end, Tag* tag) {
225 memset(tag, 0, sizeof(*tag));
226
227 // Skip whitespace after '<'
228 while (p < end && isspace((unsigned char)*p)) p++;
229
230 // Check closing tag
231 if (p < end && *p == '/') {
232 tag->is_closing = 1;
233 p++;
234 }
235
236 // Parse tag name
237 while (p < end && !isspace((unsigned char)*p) && *p != '>' && *p != '/' &&
238 tag->name_len < 63) {
239 tag->name[tag->name_len++] = *p++;
240 }
241 tag->name[tag->name_len] = '\0';
242
243 // Parse attributes
244 while (p < end && *p != '>') {
245 // Skip whitespace
246 while (p < end && isspace((unsigned char)*p)) p++;
247 if (p >= end || *p == '>' || *p == '/') break;
248
249 // Parse attribute name
250 char attr_name[64] = {0};
251 int an = 0;
252 while (p < end && *p != '=' && *p != '>' && !isspace((unsigned char)*p) && an < 63) {
253 attr_name[an++] = tolower((unsigned char)*p++);
254 }
255 attr_name[an] = '\0';
256
257 // Skip '='
258 while (p < end && isspace((unsigned char)*p)) p++;
259 if (p < end && *p == '=') {
260 p++;
261 while (p < end && isspace((unsigned char)*p)) p++;
262
263 // Parse value into correct field
264 if (strcmp(attr_name, "href") == 0) {
265 p = parse_attr_value(p, end, tag->href, sizeof(tag->href));
266 } else if (strcmp(attr_name, "src") == 0) {
267 p = parse_attr_value(p, end, tag->src, sizeof(tag->src));
268 } else if (strcmp(attr_name, "alt") == 0) {
269 p = parse_attr_value(p, end, tag->alt, sizeof(tag->alt));
270 } else if (strcmp(attr_name, "cite") == 0) {
271 p = parse_attr_value(p, end, tag->cite, sizeof(tag->cite));
272 } else {
273 // Skip unknown attribute value
274 char discard[4096];
275 p = parse_attr_value(p, end, discard, sizeof(discard));
276 }
277 }
278 }
279
280 // Check self-closing and skip past '>'
281 if (p < end && *p == '/') {
282 tag->is_self_closing = 1;
283 p++;
284 }
285 if (p < end && *p == '>') p++;
286
287 return p;
288}
289
290// --- Main parser ---
291
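// Maximum tag nesting depth. NOTE(review): no tag stack is implemented by the parser below and this constant is not referenced there - confirm whether it is still needed.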
293#define MAX_STACK 128
294
// Mutable state carried through one html_to_elements() run. Text content is
// routed to one of the accumulators below depending on which flags are set.
typedef struct {
    int in_style;         // Inside <style>
    int in_script;        // Inside <script>
    int in_pre;           // Inside <pre> (whitespace is preserved)
    int in_a;             // Inside an <a> that has an href
    char a_href[2048];    // Current link href
    Buffer a_text;        // Current link text accumulator
    int in_h1;            // Inside <h1>
    int in_h2;            // Inside <h2>
    Buffer h_text;        // Current header text accumulator (shared by h1 and h2)
    int bq_depth;         // Blockquote nesting depth
    Buffer bq_text;       // Current blockquote text accumulator
    char bq_cite[2048];   // cite attribute copied from a <blockquote> opening tag
    Buffer bq_prev;       // Text before blockquote (for "On...wrote:" detection)
    int last_was_block;   // Last element was a block (for spacing); NOTE(review): only ever written in this file
    // Table state
    int table_depth;      // Table nesting depth (0 = not in any table)
    int capture_depth;    // Depth at which we're capturing data table (-1 = not capturing)
    int in_thead;         // Inside <thead> (at capture depth); also set when a <th> starts capture
    int in_tr;            // Inside <tr> (at capture depth, capturing mode)
    int in_td;            // Inside <td>/<th> (at capture depth, capturing mode)
    int cell_index;       // Cell index within current row
    int row_index;        // Row index within current table
    int header_rows;      // Number of header rows (rows closed while in_thead is set)
    Buffer cell_text;     // Current cell text accumulator
    Buffer table_data;    // Accumulated table data (cells tab-separated, rows newline-separated)
} ParseState;
322
323HTMLConvertResult html_to_elements(const char* html, size_t len) {
324 HTMLConvertResult result;
325 result_init(&result);
326
327 if (!html || len == 0) {
328 result.ok = 1;
329 return result;
330 }
331
332 ParseState state;
333 memset(&state, 0, sizeof(state));
334 state.capture_depth = -1; // Not capturing
335 buf_init(&state.a_text);
336 buf_init(&state.h_text);
337 buf_init(&state.bq_text);
338 buf_init(&state.bq_prev);
339 buf_init(&state.cell_text);
340 buf_init(&state.table_data);
341
342 Buffer text_buf;
343 buf_init(&text_buf);
344
345 const char* p = html;
346 const char* end = html + len;
347
348 while (p < end) {
349 if (*p == '<') {
350 // Check for comment
351 if (p + 3 < end && p[1] == '!' && p[2] == '-' && p[3] == '-') {
352 const char* ce = strstr(p + 4, "-->");
353 if (ce) { p = ce + 3; continue; }
354 p++;
355 continue;
356 }
357
358 // Check for DOCTYPE/CDATA
359 if (p + 1 < end && p[1] == '!') {
360 const char* gt = memchr(p, '>', end - p);
361 if (gt) { p = gt + 1; continue; }
362 p++;
363 continue;
364 }
365
366 Tag tag;
367 const char* after = parse_tag(p + 1, end, &tag);
368
369 // Handle specific tags
370 if (tag_eq(tag.name, tag.name_len, "style")) {
371 if (tag.is_closing) state.in_style = 0;
372 else state.in_style = 1;
373 p = after;
374 continue;
375 }
376 if (tag_eq(tag.name, tag.name_len, "script")) {
377 if (tag.is_closing) state.in_script = 0;
378 else state.in_script = 1;
379 p = after;
380 continue;
381 }
382
383 if (state.in_style || state.in_script) {
384 p = after;
385 continue;
386 }
387
388 // <br> -> newline
389 if (tag_eq(tag.name, tag.name_len, "br")) {
390 if (state.in_td) {
391 buf_append_char(&state.cell_text, ' ');
392 } else if (state.in_a) {
393 buf_append_char(&state.a_text, '\n');
394 } else if (state.in_h1 || state.in_h2) {
395 buf_append_char(&state.h_text, ' ');
396 } else if (state.bq_depth > 0) {
397 buf_append_char(&state.bq_text, '\n');
398 } else {
399 buf_append_char(&text_buf, '\n');
400 }
401 p = after;
402 continue;
403 }
404
405 // <pre>
406 if (tag_eq(tag.name, tag.name_len, "pre")) {
407 state.in_pre = !tag.is_closing;
408 p = after;
409 continue;
410 }
411
412 // <h1>
413 if (tag_eq(tag.name, tag.name_len, "h1")) {
414 if (tag.is_closing && state.in_h1) {
415 state.in_h1 = 0;
416 flush_text(&result, &text_buf);
417 HTMLElement* e = result_add(&result);
418 e->type = HELEM_H1;
419 e->text = buf_finish(&state.h_text);
420 buf_init(&state.h_text);
421 // Add block spacing
422 HTMLElement* sp = result_add(&result);
423 sp->type = HELEM_TEXT;
424 sp->text = strdup("\n\n");
425 } else if (!tag.is_closing) {
426 flush_text(&result, &text_buf);
427 state.in_h1 = 1;
428 buf_init(&state.h_text);
429 }
430 p = after;
431 continue;
432 }
433
434 // <h2>
435 if (tag_eq(tag.name, tag.name_len, "h2")) {
436 if (tag.is_closing && state.in_h2) {
437 state.in_h2 = 0;
438 flush_text(&result, &text_buf);
439 HTMLElement* e = result_add(&result);
440 e->type = HELEM_H2;
441 e->text = buf_finish(&state.h_text);
442 buf_init(&state.h_text);
443 HTMLElement* sp = result_add(&result);
444 sp->type = HELEM_TEXT;
445 sp->text = strdup("\n\n");
446 } else if (!tag.is_closing) {
447 flush_text(&result, &text_buf);
448 state.in_h2 = 1;
449 buf_init(&state.h_text);
450 }
451 p = after;
452 continue;
453 }
454
455 // <a>
456 if (tag_eq(tag.name, tag.name_len, "a")) {
457 if (tag.is_closing && state.in_a) {
458 state.in_a = 0;
459 // If inside blockquote, emit link text inline
460 if (state.bq_depth > 0) {
461 if (state.a_text.len > 0) {
462 buf_append(&state.bq_text, state.a_text.data, state.a_text.len);
463 }
464 buf_free(&state.a_text);
465 } else {
466 flush_text(&result, &text_buf);
467 HTMLElement* e = result_add(&result);
468 e->type = HELEM_LINK;
469 e->text = buf_finish(&state.a_text);
470 e->attr1 = strdup(state.a_href);
471 buf_init(&state.a_text);
472 }
473 } else if (!tag.is_closing && tag.href[0]) {
474 if (state.bq_depth == 0) flush_text(&result, &text_buf);
475 state.in_a = 1;
476 strncpy(state.a_href, tag.href, sizeof(state.a_href) - 1);
477 state.a_href[sizeof(state.a_href) - 1] = '\0';
478 buf_init(&state.a_text);
479 }
480 p = after;
481 continue;
482 }
483
484 // <img>
485 if (tag_eq(tag.name, tag.name_len, "img")) {
486 if (tag.src[0]) {
487 flush_text(&result, &text_buf);
488 HTMLElement* e = result_add(&result);
489 e->type = HELEM_IMAGE;
490 e->attr1 = strdup(tag.src);
491 e->attr2 = tag.alt[0] ? strdup(tag.alt) : strdup("Does not contain alt text");
492 }
493 p = after;
494 continue;
495 }
496
497 // <blockquote>
498 if (tag_eq(tag.name, tag.name_len, "blockquote")) {
499 if (tag.is_closing && state.bq_depth > 0) {
500 state.bq_depth--;
501 if (state.bq_depth == 0) {
502 flush_text(&result, &text_buf);
503 HTMLElement* e = result_add(&result);
504 e->type = HELEM_BLOCKQUOTE;
505 e->text = buf_finish(&state.bq_text);
506 if (tag.cite[0]) {
507 e->attr1 = strdup(tag.cite);
508 }
509 if (state.bq_prev.len > 0) {
510 e->attr2 = buf_finish(&state.bq_prev);
511 }
512 buf_init(&state.bq_text);
513 buf_init(&state.bq_prev);
514 }
515 } else if (!tag.is_closing) {
516 if (state.bq_depth == 0) {
517 // Capture preceding text for "On...wrote:" detection
518 // Look back in text_buf for the last line
519 buf_free(&state.bq_prev);
520 buf_init(&state.bq_prev);
521 if (text_buf.len > 0) {
522 // Find last non-empty line
523 int start = (int)text_buf.len - 1;
524 while (start > 0 && text_buf.data[start] == '\n') start--;
525 int line_start = start;
526 while (line_start > 0 && text_buf.data[line_start - 1] != '\n') line_start--;
527 int line_len = start - line_start + 1;
528 if (line_len > 0) {
529 buf_append(&state.bq_prev, text_buf.data + line_start, line_len);
530 }
531 }
532 flush_text(&result, &text_buf);
533 buf_init(&state.bq_text);
534 }
535 if (tag.cite[0]) {
536 strncpy(state.bq_cite, tag.cite, sizeof(state.bq_cite) - 1);
537 }
538 state.bq_depth++;
539 }
540 p = after;
541 continue;
542 }
543
544 // <table>
545 if (tag_eq(tag.name, tag.name_len, "table")) {
546 if (!tag.is_closing) {
547 state.table_depth++;
548 } else if (state.table_depth > 0) {
549 if (state.table_depth == state.capture_depth) {
550 // Closing the data table we're capturing
551 if (state.table_data.len > 0) {
552 flush_text(&result, &text_buf);
553 HTMLElement* e = result_add(&result);
554 e->type = HELEM_TABLE;
555 e->text = buf_finish(&state.table_data);
556 buf_init(&state.table_data);
557 char hdr_buf[16];
558 snprintf(hdr_buf, sizeof(hdr_buf), "%d", state.header_rows);
559 e->attr1 = strdup(hdr_buf);
560 } else {
561 buf_free(&state.table_data);
562 buf_init(&state.table_data);
563 }
564 state.capture_depth = -1;
565 state.in_td = 0;
566 state.in_tr = 0;
567 } else {
568 // Layout or nested table: block spacing
569 if (state.bq_depth > 0) {
570 buf_append(&state.bq_text, "\n\n", 2);
571 } else {
572 buf_append(&text_buf, "\n\n", 2);
573 }
574 state.last_was_block = 1;
575 }
576 state.table_depth--;
577 }
578 p = after;
579 continue;
580 }
581
582 // <thead>
583 if (tag_eq(tag.name, tag.name_len, "thead")) {
584 if (state.table_depth == state.capture_depth) {
585 state.in_thead = !tag.is_closing;
586 }
587 p = after;
588 continue;
589 }
590
591 // <tbody>, <tfoot> - skip tag
592 if (tag_eq(tag.name, tag.name_len, "tbody") ||
593 tag_eq(tag.name, tag.name_len, "tfoot")) {
594 p = after;
595 continue;
596 }
597
598 // <tr>
599 if (tag_eq(tag.name, tag.name_len, "tr")) {
600 if (state.table_depth == state.capture_depth) {
601 // Data table row
602 if (!tag.is_closing) {
603 if (state.row_index > 0) {
604 buf_append_char(&state.table_data, '\n');
605 }
606 state.in_tr = 1;
607 state.cell_index = 0;
608 } else {
609 state.in_tr = 0;
610 if (state.in_thead) state.header_rows++;
611 state.row_index++;
612 }
613 } else if (tag.is_closing) {
614 // Layout table: block spacing
615 if (state.bq_depth > 0) {
616 buf_append(&state.bq_text, "\n\n", 2);
617 } else {
618 buf_append(&text_buf, "\n\n", 2);
619 }
620 state.last_was_block = 1;
621 }
622 p = after;
623 continue;
624 }
625
626 // <td>, <th>
627 if (tag_eq(tag.name, tag.name_len, "td") ||
628 tag_eq(tag.name, tag.name_len, "th")) {
629 // <th> enables capture mode for the current table depth
630 if (tag_eq(tag.name, tag.name_len, "th") && !tag.is_closing &&
631 state.capture_depth < 0 && state.table_depth > 0) {
632 state.capture_depth = state.table_depth;
633 // Start tracking the current row
634 state.in_tr = 1;
635 state.cell_index = 0;
636 state.row_index = 0;
637 state.header_rows = 0;
638 state.in_thead = 1; // <th> implies header
639 buf_free(&state.table_data);
640 buf_init(&state.table_data);
641 }
642
643 if (state.table_depth == state.capture_depth && state.in_tr) {
644 if (!tag.is_closing) {
645 state.in_td = 1;
646 buf_free(&state.cell_text);
647 buf_init(&state.cell_text);
648 } else if (state.in_td) {
649 state.in_td = 0;
650 // Append cell to row (tab-separated)
651 if (state.cell_index > 0) {
652 buf_append_char(&state.table_data, '\t');
653 }
654 if (state.cell_text.len > 0) {
655 buf_append(&state.table_data, state.cell_text.data, state.cell_text.len);
656 }
657 buf_free(&state.cell_text);
658 buf_init(&state.cell_text);
659 state.cell_index++;
660 }
661 }
662 // For layout tables (not capturing at this depth): td/th tags are
663 // ignored, text content flows through to text_buf naturally
664 p = after;
665 continue;
666 }
667
668 // Block elements: add spacing
669 if (tag_eq(tag.name, tag.name_len, "p") ||
670 tag_eq(tag.name, tag.name_len, "div") ||
671 tag_eq(tag.name, tag.name_len, "li") ||
672 tag_eq(tag.name, tag.name_len, "hr")) {
673 if (tag.is_closing || tag_eq(tag.name, tag.name_len, "hr")) {
674 if (state.bq_depth > 0) {
675 buf_append(&state.bq_text, "\n\n", 2);
676 } else if (state.in_td) {
677 buf_append_char(&state.cell_text, ' ');
678 } else {
679 buf_append(&text_buf, "\n\n", 2);
680 }
681 state.last_was_block = 1;
682 }
683 p = after;
684 continue;
685 }
686
687 // <ul>, <ol>, <dl>, etc. - skip tag but process children
688 p = after;
689 continue;
690 }
691
692 // Text content
693 if (state.in_style || state.in_script) {
694 p++;
695 continue;
696 }
697
698 // Handle entities
699 if (*p == '&') {
700 Buffer* target;
701 if (state.in_td) target = &state.cell_text;
702 else if (state.in_a) target = &state.a_text;
703 else if (state.in_h1 || state.in_h2) target = &state.h_text;
704 else if (state.bq_depth > 0) target = &state.bq_text;
705 else target = &text_buf;
706
707 size_t consumed = decode_entity(p, end - p, target);
708 p += consumed;
709 continue;
710 }
711
712 // Regular character — collapse whitespace like HTML (unless in <pre>)
713 char c = *p++;
714 if (state.in_td) {
715 buf_append_html_char(&state.cell_text, c, state.in_pre);
716 } else if (state.in_a) {
717 buf_append_html_char(&state.a_text, c, state.in_pre);
718 } else if (state.in_h1 || state.in_h2) {
719 buf_append_html_char(&state.h_text, c, state.in_pre);
720 } else if (state.bq_depth > 0) {
721 buf_append_html_char(&state.bq_text, c, state.in_pre);
722 } else {
723 buf_append_html_char(&text_buf, c, state.in_pre);
724 }
725 }
726
727 // Flush remaining text
728 flush_text(&result, &text_buf);
729
730 // Flush any unclosed elements
731 if (state.in_h1 || state.in_h2) {
732 HTMLElement* e = result_add(&result);
733 e->type = state.in_h1 ? HELEM_H1 : HELEM_H2;
734 e->text = buf_finish(&state.h_text);
735 } else {
736 buf_free(&state.h_text);
737 }
738
739 if (state.in_a) {
740 HTMLElement* e = result_add(&result);
741 e->type = HELEM_LINK;
742 e->text = buf_finish(&state.a_text);
743 e->attr1 = strdup(state.a_href);
744 } else {
745 buf_free(&state.a_text);
746 }
747
748 if (state.bq_depth > 0) {
749 HTMLElement* e = result_add(&result);
750 e->type = HELEM_BLOCKQUOTE;
751 e->text = buf_finish(&state.bq_text);
752 if (state.bq_prev.len > 0) {
753 e->attr2 = buf_finish(&state.bq_prev);
754 }
755 } else {
756 buf_free(&state.bq_text);
757 buf_free(&state.bq_prev);
758 }
759
760 // Flush unclosed table
761 if (state.capture_depth > 0 && state.table_data.len > 0) {
762 HTMLElement* e = result_add(&result);
763 e->type = HELEM_TABLE;
764 e->text = buf_finish(&state.table_data);
765 char hdr_buf[16];
766 snprintf(hdr_buf, sizeof(hdr_buf), "%d", state.header_rows);
767 e->attr1 = strdup(hdr_buf);
768 } else {
769 buf_free(&state.table_data);
770 }
771 buf_free(&state.cell_text);
772
773 result.ok = 1;
774 return result;
775}
776
777void free_html_result(HTMLConvertResult* r) {
778 if (!r) return;
779 for (int i = 0; i < r->count; i++) {
780 free(r->elements[i].text);
781 free(r->elements[i].attr1);
782 free(r->elements[i].attr2);
783 }
784 free(r->elements);
785 r->elements = NULL;
786 r->count = 0;
787 r->cap = 0;
788}