htmlconv.c

  1#include "htmlconv.h"
  2#include <stdlib.h>
  3#include <string.h>
  4#include <ctype.h>
  5#include <stdio.h>
  6
  7// --- Dynamic buffer ---
  8
// Growable byte buffer that the parser appends output text into.
typedef struct {
    char* data;   // heap storage, grown with realloc(); NULL while nothing is allocated
    size_t len;   // number of bytes currently stored in data
    size_t cap;   // number of bytes allocated for data
} Buffer;
 14
 15static void buf_init(Buffer* b) {
 16    b->data = NULL;
 17    b->len = 0;
 18    b->cap = 0;
 19}
 20
 21static void buf_ensure(Buffer* b, size_t extra) {
 22    size_t needed = b->len + extra;
 23    if (needed <= b->cap) return;
 24    size_t newcap = b->cap ? b->cap * 2 : 256;
 25    while (newcap < needed) newcap *= 2;
 26    b->data = (char*)realloc(b->data, newcap);
 27    b->cap = newcap;
 28}
 29
 30static void buf_append(Buffer* b, const char* s, size_t n) {
 31    if (n == 0) return;
 32    buf_ensure(b, n);
 33    memcpy(b->data + b->len, s, n);
 34    b->len += n;
 35}
 36
 37static void buf_append_char(Buffer* b, char c) {
 38    buf_ensure(b, 1);
 39    b->data[b->len++] = c;
 40}
 41
 42static char* buf_finish(Buffer* b) {
 43    buf_append_char(b, '\0');
 44    return b->data;
 45}
 46
 47static void buf_free(Buffer* b) {
 48    free(b->data);
 49    b->data = NULL;
 50    b->len = 0;
 51    b->cap = 0;
 52}
 53
 54// Append a text character with HTML whitespace collapsing.
 55// In non-pre mode, collapses runs of whitespace to a single space.
 56static void buf_append_html_char(Buffer* b, char c, int in_pre) {
 57    if (in_pre) {
 58        buf_append_char(b, c);
 59        return;
 60    }
 61    if (c == '\n' || c == '\r' || c == '\t') c = ' ';
 62    if (c == ' ') {
 63        // Skip if buffer already ends with space or is empty
 64        if (b->len == 0 || b->data[b->len - 1] == ' ' || b->data[b->len - 1] == '\n') return;
 65    }
 66    buf_append_char(b, c);
 67}
 68
 69// --- Result helpers ---
 70
 71static void result_init(HTMLConvertResult* r) {
 72    r->elements = NULL;
 73    r->count = 0;
 74    r->cap = 0;
 75    r->ok = 0;
 76}
 77
 78static HTMLElement* result_add(HTMLConvertResult* r) {
 79    if (r->count >= r->cap) {
 80        int newcap = r->cap ? r->cap * 2 : 32;
 81        r->elements = (HTMLElement*)realloc(r->elements, sizeof(HTMLElement) * newcap);
 82        r->cap = newcap;
 83    }
 84    HTMLElement* e = &r->elements[r->count++];
 85    e->type = HELEM_TEXT;
 86    e->text = NULL;
 87    e->attr1 = NULL;
 88    e->attr2 = NULL;
 89    return e;
 90}
 91
 92// Flush accumulated text buffer as a TEXT element.
 93static void flush_text(HTMLConvertResult* r, Buffer* buf) {
 94    if (buf->len == 0) return;
 95    HTMLElement* e = result_add(r);
 96    e->type = HELEM_TEXT;
 97    e->text = buf_finish(buf);
 98    buf_init(buf);
 99}
100
101// --- HTML entity decoding ---
102
103static size_t decode_entity(const char* s, size_t len, Buffer* out) {
104    // s points to '&', returns number of chars consumed
105    if (len < 2) { buf_append_char(out, '&'); return 1; }
106
107    // Find the ';'
108    size_t end = 1;
109    while (end < len && end < 12 && s[end] != ';') end++;
110    if (end >= len || s[end] != ';') { buf_append_char(out, '&'); return 1; }
111
112    size_t ent_len = end - 1; // length of entity name (between & and ;)
113    const char* name = s + 1;
114
115    // Numeric entities
116    if (ent_len >= 2 && name[0] == '#') {
117        unsigned long cp = 0;
118        if (name[1] == 'x' || name[1] == 'X') {
119            for (size_t i = 2; i < ent_len; i++) {
120                char c = name[i];
121                if (c >= '0' && c <= '9') cp = cp * 16 + (c - '0');
122                else if (c >= 'a' && c <= 'f') cp = cp * 16 + 10 + (c - 'a');
123                else if (c >= 'A' && c <= 'F') cp = cp * 16 + 10 + (c - 'A');
124                else break;
125            }
126        } else {
127            for (size_t i = 1; i < ent_len; i++) {
128                if (name[i] >= '0' && name[i] <= '9') cp = cp * 10 + (name[i] - '0');
129                else break;
130            }
131        }
132        // Encode as UTF-8
133        if (cp < 0x80) {
134            buf_append_char(out, (char)cp);
135        } else if (cp < 0x800) {
136            buf_append_char(out, (char)(0xC0 | (cp >> 6)));
137            buf_append_char(out, (char)(0x80 | (cp & 0x3F)));
138        } else if (cp < 0x10000) {
139            buf_append_char(out, (char)(0xE0 | (cp >> 12)));
140            buf_append_char(out, (char)(0x80 | ((cp >> 6) & 0x3F)));
141            buf_append_char(out, (char)(0x80 | (cp & 0x3F)));
142        } else if (cp < 0x110000) {
143            buf_append_char(out, (char)(0xF0 | (cp >> 18)));
144            buf_append_char(out, (char)(0x80 | ((cp >> 12) & 0x3F)));
145            buf_append_char(out, (char)(0x80 | ((cp >> 6) & 0x3F)));
146            buf_append_char(out, (char)(0x80 | (cp & 0x3F)));
147        }
148        return end + 1;
149    }
150
151    // Named entities (common ones)
152    struct { const char* name; const char* value; } entities[] = {
153        {"lt", "<"}, {"gt", ">"}, {"amp", "&"}, {"quot", "\""},
154        {"apos", "'"}, {"nbsp", " "}, {"ndash", "\xe2\x80\x93"},
155        {"mdash", "\xe2\x80\x94"}, {"laquo", "\xc2\xab"},
156        {"raquo", "\xc2\xbb"}, {"copy", "\xc2\xa9"},
157        {"reg", "\xc2\xae"}, {"trade", "\xe2\x84\xa2"},
158        {"hellip", "\xe2\x80\xa6"}, {"bull", "\xe2\x80\xa2"},
159        {"rsquo", "\xe2\x80\x99"}, {"lsquo", "\xe2\x80\x98"},
160        {"rdquo", "\xe2\x80\x9d"}, {"ldquo", "\xe2\x80\x9c"},
161        {NULL, NULL}
162    };
163
164    for (int i = 0; entities[i].name; i++) {
165        if (ent_len == strlen(entities[i].name) &&
166            strncmp(name, entities[i].name, ent_len) == 0) {
167            buf_append(out, entities[i].value, strlen(entities[i].value));
168            return end + 1;
169        }
170    }
171
172    // Unknown entity - pass through
173    buf_append(out, s, end + 1);
174    return end + 1;
175}
176
177// --- Tag parsing ---
178
// Result of parsing one tag with parse_tag(). The whole struct is zeroed
// before parsing, so attributes that do not appear are empty strings.
typedef struct {
    char name[64];        // tag name exactly as written (case preserved), NUL-terminated
    int name_len;         // number of bytes stored in name (at most 63)
    int is_closing;       // 1 when a '/' directly precedes the tag name (e.g. </p>)
    int is_self_closing;  // 1 when a '/' was seen just before the end of the tag (e.g. <br/>)
    // Attributes (we parse href, src, alt, cite)
    // Values are stored without their quotes and are truncated to fit.
    char href[2048];
    char src[2048];
    char alt[512];
    char cite[2048];
} Tag;
190
// Case-insensitive equality test between the first `alen` bytes of `a` and
// the NUL-terminated string `b`. Returns 1 when the lengths match and every
// byte compares equal after tolower(), otherwise 0.
static int tag_eq(const char* a, int alen, const char* b) {
    if ((int)strlen(b) != alen) return 0;

    const unsigned char* lhs = (const unsigned char*)a;
    const unsigned char* rhs = (const unsigned char*)b;
    int remaining = alen;
    while (remaining-- > 0) {
        if (tolower(*lhs++) != tolower(*rhs++)) return 0;
    }
    return 1;
}
200
// Parse an attribute value (handles both quoted and unquoted).
//
// `p` points at the first byte of the value and `end` is one past the last
// readable byte. The value is copied into `out` (at most out_size - 1 bytes;
// longer values are truncated but still consumed) and `out` is always
// NUL-terminated when out_size > 0. With out_size == 0 nothing is written.
//
// - Quoted values ('...' or "...") run to the matching quote, which is
//   consumed; they may contain '>', '/' and whitespace.
// - Unquoted values run to the next whitespace or '>'. A '/' is part of the
//   value (so URLs such as http://host/path survive) unless it is immediately
//   followed by '>', in which case it is left for the caller as the
//   self-closing marker.
//
// Returns pointer past the parsed value.
static const char* parse_attr_value(const char* p, const char* end, char* out, size_t out_size) {
    size_t i = 0;

    if (p < end) {
        char quote = 0;
        if (*p == '"' || *p == '\'') {
            quote = *p++;
        }
        while (p < end) {
            if (quote) {
                if (*p == quote) { p++; break; }
            } else {
                if (isspace((unsigned char)*p) || *p == '>') break;
                if (*p == '/' && p + 1 < end && p[1] == '>') break;
            }
            // i + 1 < out_size cannot underflow, unlike i < out_size - 1.
            if (i + 1 < out_size) out[i++] = *p;
            p++;
        }
    }

    if (out_size > 0) out[i] = '\0';
    return p;
}
222
223// Parse a tag starting after '<'. Returns pointer past '>'.
224static const char* parse_tag(const char* p, const char* end, Tag* tag) {
225    memset(tag, 0, sizeof(*tag));
226
227    // Skip whitespace after '<'
228    while (p < end && isspace((unsigned char)*p)) p++;
229
230    // Check closing tag
231    if (p < end && *p == '/') {
232        tag->is_closing = 1;
233        p++;
234    }
235
236    // Parse tag name
237    while (p < end && !isspace((unsigned char)*p) && *p != '>' && *p != '/' &&
238           tag->name_len < 63) {
239        tag->name[tag->name_len++] = *p++;
240    }
241    tag->name[tag->name_len] = '\0';
242
243    // Parse attributes
244    while (p < end && *p != '>') {
245        // Skip whitespace
246        while (p < end && isspace((unsigned char)*p)) p++;
247        if (p >= end || *p == '>' || *p == '/') break;
248
249        // Parse attribute name
250        char attr_name[64] = {0};
251        int an = 0;
252        while (p < end && *p != '=' && *p != '>' && !isspace((unsigned char)*p) && an < 63) {
253            attr_name[an++] = tolower((unsigned char)*p++);
254        }
255        attr_name[an] = '\0';
256
257        // Skip '='
258        while (p < end && isspace((unsigned char)*p)) p++;
259        if (p < end && *p == '=') {
260            p++;
261            while (p < end && isspace((unsigned char)*p)) p++;
262
263            // Parse value into correct field
264            if (strcmp(attr_name, "href") == 0) {
265                p = parse_attr_value(p, end, tag->href, sizeof(tag->href));
266            } else if (strcmp(attr_name, "src") == 0) {
267                p = parse_attr_value(p, end, tag->src, sizeof(tag->src));
268            } else if (strcmp(attr_name, "alt") == 0) {
269                p = parse_attr_value(p, end, tag->alt, sizeof(tag->alt));
270            } else if (strcmp(attr_name, "cite") == 0) {
271                p = parse_attr_value(p, end, tag->cite, sizeof(tag->cite));
272            } else {
273                // Skip unknown attribute value
274                char discard[4096];
275                p = parse_attr_value(p, end, discard, sizeof(discard));
276            }
277        }
278    }
279
280    // Check self-closing and skip past '>'
281    if (p < end && *p == '/') {
282        tag->is_self_closing = 1;
283        p++;
284    }
285    if (p < end && *p == '>') p++;
286
287    return p;
288}
289
290// --- Main parser ---
291
// Capacity intended for a tag stack used to track nesting.
// NOTE(review): nothing in the visible code references MAX_STACK; nesting is
// tracked with the flags and depth counters in ParseState instead. Confirm
// whether this macro can be removed.
#define MAX_STACK 128
294
// Mutable state carried through html_to_elements() while it walks the input.
// The struct is zeroed at the start of a parse and capture_depth is then set
// to -1.
typedef struct {
    int in_style;      // Inside <style>
    int in_script;     // Inside <script>
    int in_pre;        // Inside <pre> (whitespace is kept verbatim)
    int in_a;          // Inside <a> (only set for anchors that have an href)
    char a_href[2048]; // Current link href
    Buffer a_text;     // Current link text accumulator
    int in_h1;
    int in_h2;
    Buffer h_text;     // Current header text accumulator (shared by h1 and h2)
    int bq_depth;      // Blockquote nesting depth
    Buffer bq_text;    // Current blockquote text accumulator
    char bq_cite[2048]; // cite attribute copied from an opening <blockquote>
    Buffer bq_prev;    // Text before blockquote (for "On...wrote:" detection)
    int last_was_block; // Last element was a block (for spacing)
    // Table state
    int table_depth;    // Table nesting depth (0 = not in any table)
    int capture_depth;  // Depth at which we're capturing data table (-1 = not capturing)
    int in_thead;       // Inside <thead> (at capture depth); also set when a <th> starts capture
    int in_tr;          // Inside <tr> (at capture depth, capturing mode)
    int in_td;          // Inside <td>/<th> (at capture depth, capturing mode)
    int cell_index;     // Cell index within current row
    int row_index;      // Row index within current table
    int header_rows;    // Number of header rows (rows closed while in_thead is set)
    Buffer cell_text;   // Current cell text accumulator
    Buffer table_data;  // Accumulated table data (cells tab-separated, rows newline-separated)
} ParseState;
322
323HTMLConvertResult html_to_elements(const char* html, size_t len) {
324    HTMLConvertResult result;
325    result_init(&result);
326
327    if (!html || len == 0) {
328        result.ok = 1;
329        return result;
330    }
331
332    ParseState state;
333    memset(&state, 0, sizeof(state));
334    state.capture_depth = -1;  // Not capturing
335    buf_init(&state.a_text);
336    buf_init(&state.h_text);
337    buf_init(&state.bq_text);
338    buf_init(&state.bq_prev);
339    buf_init(&state.cell_text);
340    buf_init(&state.table_data);
341
342    Buffer text_buf;
343    buf_init(&text_buf);
344
345    const char* p = html;
346    const char* end = html + len;
347
348    while (p < end) {
349        if (*p == '<') {
350            // Check for comment
351            if (p + 3 < end && p[1] == '!' && p[2] == '-' && p[3] == '-') {
352                const char* ce = strstr(p + 4, "-->");
353                if (ce) { p = ce + 3; continue; }
354                p++;
355                continue;
356            }
357
358            // Check for DOCTYPE/CDATA
359            if (p + 1 < end && p[1] == '!') {
360                const char* gt = memchr(p, '>', end - p);
361                if (gt) { p = gt + 1; continue; }
362                p++;
363                continue;
364            }
365
366            Tag tag;
367            const char* after = parse_tag(p + 1, end, &tag);
368
369            // Handle specific tags
370            if (tag_eq(tag.name, tag.name_len, "style")) {
371                if (tag.is_closing) state.in_style = 0;
372                else state.in_style = 1;
373                p = after;
374                continue;
375            }
376            if (tag_eq(tag.name, tag.name_len, "script")) {
377                if (tag.is_closing) state.in_script = 0;
378                else state.in_script = 1;
379                p = after;
380                continue;
381            }
382
383            if (state.in_style || state.in_script) {
384                p = after;
385                continue;
386            }
387
388            // <br> -> newline
389            if (tag_eq(tag.name, tag.name_len, "br")) {
390                if (state.in_td) {
391                    buf_append_char(&state.cell_text, ' ');
392                } else if (state.in_a) {
393                    buf_append_char(&state.a_text, '\n');
394                } else if (state.in_h1 || state.in_h2) {
395                    buf_append_char(&state.h_text, ' ');
396                } else if (state.bq_depth > 0) {
397                    buf_append_char(&state.bq_text, '\n');
398                } else {
399                    buf_append_char(&text_buf, '\n');
400                }
401                p = after;
402                continue;
403            }
404
405            // <pre>
406            if (tag_eq(tag.name, tag.name_len, "pre")) {
407                state.in_pre = !tag.is_closing;
408                p = after;
409                continue;
410            }
411
412            // <h1>
413            if (tag_eq(tag.name, tag.name_len, "h1")) {
414                if (tag.is_closing && state.in_h1) {
415                    state.in_h1 = 0;
416                    flush_text(&result, &text_buf);
417                    HTMLElement* e = result_add(&result);
418                    e->type = HELEM_H1;
419                    e->text = buf_finish(&state.h_text);
420                    buf_init(&state.h_text);
421                    // Add block spacing
422                    HTMLElement* sp = result_add(&result);
423                    sp->type = HELEM_TEXT;
424                    sp->text = strdup("\n\n");
425                } else if (!tag.is_closing) {
426                    flush_text(&result, &text_buf);
427                    state.in_h1 = 1;
428                    buf_init(&state.h_text);
429                }
430                p = after;
431                continue;
432            }
433
434            // <h2>
435            if (tag_eq(tag.name, tag.name_len, "h2")) {
436                if (tag.is_closing && state.in_h2) {
437                    state.in_h2 = 0;
438                    flush_text(&result, &text_buf);
439                    HTMLElement* e = result_add(&result);
440                    e->type = HELEM_H2;
441                    e->text = buf_finish(&state.h_text);
442                    buf_init(&state.h_text);
443                    HTMLElement* sp = result_add(&result);
444                    sp->type = HELEM_TEXT;
445                    sp->text = strdup("\n\n");
446                } else if (!tag.is_closing) {
447                    flush_text(&result, &text_buf);
448                    state.in_h2 = 1;
449                    buf_init(&state.h_text);
450                }
451                p = after;
452                continue;
453            }
454
455            // <a>
456            if (tag_eq(tag.name, tag.name_len, "a")) {
457                if (tag.is_closing && state.in_a) {
458                    state.in_a = 0;
459                    // If inside blockquote, emit link text inline
460                    if (state.bq_depth > 0) {
461                        if (state.a_text.len > 0) {
462                            buf_append(&state.bq_text, state.a_text.data, state.a_text.len);
463                        }
464                        buf_free(&state.a_text);
465                    } else {
466                        flush_text(&result, &text_buf);
467                        HTMLElement* e = result_add(&result);
468                        e->type = HELEM_LINK;
469                        e->text = buf_finish(&state.a_text);
470                        e->attr1 = strdup(state.a_href);
471                        buf_init(&state.a_text);
472                    }
473                } else if (!tag.is_closing && tag.href[0]) {
474                    if (state.bq_depth == 0) flush_text(&result, &text_buf);
475                    state.in_a = 1;
476                    strncpy(state.a_href, tag.href, sizeof(state.a_href) - 1);
477                    state.a_href[sizeof(state.a_href) - 1] = '\0';
478                    buf_init(&state.a_text);
479                }
480                p = after;
481                continue;
482            }
483
484            // <img>
485            if (tag_eq(tag.name, tag.name_len, "img")) {
486                if (tag.src[0]) {
487                    flush_text(&result, &text_buf);
488                    HTMLElement* e = result_add(&result);
489                    e->type = HELEM_IMAGE;
490                    e->attr1 = strdup(tag.src);
491                    e->attr2 = tag.alt[0] ? strdup(tag.alt) : strdup("Does not contain alt text");
492                }
493                p = after;
494                continue;
495            }
496
497            // <blockquote>
498            if (tag_eq(tag.name, tag.name_len, "blockquote")) {
499                if (tag.is_closing && state.bq_depth > 0) {
500                    state.bq_depth--;
501                    if (state.bq_depth == 0) {
502                        flush_text(&result, &text_buf);
503                        HTMLElement* e = result_add(&result);
504                        e->type = HELEM_BLOCKQUOTE;
505                        e->text = buf_finish(&state.bq_text);
506                        if (tag.cite[0]) {
507                            e->attr1 = strdup(tag.cite);
508                        }
509                        if (state.bq_prev.len > 0) {
510                            e->attr2 = buf_finish(&state.bq_prev);
511                        }
512                        buf_init(&state.bq_text);
513                        buf_init(&state.bq_prev);
514                    }
515                } else if (!tag.is_closing) {
516                    if (state.bq_depth == 0) {
517                        // Capture preceding text for "On...wrote:" detection
518                        // Look back in text_buf for the last line
519                        buf_free(&state.bq_prev);
520                        buf_init(&state.bq_prev);
521                        if (text_buf.len > 0) {
522                            // Find last non-empty line
523                            int start = (int)text_buf.len - 1;
524                            while (start > 0 && text_buf.data[start] == '\n') start--;
525                            int line_start = start;
526                            while (line_start > 0 && text_buf.data[line_start - 1] != '\n') line_start--;
527                            int line_len = start - line_start + 1;
528                            if (line_len > 0) {
529                                buf_append(&state.bq_prev, text_buf.data + line_start, line_len);
530                            }
531                        }
532                        flush_text(&result, &text_buf);
533                        buf_init(&state.bq_text);
534                    }
535                    if (tag.cite[0]) {
536                        strncpy(state.bq_cite, tag.cite, sizeof(state.bq_cite) - 1);
537                    }
538                    state.bq_depth++;
539                }
540                p = after;
541                continue;
542            }
543
544            // <table>
545            if (tag_eq(tag.name, tag.name_len, "table")) {
546                if (!tag.is_closing) {
547                    state.table_depth++;
548                } else if (state.table_depth > 0) {
549                    if (state.table_depth == state.capture_depth) {
550                        // Closing the data table we're capturing
551                        if (state.table_data.len > 0) {
552                            flush_text(&result, &text_buf);
553                            HTMLElement* e = result_add(&result);
554                            e->type = HELEM_TABLE;
555                            e->text = buf_finish(&state.table_data);
556                            buf_init(&state.table_data);
557                            char hdr_buf[16];
558                            snprintf(hdr_buf, sizeof(hdr_buf), "%d", state.header_rows);
559                            e->attr1 = strdup(hdr_buf);
560                        } else {
561                            buf_free(&state.table_data);
562                            buf_init(&state.table_data);
563                        }
564                        state.capture_depth = -1;
565                        state.in_td = 0;
566                        state.in_tr = 0;
567                    } else {
568                        // Layout or nested table: block spacing
569                        if (state.bq_depth > 0) {
570                            buf_append(&state.bq_text, "\n\n", 2);
571                        } else {
572                            buf_append(&text_buf, "\n\n", 2);
573                        }
574                        state.last_was_block = 1;
575                    }
576                    state.table_depth--;
577                }
578                p = after;
579                continue;
580            }
581
582            // <thead>
583            if (tag_eq(tag.name, tag.name_len, "thead")) {
584                if (state.table_depth == state.capture_depth) {
585                    state.in_thead = !tag.is_closing;
586                }
587                p = after;
588                continue;
589            }
590
591            // <tbody>, <tfoot> - skip tag
592            if (tag_eq(tag.name, tag.name_len, "tbody") ||
593                tag_eq(tag.name, tag.name_len, "tfoot")) {
594                p = after;
595                continue;
596            }
597
598            // <tr>
599            if (tag_eq(tag.name, tag.name_len, "tr")) {
600                if (state.table_depth == state.capture_depth) {
601                    // Data table row
602                    if (!tag.is_closing) {
603                        if (state.row_index > 0) {
604                            buf_append_char(&state.table_data, '\n');
605                        }
606                        state.in_tr = 1;
607                        state.cell_index = 0;
608                    } else {
609                        state.in_tr = 0;
610                        if (state.in_thead) state.header_rows++;
611                        state.row_index++;
612                    }
613                } else if (tag.is_closing) {
614                    // Layout table: block spacing
615                    if (state.bq_depth > 0) {
616                        buf_append(&state.bq_text, "\n\n", 2);
617                    } else {
618                        buf_append(&text_buf, "\n\n", 2);
619                    }
620                    state.last_was_block = 1;
621                }
622                p = after;
623                continue;
624            }
625
626            // <td>, <th>
627            if (tag_eq(tag.name, tag.name_len, "td") ||
628                tag_eq(tag.name, tag.name_len, "th")) {
629                // <th> enables capture mode for the current table depth
630                if (tag_eq(tag.name, tag.name_len, "th") && !tag.is_closing &&
631                    state.capture_depth < 0 && state.table_depth > 0) {
632                    state.capture_depth = state.table_depth;
633                    // Start tracking the current row
634                    state.in_tr = 1;
635                    state.cell_index = 0;
636                    state.row_index = 0;
637                    state.header_rows = 0;
638                    state.in_thead = 1; // <th> implies header
639                    buf_free(&state.table_data);
640                    buf_init(&state.table_data);
641                }
642
643                if (state.table_depth == state.capture_depth && state.in_tr) {
644                    if (!tag.is_closing) {
645                        state.in_td = 1;
646                        buf_free(&state.cell_text);
647                        buf_init(&state.cell_text);
648                    } else if (state.in_td) {
649                        state.in_td = 0;
650                        // Append cell to row (tab-separated)
651                        if (state.cell_index > 0) {
652                            buf_append_char(&state.table_data, '\t');
653                        }
654                        if (state.cell_text.len > 0) {
655                            buf_append(&state.table_data, state.cell_text.data, state.cell_text.len);
656                        }
657                        buf_free(&state.cell_text);
658                        buf_init(&state.cell_text);
659                        state.cell_index++;
660                    }
661                }
662                // For layout tables (not capturing at this depth): td/th tags are
663                // ignored, text content flows through to text_buf naturally
664                p = after;
665                continue;
666            }
667
668            // Block elements: add spacing
669            if (tag_eq(tag.name, tag.name_len, "p") ||
670                tag_eq(tag.name, tag.name_len, "div") ||
671                tag_eq(tag.name, tag.name_len, "li") ||
672                tag_eq(tag.name, tag.name_len, "hr")) {
673                if (tag.is_closing || tag_eq(tag.name, tag.name_len, "hr")) {
674                    if (state.bq_depth > 0) {
675                        buf_append(&state.bq_text, "\n\n", 2);
676                    } else if (state.in_td) {
677                        buf_append_char(&state.cell_text, ' ');
678                    } else {
679                        buf_append(&text_buf, "\n\n", 2);
680                    }
681                    state.last_was_block = 1;
682                }
683                p = after;
684                continue;
685            }
686
687            // <ul>, <ol>, <dl>, etc. - skip tag but process children
688            p = after;
689            continue;
690        }
691
692        // Text content
693        if (state.in_style || state.in_script) {
694            p++;
695            continue;
696        }
697
698        // Handle entities
699        if (*p == '&') {
700            Buffer* target;
701            if (state.in_td) target = &state.cell_text;
702            else if (state.in_a) target = &state.a_text;
703            else if (state.in_h1 || state.in_h2) target = &state.h_text;
704            else if (state.bq_depth > 0) target = &state.bq_text;
705            else target = &text_buf;
706
707            size_t consumed = decode_entity(p, end - p, target);
708            p += consumed;
709            continue;
710        }
711
712        // Regular character — collapse whitespace like HTML (unless in <pre>)
713        char c = *p++;
714        if (state.in_td) {
715            buf_append_html_char(&state.cell_text, c, state.in_pre);
716        } else if (state.in_a) {
717            buf_append_html_char(&state.a_text, c, state.in_pre);
718        } else if (state.in_h1 || state.in_h2) {
719            buf_append_html_char(&state.h_text, c, state.in_pre);
720        } else if (state.bq_depth > 0) {
721            buf_append_html_char(&state.bq_text, c, state.in_pre);
722        } else {
723            buf_append_html_char(&text_buf, c, state.in_pre);
724        }
725    }
726
727    // Flush remaining text
728    flush_text(&result, &text_buf);
729
730    // Flush any unclosed elements
731    if (state.in_h1 || state.in_h2) {
732        HTMLElement* e = result_add(&result);
733        e->type = state.in_h1 ? HELEM_H1 : HELEM_H2;
734        e->text = buf_finish(&state.h_text);
735    } else {
736        buf_free(&state.h_text);
737    }
738
739    if (state.in_a) {
740        HTMLElement* e = result_add(&result);
741        e->type = HELEM_LINK;
742        e->text = buf_finish(&state.a_text);
743        e->attr1 = strdup(state.a_href);
744    } else {
745        buf_free(&state.a_text);
746    }
747
748    if (state.bq_depth > 0) {
749        HTMLElement* e = result_add(&result);
750        e->type = HELEM_BLOCKQUOTE;
751        e->text = buf_finish(&state.bq_text);
752        if (state.bq_prev.len > 0) {
753            e->attr2 = buf_finish(&state.bq_prev);
754        }
755    } else {
756        buf_free(&state.bq_text);
757        buf_free(&state.bq_prev);
758    }
759
760    // Flush unclosed table
761    if (state.capture_depth > 0 && state.table_data.len > 0) {
762        HTMLElement* e = result_add(&result);
763        e->type = HELEM_TABLE;
764        e->text = buf_finish(&state.table_data);
765        char hdr_buf[16];
766        snprintf(hdr_buf, sizeof(hdr_buf), "%d", state.header_rows);
767        e->attr1 = strdup(hdr_buf);
768    } else {
769        buf_free(&state.table_data);
770    }
771    buf_free(&state.cell_text);
772
773    result.ok = 1;
774    return result;
775}
776
777void free_html_result(HTMLConvertResult* r) {
778    if (!r) return;
779    for (int i = 0; i < r->count; i++) {
780        free(r->elements[i].text);
781        free(r->elements[i].attr1);
782        free(r->elements[i].attr2);
783    }
784    free(r->elements);
785    r->elements = NULL;
786    r->count = 0;
787    r->cap = 0;
788}