md4c-html.c

  1#include <stdio.h>
  2#include <string.h>
  3
  4#include "md4c-html.h"
  5#include "entity.h"
  6
  7#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199409L
  8
  9    #if defined __GNUC__
 10        #define inline __inline__
 11    #elif defined _MSC_VER
 12        #define inline __inline
 13    #else
 14        #define inline
 15    #endif
 16#endif
 17
 18#ifdef _WIN32
 19    #define snprintf _snprintf
 20#endif
 21
 22typedef struct MD_HTML_tag MD_HTML;
 23struct MD_HTML_tag {
 24    void (*process_output)(const MD_CHAR*, MD_SIZE, void*);
 25    void* userdata;
 26    unsigned flags;
 27    int image_nesting_level;
 28    char escape_map[256];
 29};
 30
 31#define NEED_HTML_ESC_FLAG   0x1
 32#define NEED_URL_ESC_FLAG    0x2
 33
 34#define ISDIGIT(ch)     ('0' <= (ch) && (ch) <= '9')
 35#define ISLOWER(ch)     ('a' <= (ch) && (ch) <= 'z')
 36#define ISUPPER(ch)     ('A' <= (ch) && (ch) <= 'Z')
 37#define ISALNUM(ch)     (ISLOWER(ch) || ISUPPER(ch) || ISDIGIT(ch))
 38
 39static inline void
 40render_verbatim(MD_HTML* r, const MD_CHAR* text, MD_SIZE size)
 41{
 42    r->process_output(text, size, r->userdata);
 43}
 44
 45#define RENDER_VERBATIM(r, verbatim)                                    \
 46        render_verbatim((r), (verbatim), (MD_SIZE) (strlen(verbatim)))
 47
 48static void
 49render_html_escaped(MD_HTML* r, const MD_CHAR* data, MD_SIZE size)
 50{
 51    MD_OFFSET beg = 0;
 52    MD_OFFSET off = 0;
 53
 54    #define NEED_HTML_ESC(ch)   (r->escape_map[(unsigned char)(ch)] & NEED_HTML_ESC_FLAG)
 55
 56    while(1) {
 57
 58        while(off + 3 < size  &&  !NEED_HTML_ESC(data[off+0])  &&  !NEED_HTML_ESC(data[off+1])
 59                              &&  !NEED_HTML_ESC(data[off+2])  &&  !NEED_HTML_ESC(data[off+3]))
 60            off += 4;
 61        while(off < size  &&  !NEED_HTML_ESC(data[off]))
 62            off++;
 63
 64        if(off > beg)
 65            render_verbatim(r, data + beg, off - beg);
 66
 67        if(off < size) {
 68            switch(data[off]) {
 69                case '&':   RENDER_VERBATIM(r, "&amp;"); break;
 70                case '<':   RENDER_VERBATIM(r, "&lt;"); break;
 71                case '>':   RENDER_VERBATIM(r, "&gt;"); break;
 72                case '"':   RENDER_VERBATIM(r, "&quot;"); break;
 73            }
 74            off++;
 75        } else {
 76            break;
 77        }
 78        beg = off;
 79    }
 80}
 81
 82static void
 83render_url_escaped(MD_HTML* r, const MD_CHAR* data, MD_SIZE size)
 84{
 85    static const MD_CHAR hex_chars[] = "0123456789ABCDEF";
 86    MD_OFFSET beg = 0;
 87    MD_OFFSET off = 0;
 88
 89    #define NEED_URL_ESC(ch)    (r->escape_map[(unsigned char)(ch)] & NEED_URL_ESC_FLAG)
 90
 91    while(1) {
 92        while(off < size  &&  !NEED_URL_ESC(data[off]))
 93            off++;
 94        if(off > beg)
 95            render_verbatim(r, data + beg, off - beg);
 96
 97        if(off < size) {
 98            char hex[3];
 99
100            switch(data[off]) {
101                case '&':   RENDER_VERBATIM(r, "&amp;"); break;
102                default:
103                    hex[0] = '%';
104                    hex[1] = hex_chars[((unsigned)data[off] >> 4) & 0xf];
105                    hex[2] = hex_chars[((unsigned)data[off] >> 0) & 0xf];
106                    render_verbatim(r, hex, 3);
107                    break;
108            }
109            off++;
110        } else {
111            break;
112        }
113
114        beg = off;
115    }
116}
117
/* Translate one hexadecimal digit to its numeric value.
 *
 * The caller must pass a valid digit ('0'-'9', 'A'-'F' or 'a'-'f'); the
 * result for any other character is meaningless. */
static unsigned
hex_val(char ch)
{
    if(ch >= 'A'  &&  ch <= 'Z')
        return ch - 'A' + 10;

    if(ch >= '0'  &&  ch <= '9')
        return ch - '0';

    /* Everything else is treated as a lower-case letter. */
    return ch - 'a' + 10;
}
128
129static void
130render_utf8_codepoint(MD_HTML* r, unsigned codepoint,
131                      void (*fn_append)(MD_HTML*, const MD_CHAR*, MD_SIZE))
132{
133    static const MD_CHAR utf8_replacement_char[] = { (char)0xef, (char)0xbf, (char)0xbd };
134
135    unsigned char utf8[4];
136    size_t n;
137
138    if(codepoint <= 0x7f) {
139        n = 1;
140        utf8[0] = codepoint;
141    } else if(codepoint <= 0x7ff) {
142        n = 2;
143        utf8[0] = 0xc0 | ((codepoint >>  6) & 0x1f);
144        utf8[1] = 0x80 + ((codepoint >>  0) & 0x3f);
145    } else if(codepoint <= 0xffff) {
146        n = 3;
147        utf8[0] = 0xe0 | ((codepoint >> 12) & 0xf);
148        utf8[1] = 0x80 + ((codepoint >>  6) & 0x3f);
149        utf8[2] = 0x80 + ((codepoint >>  0) & 0x3f);
150    } else {
151        n = 4;
152        utf8[0] = 0xf0 | ((codepoint >> 18) & 0x7);
153        utf8[1] = 0x80 + ((codepoint >> 12) & 0x3f);
154        utf8[2] = 0x80 + ((codepoint >>  6) & 0x3f);
155        utf8[3] = 0x80 + ((codepoint >>  0) & 0x3f);
156    }
157
158    if(0 < codepoint  &&  codepoint <= 0x10ffff)
159        fn_append(r, (char*)utf8, (MD_SIZE)n);
160    else
161        fn_append(r, utf8_replacement_char, 3);
162}
163
164static void
165render_entity(MD_HTML* r, const MD_CHAR* text, MD_SIZE size,
166              void (*fn_append)(MD_HTML*, const MD_CHAR*, MD_SIZE))
167{
168    if(r->flags & MD_HTML_FLAG_VERBATIM_ENTITIES) {
169        render_verbatim(r, text, size);
170        return;
171    }
172
173    if(size > 3 && text[1] == '#') {
174        unsigned codepoint = 0;
175
176        if(text[2] == 'x' || text[2] == 'X') {
177
178            MD_SIZE i;
179            for(i = 3; i < size-1; i++)
180                codepoint = 16 * codepoint + hex_val(text[i]);
181        } else {
182
183            MD_SIZE i;
184            for(i = 2; i < size-1; i++)
185                codepoint = 10 * codepoint + (text[i] - '0');
186        }
187
188        render_utf8_codepoint(r, codepoint, fn_append);
189        return;
190    } else {
191
192        const ENTITY* ent;
193
194        ent = entity_lookup(text, size);
195        if(ent != NULL) {
196            render_utf8_codepoint(r, ent->codepoints[0], fn_append);
197            if(ent->codepoints[1])
198                render_utf8_codepoint(r, ent->codepoints[1], fn_append);
199            return;
200        }
201    }
202
203    fn_append(r, text, size);
204}
205
206static void
207render_attribute(MD_HTML* r, const MD_ATTRIBUTE* attr,
208                 void (*fn_append)(MD_HTML*, const MD_CHAR*, MD_SIZE))
209{
210    int i;
211
212    for(i = 0; attr->substr_offsets[i] < attr->size; i++) {
213        MD_TEXTTYPE type = attr->substr_types[i];
214        MD_OFFSET off = attr->substr_offsets[i];
215        MD_SIZE size = attr->substr_offsets[i+1] - off;
216        const MD_CHAR* text = attr->text + off;
217
218        switch(type) {
219            case MD_TEXT_NULLCHAR:  render_utf8_codepoint(r, 0x0000, render_verbatim); break;
220            case MD_TEXT_ENTITY:    render_entity(r, text, size, fn_append); break;
221            default:                fn_append(r, text, size); break;
222        }
223    }
224}
225
226static void
227render_open_ol_block(MD_HTML* r, const MD_BLOCK_OL_DETAIL* det)
228{
229    char buf[64];
230
231    if(det->start == 1) {
232        RENDER_VERBATIM(r, "<ol>\n");
233        return;
234    }
235
236    snprintf(buf, sizeof(buf), "<ol start=\"%u\">\n", det->start);
237    RENDER_VERBATIM(r, buf);
238}
239
240static void
241render_open_li_block(MD_HTML* r, const MD_BLOCK_LI_DETAIL* det)
242{
243    if(det->is_task) {
244        RENDER_VERBATIM(r, "<li class=\"task-list-item\">"
245                          "<input type=\"checkbox\" class=\"task-list-item-checkbox\" disabled");
246        if(det->task_mark == 'x' || det->task_mark == 'X')
247            RENDER_VERBATIM(r, " checked");
248        RENDER_VERBATIM(r, ">");
249    } else {
250        RENDER_VERBATIM(r, "<li>");
251    }
252}
253
254static void
255render_open_code_block(MD_HTML* r, const MD_BLOCK_CODE_DETAIL* det)
256{
257    RENDER_VERBATIM(r, "<pre><code");
258
259    if(det->lang.text != NULL) {
260        RENDER_VERBATIM(r, " class=\"language-");
261        render_attribute(r, &det->lang, render_html_escaped);
262        RENDER_VERBATIM(r, "\"");
263    }
264
265    RENDER_VERBATIM(r, ">");
266}
267
268static void
269render_open_td_block(MD_HTML* r, const MD_CHAR* cell_type, const MD_BLOCK_TD_DETAIL* det)
270{
271    RENDER_VERBATIM(r, "<");
272    RENDER_VERBATIM(r, cell_type);
273
274    switch(det->align) {
275        case MD_ALIGN_LEFT:     RENDER_VERBATIM(r, " align=\"left\">"); break;
276        case MD_ALIGN_CENTER:   RENDER_VERBATIM(r, " align=\"center\">"); break;
277        case MD_ALIGN_RIGHT:    RENDER_VERBATIM(r, " align=\"right\">"); break;
278        default:                RENDER_VERBATIM(r, ">"); break;
279    }
280}
281
282static void
283render_open_a_span(MD_HTML* r, const MD_SPAN_A_DETAIL* det)
284{
285    RENDER_VERBATIM(r, "<a href=\"");
286    render_attribute(r, &det->href, render_url_escaped);
287
288    if(det->title.text != NULL) {
289        RENDER_VERBATIM(r, "\" title=\"");
290        render_attribute(r, &det->title, render_html_escaped);
291    }
292
293    RENDER_VERBATIM(r, "\">");
294}
295
296static void
297render_open_img_span(MD_HTML* r, const MD_SPAN_IMG_DETAIL* det)
298{
299    RENDER_VERBATIM(r, "<img src=\"");
300    render_attribute(r, &det->src, render_url_escaped);
301
302    RENDER_VERBATIM(r, "\" alt=\"");
303}
304
305static void
306render_close_img_span(MD_HTML* r, const MD_SPAN_IMG_DETAIL* det)
307{
308    if(det->title.text != NULL) {
309        RENDER_VERBATIM(r, "\" title=\"");
310        render_attribute(r, &det->title, render_html_escaped);
311    }
312
313    RENDER_VERBATIM(r, (r->flags & MD_HTML_FLAG_XHTML) ? "\" />" : "\">");
314}
315
316static void
317render_open_wikilink_span(MD_HTML* r, const MD_SPAN_WIKILINK_DETAIL* det)
318{
319    RENDER_VERBATIM(r, "<x-wikilink data-target=\"");
320    render_attribute(r, &det->target, render_html_escaped);
321
322    RENDER_VERBATIM(r, "\">");
323}
324
325static int
326enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
327{
328    static const MD_CHAR* head[6] = { "<h1>", "<h2>", "<h3>", "<h4>", "<h5>", "<h6>" };
329    MD_HTML* r = (MD_HTML*) userdata;
330
331    switch(type) {
332        case MD_BLOCK_DOC:       break;
333        case MD_BLOCK_QUOTE:    RENDER_VERBATIM(r, "<blockquote>\n"); break;
334        case MD_BLOCK_UL:       RENDER_VERBATIM(r, "<ul>\n"); break;
335        case MD_BLOCK_OL:       render_open_ol_block(r, (const MD_BLOCK_OL_DETAIL*)detail); break;
336        case MD_BLOCK_LI:       render_open_li_block(r, (const MD_BLOCK_LI_DETAIL*)detail); break;
337        case MD_BLOCK_HR:       RENDER_VERBATIM(r, (r->flags & MD_HTML_FLAG_XHTML) ? "<hr />\n" : "<hr>\n"); break;
338        case MD_BLOCK_H:        RENDER_VERBATIM(r, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break;
339        case MD_BLOCK_CODE:     render_open_code_block(r, (const MD_BLOCK_CODE_DETAIL*) detail); break;
340        case MD_BLOCK_HTML:      break;
341        case MD_BLOCK_P:        RENDER_VERBATIM(r, "<p>"); break;
342        case MD_BLOCK_TABLE:    RENDER_VERBATIM(r, "<table>\n"); break;
343        case MD_BLOCK_THEAD:    RENDER_VERBATIM(r, "<thead>\n"); break;
344        case MD_BLOCK_TBODY:    RENDER_VERBATIM(r, "<tbody>\n"); break;
345        case MD_BLOCK_TR:       RENDER_VERBATIM(r, "<tr>\n"); break;
346        case MD_BLOCK_TH:       render_open_td_block(r, "th", (MD_BLOCK_TD_DETAIL*)detail); break;
347        case MD_BLOCK_TD:       render_open_td_block(r, "td", (MD_BLOCK_TD_DETAIL*)detail); break;
348    }
349
350    return 0;
351}
352
353static int
354leave_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
355{
356    static const MD_CHAR* head[6] = { "</h1>\n", "</h2>\n", "</h3>\n", "</h4>\n", "</h5>\n", "</h6>\n" };
357    MD_HTML* r = (MD_HTML*) userdata;
358
359    switch(type) {
360        case MD_BLOCK_DOC:       break;
361        case MD_BLOCK_QUOTE:    RENDER_VERBATIM(r, "</blockquote>\n"); break;
362        case MD_BLOCK_UL:       RENDER_VERBATIM(r, "</ul>\n"); break;
363        case MD_BLOCK_OL:       RENDER_VERBATIM(r, "</ol>\n"); break;
364        case MD_BLOCK_LI:       RENDER_VERBATIM(r, "</li>\n"); break;
365        case MD_BLOCK_HR:        break;
366        case MD_BLOCK_H:        RENDER_VERBATIM(r, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]); break;
367        case MD_BLOCK_CODE:     RENDER_VERBATIM(r, "</code></pre>\n"); break;
368        case MD_BLOCK_HTML:      break;
369        case MD_BLOCK_P:        RENDER_VERBATIM(r, "</p>\n"); break;
370        case MD_BLOCK_TABLE:    RENDER_VERBATIM(r, "</table>\n"); break;
371        case MD_BLOCK_THEAD:    RENDER_VERBATIM(r, "</thead>\n"); break;
372        case MD_BLOCK_TBODY:    RENDER_VERBATIM(r, "</tbody>\n"); break;
373        case MD_BLOCK_TR:       RENDER_VERBATIM(r, "</tr>\n"); break;
374        case MD_BLOCK_TH:       RENDER_VERBATIM(r, "</th>\n"); break;
375        case MD_BLOCK_TD:       RENDER_VERBATIM(r, "</td>\n"); break;
376    }
377
378    return 0;
379}
380
381static int
382enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
383{
384    MD_HTML* r = (MD_HTML*) userdata;
385    int inside_img = (r->image_nesting_level > 0);
386
387    if(type == MD_SPAN_IMG)
388        r->image_nesting_level++;
389    if(inside_img)
390        return 0;
391
392    switch(type) {
393        case MD_SPAN_EM:                RENDER_VERBATIM(r, "<em>"); break;
394        case MD_SPAN_STRONG:            RENDER_VERBATIM(r, "<strong>"); break;
395        case MD_SPAN_U:                 RENDER_VERBATIM(r, "<u>"); break;
396        case MD_SPAN_A:                 render_open_a_span(r, (MD_SPAN_A_DETAIL*) detail); break;
397        case MD_SPAN_IMG:               render_open_img_span(r, (MD_SPAN_IMG_DETAIL*) detail); break;
398        case MD_SPAN_CODE:              RENDER_VERBATIM(r, "<code>"); break;
399        case MD_SPAN_DEL:               RENDER_VERBATIM(r, "<del>"); break;
400        case MD_SPAN_LATEXMATH:         RENDER_VERBATIM(r, "<x-equation>"); break;
401        case MD_SPAN_LATEXMATH_DISPLAY: RENDER_VERBATIM(r, "<x-equation type=\"display\">"); break;
402        case MD_SPAN_WIKILINK:          render_open_wikilink_span(r, (MD_SPAN_WIKILINK_DETAIL*) detail); break;
403    }
404
405    return 0;
406}
407
408static int
409leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata)
410{
411    MD_HTML* r = (MD_HTML*) userdata;
412
413    if(type == MD_SPAN_IMG)
414        r->image_nesting_level--;
415    if(r->image_nesting_level > 0)
416        return 0;
417
418    switch(type) {
419        case MD_SPAN_EM:                RENDER_VERBATIM(r, "</em>"); break;
420        case MD_SPAN_STRONG:            RENDER_VERBATIM(r, "</strong>"); break;
421        case MD_SPAN_U:                 RENDER_VERBATIM(r, "</u>"); break;
422        case MD_SPAN_A:                 RENDER_VERBATIM(r, "</a>"); break;
423        case MD_SPAN_IMG:               render_close_img_span(r, (MD_SPAN_IMG_DETAIL*) detail); break;
424        case MD_SPAN_CODE:              RENDER_VERBATIM(r, "</code>"); break;
425        case MD_SPAN_DEL:               RENDER_VERBATIM(r, "</del>"); break;
426        case MD_SPAN_LATEXMATH:
427        case MD_SPAN_LATEXMATH_DISPLAY: RENDER_VERBATIM(r, "</x-equation>"); break;
428        case MD_SPAN_WIKILINK:          RENDER_VERBATIM(r, "</x-wikilink>"); break;
429    }
430
431    return 0;
432}
433
434static int
435text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, void* userdata)
436{
437    MD_HTML* r = (MD_HTML*) userdata;
438
439    switch(type) {
440        case MD_TEXT_NULLCHAR:  render_utf8_codepoint(r, 0x0000, render_verbatim); break;
441        case MD_TEXT_BR:        RENDER_VERBATIM(r, (r->image_nesting_level == 0
442                                        ? ((r->flags & MD_HTML_FLAG_XHTML) ? "<br />\n" : "<br>\n")
443                                        : " "));
444                                break;
445        case MD_TEXT_SOFTBR:    RENDER_VERBATIM(r, (r->image_nesting_level == 0 ? "\n" : " ")); break;
446        case MD_TEXT_HTML:      render_verbatim(r, text, size); break;
447        case MD_TEXT_ENTITY:    render_entity(r, text, size, render_html_escaped); break;
448        default:                render_html_escaped(r, text, size); break;
449    }
450
451    return 0;
452}
453
454static void
455debug_log_callback(const char* msg, void* userdata)
456{
457    MD_HTML* r = (MD_HTML*) userdata;
458    if(r->flags & MD_HTML_FLAG_DEBUG)
459        fprintf(stderr, "MD4C: %s\n", msg);
460}
461
462int
463md_html(const MD_CHAR* input, MD_SIZE input_size,
464        void (*process_output)(const MD_CHAR*, MD_SIZE, void*),
465        void* userdata, unsigned parser_flags, unsigned renderer_flags)
466{
467    MD_HTML render = { process_output, userdata, renderer_flags, 0, { 0 } };
468    int i;
469
470    MD_PARSER parser = {
471        0,
472        parser_flags,
473        enter_block_callback,
474        leave_block_callback,
475        enter_span_callback,
476        leave_span_callback,
477        text_callback,
478        debug_log_callback,
479        NULL
480    };
481
482    for(i = 0; i < 256; i++) {
483        unsigned char ch = (unsigned char) i;
484
485        if(strchr("\"&<>", ch) != NULL)
486            render.escape_map[i] |= NEED_HTML_ESC_FLAG;
487
488        if(!ISALNUM(ch)  &&  strchr("~-_.+!*(),%#@?=;:/,+$", ch) == NULL)
489            render.escape_map[i] |= NEED_URL_ESC_FLAG;
490    }
491
492    if(renderer_flags & MD_HTML_FLAG_SKIP_UTF8_BOM  &&  sizeof(MD_CHAR) == 1) {
493        static const MD_CHAR bom[3] = { (char)0xef, (char)0xbb, (char)0xbf };
494        if(input_size >= sizeof(bom)  &&  memcmp(input, bom, sizeof(bom)) == 0) {
495            input += sizeof(bom);
496            input_size -= sizeof(bom);
497        }
498    }
499
500    return md_parse(input, input_size, &parser, (void*) &render);
501}