Spectral/include/hoedown/escape.c

#include "escape.h"

#include <assert.h>
#include <stdio.h>
#include <string.h>


/*
 * Branch-prediction hints for conditions.
 *
 * __builtin_expect(exp, c) tells the compiler that `exp == c` is the
 * common case, so the argument is normalised to 0/1 with `!!` first;
 * without that, any truthy value other than 1 would be hinted the
 * wrong way. Compilers without the builtin get a plain truth test.
 */
#if defined(__GNUC__) || defined(__clang__)
#define likely(x)       __builtin_expect(!!(x), 1)
#define unlikely(x)     __builtin_expect(!!(x), 0)
#else
#define likely(x)       (!!(x))
#define unlikely(x)     (!!(x))
#endif


/*
 * Lookup table indexed by byte value: 1 marks a byte that is copied
 * into an href unchanged, 0 marks a byte that must be escaped.
 *
 * The bytes copied unchanged are:
 *
 *		alphanum  ! # $ % ( ) * + , - . / : ; = ? @ _
 *
 * This set is the union of:
 *
 *	- the characters which are safe to be in a URL
 *	- characters which are *not* safe in general because they
 *	are RESERVED. We assume (lazily) that any RESERVED
 *	character appearing inside a URL is meant to have its
 *	native function (i.e. as a URL component/separator) and
 *	hence needs no escaping.
 *
 * There are two exceptions: the characters & (amp) and ' (single
 * quote) are not marked safe. They are meant to appear in the URL
 * as components, yet they require special HTML-entity escaping to
 * generate valid HTML markup.
 *
 * Every entry not named below is implicitly zero; those bytes are
 * escaped to %XX.
 */
static const uint8_t HREF_SAFE[UINT8_MAX+1] = {
	/* punctuation */
	['!'] = 1, ['#'] = 1, ['$'] = 1, ['%'] = 1,
	['('] = 1, [')'] = 1, ['*'] = 1, ['+'] = 1,
	[','] = 1, ['-'] = 1, ['.'] = 1, ['/'] = 1,
	[':'] = 1, [';'] = 1, ['='] = 1, ['?'] = 1,
	['@'] = 1, ['_'] = 1,

	/* digits */
	['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
	['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,

	/* uppercase letters */
	['A'] = 1, ['B'] = 1, ['C'] = 1, ['D'] = 1, ['E'] = 1,
	['F'] = 1, ['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1,
	['K'] = 1, ['L'] = 1, ['M'] = 1, ['N'] = 1, ['O'] = 1,
	['P'] = 1, ['Q'] = 1, ['R'] = 1, ['S'] = 1, ['T'] = 1,
	['U'] = 1, ['V'] = 1, ['W'] = 1, ['X'] = 1, ['Y'] = 1,
	['Z'] = 1,

	/* lowercase letters */
	['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1, ['e'] = 1,
	['f'] = 1, ['g'] = 1, ['h'] = 1, ['i'] = 1, ['j'] = 1,
	['k'] = 1, ['l'] = 1, ['m'] = 1, ['n'] = 1, ['o'] = 1,
	['p'] = 1, ['q'] = 1, ['r'] = 1, ['s'] = 1, ['t'] = 1,
	['u'] = 1, ['v'] = 1, ['w'] = 1, ['x'] = 1, ['y'] = 1,
	['z'] = 1,
};

/*
 * Write the `size` bytes at `data` to `ob`, escaped for use as the
 * value of an href attribute.
 *
 * Bytes marked in HREF_SAFE are written unchanged. The ampersand is
 * written as "&amp;" and the single quote as "&#x27;": both are valid
 * in URLs but need HTML-entity escaping inside an attribute. Every
 * other byte is written as an uppercase %XX escape, so a space
 * becomes %20 rather than '+' (the plus form is more commonly seen
 * when building GET strings).
 */
void
hoedown_escape_href(hoedown_buffer *ob, const uint8_t *data, size_t size)
{
	static const char hex_digits[] = "0123456789ABCDEF";
	char pct[3] = { '%', '0', '0' };
	size_t pos = 0;

	while (pos < size) {
		const size_t run_start = pos;
		uint8_t ch;

		/* extend the run over every byte that needs no escaping */
		for (; pos < size; pos++) {
			if (!HREF_SAFE[data[pos]])
				break;
		}

		/* flush the run; when it covers the whole input this single
		 * call copies the input as-is and nothing else is written */
		if (likely(pos > run_start))
			hoedown_buffer_put(ob, data + run_start, pos - run_start);

		if (pos >= size)
			return;

		/* data[pos] is the byte that ended the run: escape it */
		ch = data[pos++];

		if (ch == '&') {
			HOEDOWN_BUFPUTSL(ob, "&amp;");
		} else if (ch == '\'') {
			HOEDOWN_BUFPUTSL(ob, "&#x27;");
		} else {
			pct[1] = hex_digits[(ch >> 4) & 0xF];
			pct[2] = hex_digits[ch & 0xF];
			hoedown_buffer_put(ob, (uint8_t *)pct, 3);
		}
	}
}


/**
 * Lookup table indexed by byte value. A zero entry means the byte
 * needs no HTML escaping; a non-zero entry is the index of its
 * replacement string in HTML_ESCAPES.
 *
 * The escaped characters follow the OWASP rules:
 *
 * "   double quote
 * &   ampersand
 * '   single quote    (a numeric entity is used; &apos; is not recommended)
 * /   forward slash   (included as it helps end an HTML entity)
 * <   less than
 * >   greater than
 *
 * Every entry not named below is implicitly zero.
 */
static const uint8_t HTML_ESCAPE_TABLE[UINT8_MAX+1] = {
	['"']  = 1,
	['&']  = 2,
	['\''] = 3,
	['/']  = 4,
	['<']  = 5,
	['>']  = 6,
};

/*
 * Replacement strings for HTML escaping, indexed by the values stored
 * in HTML_ESCAPE_TABLE. Slot 0 is the empty string and corresponds to
 * bytes that need no escaping.
 *
 * Both the strings and the array of pointers are const, so the table
 * cannot be modified after initialisation.
 */
static const char *const HTML_ESCAPES[] = {
	"",
	"&quot;",
	"&amp;",
	"&#39;",
	"&#47;",
	"&lt;",
	"&gt;"
};

/*
 * Write the `size` bytes at `data` to `ob` with HTML-special
 * characters replaced by their entity strings.
 *
 * A byte whose HTML_ESCAPE_TABLE entry is zero is written unchanged;
 * any other byte is replaced by the HTML_ESCAPES string that entry
 * selects. The forward slash is only escaped when `secure` is
 * non-zero; otherwise it is written literally.
 */
void
hoedown_escape_html(hoedown_buffer *ob, const uint8_t *data, size_t size, int secure)
{
	size_t pos = 0;

	for (;;) {
		const size_t run_start = pos;
		uint8_t ch;
		int at_end;

		/* extend the run over every byte that needs no escaping */
		while (pos < size && HTML_ESCAPE_TABLE[data[pos]] == 0)
			pos++;

		at_end = (pos >= size);

		/* fast path: the scan from offset 0 hit the end of the input,
		 * so there is nothing to escape and it is copied as-is */
		if (at_end && run_start == 0) {
			hoedown_buffer_put(ob, data, size);
			return;
		}

		/* flush the unescaped run preceding the current position */
		if (likely(pos > run_start))
			hoedown_buffer_put(ob, data + run_start, pos - run_start);

		if (at_end)
			return;

		/* data[pos] is the byte that ended the run */
		ch = data[pos++];

		if (ch == '/' && !secure)
			hoedown_buffer_putc(ob, '/');
		else
			hoedown_buffer_puts(ob, HTML_ESCAPES[HTML_ESCAPE_TABLE[ch]]);
	}
}
Change markdown parser. 2019-02-13 08:14:13 +00:00			`#include "escape.h"`

			`#include <assert.h>`
			`#include <stdio.h>`
			`#include <string.h>`


			`#define likely(x) __builtin_expect((x),1)`
			`#define unlikely(x) __builtin_expect((x),0)`


			`/*`
			`* The following characters will not be escaped:`
			`*`
			`* -_.+!*'(),%#@?=;:/,+&$ alphanum`
			`*`
			`* Note that this character set is the addition of:`
			`*`
			`* - The characters which are safe to be in an URL`
			`* - The characters which are not safe to be in`
			`* an URL because they are RESERVED characters.`
			`*`
			`* We assume (lazily) that any RESERVED char that`
			`* appears inside an URL is actually meant to`
			`* have its native function (i.e. as an URL`
			`* component/separator) and hence needs no escaping.`
			`*`
			`* There are two exceptions: the chacters & (amp)`
			`* and ' (single quote) do not appear in the table.`
			`* They are meant to appear in the URL as components,`
			`* yet they require special HTML-entity escaping`
			`* to generate valid HTML markup.`
			`*`
			`* All other characters will be escaped to %XX.`
			`*`
			`*/`
			`static const uint8_t HREF_SAFE[UINT8_MAX+1] = {`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,`
			`0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`};`

			`void`
			`hoedown_escape_href(hoedown_buffer *ob, const uint8_t *data, size_t size)`
			`{`
			`static const char hex_chars[] = "0123456789ABCDEF";`
			`size_t i = 0, mark;`
			`char hex_str[3];`

			`hex_str[0] = '%';`

			`while (i < size) {`
			`mark = i;`
			`while (i < size && HREF_SAFE[data[i]]) i++;`

			`/* Optimization for cases where there's nothing to escape */`
			`if (mark == 0 && i >= size) {`
			`hoedown_buffer_put(ob, data, size);`
			`return;`
			`}`

			`if (likely(i > mark)) {`
			`hoedown_buffer_put(ob, data + mark, i - mark);`
			`}`

			`/* escaping */`
			`if (i >= size)`
			`break;`

			`switch (data[i]) {`
			`/* amp appears all the time in URLs, but needs`
			`* HTML-entity escaping to be inside an href */`
			`case '&':`
			`HOEDOWN_BUFPUTSL(ob, "&amp;");`
			`break;`

			`/* the single quote is a valid URL character`
			`* according to the standard; it needs HTML`
			`* entity escaping too */`
			`case '\'':`
			`HOEDOWN_BUFPUTSL(ob, "&#x27;");`
			`break;`

			`/* the space can be escaped to %20 or a plus`
			`* sign. we're going with the generic escape`
			`* for now. the plus thing is more commonly seen`
			`* when building GET strings */`
			`#if 0`
			`case ' ':`
			`hoedown_buffer_putc(ob, '+');`
			`break;`
			`#endif`

			`/* every other character goes with a %XX escaping */`
			`default:`
			`hex_str[1] = hex_chars[(data[i] >> 4) & 0xF];`
			`hex_str[2] = hex_chars[data[i] & 0xF];`
			`hoedown_buffer_put(ob, (uint8_t *)hex_str, 3);`
			`}`

			`i++;`
			`}`
			`}`


			`/**`
			`* According to the OWASP rules:`
			`*`
			`* & --> &amp;`
			`* < --> &lt;`
			`* > --> &gt;`
			`* " --> &quot;`
			`* ' --> &#x27; &apos; is not recommended`
			`* / --> &#x2F; forward slash is included as it helps end an HTML entity`
			`*`
			`*/`
			`static const uint8_t HTML_ESCAPE_TABLE[UINT8_MAX+1] = {`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`};`

			`static const char *HTML_ESCAPES[] = {`
			`"",`
			`"&quot;",`
			`"&amp;",`
			`"&#39;",`
			`"&#47;",`
			`"&lt;",`
			`"&gt;"`
			`};`

			`void`
			`hoedown_escape_html(hoedown_buffer *ob, const uint8_t *data, size_t size, int secure)`
			`{`
			`size_t i = 0, mark;`

			`while (1) {`
			`mark = i;`
			`while (i < size && HTML_ESCAPE_TABLE[data[i]] == 0) i++;`

			`/* Optimization for cases where there's nothing to escape */`
			`if (mark == 0 && i >= size) {`
			`hoedown_buffer_put(ob, data, size);`
			`return;`
			`}`

			`if (likely(i > mark))`
			`hoedown_buffer_put(ob, data + mark, i - mark);`

			`if (i >= size) break;`

			`/* The forward slash is only escaped in secure mode */`
			`if (!secure && data[i] == '/') {`
			`hoedown_buffer_putc(ob, '/');`
			`} else {`
			`hoedown_buffer_puts(ob, HTML_ESCAPES[HTML_ESCAPE_TABLE[data[i]]]);`
			`}`

			`i++;`
			`}`
			`}`