blob: 3275ddd5b08365b80ffcd3adc7c68951c28d53d1 [file] [log] [blame]
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "cmark_ctype.h"
#include "config.h"
#include "node.h"
#include "parser.h"
#include "references.h"
#include "cmark.h"
#include "houdini.h"
#include "utf8.h"
#include "scanners.h"
#include "inlines.h"
// UTF-8 byte sequences for the typographic characters that the
// smart-punctuation handlers in this file substitute for ASCII input.
static const char *EMDASH = "\xE2\x80\x94"; // U+2014 EM DASH
static const char *ENDASH = "\xE2\x80\x93"; // U+2013 EN DASH
static const char *ELLIPSES = "\xE2\x80\xA6"; // U+2026 HORIZONTAL ELLIPSIS
static const char *LEFTDOUBLEQUOTE = "\xE2\x80\x9C"; // U+201C
static const char *RIGHTDOUBLEQUOTE = "\xE2\x80\x9D"; // U+201D
static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98"; // U+2018
static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; // U+2019
// Macros for creating various kinds of simple inline nodes.
// The first three wrap make_literal() and take a cmark_chunk holding the
// node's literal content; the rest wrap make_simple() and take no arguments.
#define make_str(s) make_literal(CMARK_NODE_TEXT, s)
#define make_code(s) make_literal(CMARK_NODE_CODE, s)
#define make_raw_html(s) make_literal(CMARK_NODE_INLINE_HTML, s)
#define make_linebreak() make_simple(CMARK_NODE_LINEBREAK)
#define make_softbreak() make_simple(CMARK_NODE_SOFTBREAK)
#define make_emph() make_simple(CMARK_NODE_EMPH)
#define make_strong() make_simple(CMARK_NODE_STRONG)
// One entry of the delimiter list kept while parsing inlines. Entries are
// doubly linked; subject.last_delim points at the most recently pushed one.
typedef struct delimiter {
struct delimiter *previous; // entry pushed before this one, or NULL
struct delimiter *next; // entry pushed after this one, or NULL if last
cmark_node *inl_text; // text node that holds the delimiter characters
bufsize_t position; // subject position recorded when the entry was pushed
unsigned char delim_char; // delimiter kind: '*', '_', '\'', '"', '[' or '!'
bool can_open; // may act as an opener
bool can_close; // may act as a closer
bool active; // set on push; cleared on '[' entries that precede a matched link
} delimiter;
// Parser state for one run of inline parsing.
typedef struct {
cmark_chunk input; // text being parsed
bufsize_t pos; // current read offset into input
cmark_reference_map *refmap; // map consulted for reference links (may be NULL)
delimiter *last_delim; // most recently pushed delimiter, or NULL when empty
} subject;
// True when c is a line feed or a carriage return.
static inline bool S_is_line_end_char(char c) {
  switch (c) {
  case '\n':
  case '\r':
    return true;
  default:
    return false;
  }
}
// Forward declarations of helpers that are defined further down in this file.
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
delimiter *closer);
static int parse_inline(subject *subj, cmark_node *parent, int options);
static void subject_from_buf(subject *e, cmark_strbuf *buffer,
cmark_reference_map *refmap);
static bufsize_t subject_find_special_char(subject *subj, int options);
// Allocate an inline node of type `t` whose literal content is `s`.
// Returns NULL if the allocation fails.
static inline cmark_node *make_literal(cmark_node_type t, cmark_chunk s) {
  cmark_node *node = (cmark_node *)calloc(1, sizeof(*node));
  if (node == NULL) {
    return NULL;
  }

  node->type = t;
  node->as.literal = s;

  // The new node starts out detached from any tree.
  node->parent = NULL;
  node->prev = NULL;
  node->next = NULL;
  node->first_child = NULL;
  node->last_child = NULL;

  // Source position fields are not used for inlines.
  node->start_line = 0;
  node->start_column = 0;
  node->end_line = 0;

  return node;
}
// Allocate an inline node of type `t` that carries no literal value.
// Returns NULL if the allocation fails.
static inline cmark_node *make_simple(cmark_node_type t) {
  cmark_node *node = (cmark_node *)calloc(1, sizeof(*node));
  if (node == NULL) {
    return NULL;
  }

  node->type = t;

  // The new node starts out detached from any tree.
  node->parent = NULL;
  node->prev = NULL;
  node->next = NULL;
  node->first_child = NULL;
  node->last_child = NULL;

  // Source position fields are not used for inlines.
  node->start_line = 0;
  node->start_column = 0;
  node->end_line = 0;

  return node;
}
// Like make_str, but runs the content through houdini_unescape_html first.
// When that call reports a non-zero result the text node owns the unescaped
// buffer; otherwise the node refers to the original chunk.
static cmark_node *make_str_with_entities(cmark_chunk *content) {
  cmark_strbuf decoded = GH_BUF_INIT;

  if (!houdini_unescape_html(&decoded, content->data, content->len)) {
    return make_str(*content);
  }
  return make_str(cmark_chunk_buf_detach(&decoded));
}
// Duplicate a chunk by creating a copy of the buffer, not by reusing the
// buffer like cmark_chunk_dup does.
//
// The returned chunk owns a freshly allocated, NUL-terminated copy of
// src's bytes (alloc == 1). If the allocation fails, a chunk with
// data == NULL, len == 0 and alloc == 0 is returned instead of
// dereferencing a null pointer.
static cmark_chunk chunk_clone(cmark_chunk *src) {
  cmark_chunk c;
  bufsize_t len = src->len;

  c.data = (unsigned char *)malloc(len + 1);
  if (c.data == NULL) {
    c.len = 0;
    c.alloc = 0;
    return c;
  }

  c.len = len;
  c.alloc = 1;
  // An empty source chunk may carry a NULL data pointer (assumption about
  // CMARK_CHUNK_EMPTY; confirm in chunk.h). Passing NULL to memcpy is
  // undefined even for a zero length, so skip the copy in that case.
  if (len > 0) {
    memcpy(c.data, src->data, len);
  }
  c.data[len] = '\0';
  return c;
}
// Build the destination for an autolink: trim `url` in place, prefix
// "mailto:" when is_email is non-zero, and append the URL after passing it
// through houdini_unescape_html_f. A URL that is empty after trimming
// yields CMARK_CHUNK_EMPTY.
static cmark_chunk cmark_clean_autolink(cmark_chunk *url, int is_email) {
  cmark_strbuf out = GH_BUF_INIT;

  cmark_chunk_trim(url);

  if (url->len != 0) {
    if (is_email) {
      cmark_strbuf_puts(&out, "mailto:");
    }
    houdini_unescape_html_f(&out, url->data, url->len);
    return cmark_chunk_buf_detach(&out);
  }

  cmark_chunk empty = CMARK_CHUNK_EMPTY;
  return empty;
}
// Create a link node for an autolink. The destination is the cleaned URL
// (with a "mailto:" prefix when is_email is non-zero), the title is empty,
// and the single child is a text node made from the URL.
// Returns NULL if the link node cannot be allocated.
static inline cmark_node *make_autolink(cmark_chunk url, int is_email) {
  cmark_node *link = make_simple(CMARK_NODE_LINK);
  if (link == NULL) {
    // make_simple reports allocation failure with NULL; do not dereference.
    return NULL;
  }
  link->as.link.url = cmark_clean_autolink(&url, is_email);
  link->as.link.title = cmark_chunk_literal("");
  cmark_node_append_child(link, make_str_with_entities(&url));
  return link;
}
// Initialize subject `e` to read from `buffer`, starting at offset 0 with
// an empty delimiter list. The input chunk borrows the buffer's storage
// (alloc == 0).
static void subject_from_buf(subject *e, cmark_strbuf *buffer,
                             cmark_reference_map *refmap) {
  e->refmap = refmap;
  e->last_delim = NULL;
  e->pos = 0;

  e->input.alloc = 0;
  e->input.len = buffer->size;
  e->input.data = buffer->ptr;
}
// Predicate for take_while: 1 if c is a backtick, 0 otherwise.
static inline int isbacktick(int c) {
  return c == '`' ? 1 : 0;
}
// Return the byte at the current position, or 0 when the position is at or
// past the end of the input. Does not advance.
static inline unsigned char peek_char(subject *subj) {
  if (subj->pos >= subj->input.len) {
    return 0;
  }

  unsigned char current = subj->input.data[subj->pos];
  // NUL bytes should have been stripped out by now, so finding one inside
  // the input is a programming error.
  assert(current != 0);
  return current;
}
// Return the input byte at absolute offset `pos`. No bounds check is made.
static inline unsigned char peek_at(subject *subj, bufsize_t pos) {
  const unsigned char *bytes = subj->input.data;
  return bytes[pos];
}
// Return non-zero when the subject has been fully consumed, i.e. the
// current position is at or beyond the end of the input.
static inline int is_eof(subject *subj) {
  return !(subj->pos < subj->input.len);
}
// Advance the subject by one byte. Doesn't check for eof.
// The expansion is fully parenthesized so the macro behaves as a single
// expression wherever it is used; `subj` is evaluated exactly once.
#define advance(subj) ((subj)->pos += 1)
// Skip over a run of spaces and tabs at the current position.
// Returns true if at least one character was skipped.
static inline bool skip_spaces(subject *subj) {
  const bufsize_t start = subj->pos;

  for (;;) {
    unsigned char ch = peek_char(subj);
    if (ch != ' ' && ch != '\t') {
      break;
    }
    advance(subj);
  }

  return subj->pos != start;
}
// Consume one line ending (CR, LF or CR LF) at the current position.
// Returns true if a line-ending character was consumed or if the subject is
// at end of input afterwards.
static inline bool skip_line_end(subject *subj) {
  const bufsize_t start = subj->pos;

  if (peek_char(subj) == '\r') {
    advance(subj);
  }
  if (peek_char(subj) == '\n') {
    advance(subj);
  }

  if (subj->pos != start) {
    return true;
  }
  return is_eof(subj);
}
// Consume characters for as long as predicate `f` accepts them and return
// the consumed span as a chunk obtained from cmark_chunk_dup.
static inline cmark_chunk take_while(subject *subj, int (*f)(int)) {
  const bufsize_t start = subj->pos;

  for (;;) {
    unsigned char ch = peek_char(subj);
    if (ch == 0 || !f(ch)) {
      break;
    }
    advance(subj);
  }

  return cmark_chunk_dup(&subj->input, start, subj->pos - start);
}
// Try to process a backtick code span that began with a span of ticks of
// length openticklength (already parsed). Return 0 if no matching closing
// backticks are found, otherwise return the position in the subject after
// the closing backticks.
//
// The search is iterative: every backtick run of the wrong length is
// skipped and scanning continues, so stack usage stays constant no matter
// how many mismatched runs the input contains.
static bufsize_t scan_to_closing_backticks(subject *subj,
                                           bufsize_t openticklength) {
  for (;;) {
    unsigned char c;
    bufsize_t numticks = 0;

    // read non backticks
    while ((c = peek_char(subj)) && c != '`') {
      advance(subj);
    }
    if (is_eof(subj)) {
      return 0; // did not find closing ticks, return 0
    }

    // measure the backtick run we stopped at
    while (peek_char(subj) == '`') {
      advance(subj);
      numticks++;
    }
    if (numticks == openticklength) {
      return subj->pos;
    }
    if (numticks == 0) {
      // Stopped on something that is neither a backtick nor end of input
      // (only possible for a stray NUL byte when asserts are disabled).
      // No progress can be made, so report "not found" rather than spin.
      return 0;
    }
    // wrong number of ticks: keep scanning after this run
  }
}
// Parse a backtick code span or a run of literal backticks and return the
// resulting inline. Assumes the subject has a backtick at the current
// position.
static cmark_node *handle_backticks(subject *subj) {
  cmark_chunk opening = take_while(subj, isbacktick);
  const bufsize_t content_start = subj->pos;
  const bufsize_t after_close =
      scan_to_closing_backticks(subj, opening.len);

  if (after_close == 0) {
    // No closing run: rewind and emit the opening ticks as plain text.
    subj->pos = content_start;
    return make_str(opening);
  }

  // The code content lies between the opening and the closing run.
  cmark_strbuf code = GH_BUF_INIT;
  cmark_strbuf_set(&code, subj->input.data + content_start,
                   after_close - content_start - opening.len);
  cmark_strbuf_trim(&code);
  cmark_strbuf_normalize_whitespace(&code);
  return make_code(cmark_chunk_buf_detach(&code));
}
// Scan a run of the delimiter character `c` (e.g. ***, **, or *) at the
// current position and return the number of delimiters scanned.
// Advances the position past the run. For quote characters (' and ") at
// most one delimiter is consumed.
//
// On return, *can_open and *can_close say whether the run may open and/or
// close a span, derived from the code points immediately before and after
// the run (the "flanking" tests below).
static int scan_delims(subject *subj, unsigned char c, bool *can_open,
bool *can_close) {
int numdelims = 0;
bufsize_t before_char_pos;
int32_t after_char = 0;
int32_t before_char = 0;
int len;
bool left_flanking, right_flanking;
// Determine the code point preceding the run. At the very start of the
// input it is treated as a line feed (10).
if (subj->pos == 0) {
before_char = 10;
} else {
before_char_pos = subj->pos - 1;
// walk back to the beginning of the UTF_8 sequence:
// (bytes of the form 10xxxxxx are continuation bytes)
while (peek_at(subj, before_char_pos) >> 6 == 2 && before_char_pos > 0) {
before_char_pos -= 1;
}
len = cmark_utf8proc_iterate(subj->input.data + before_char_pos,
subj->pos - before_char_pos, &before_char);
// a sequence that fails to decode is also treated as a line feed
if (len == -1) {
before_char = 10;
}
}
// Consume the delimiter run itself.
if (c == '\'' || c == '"') {
numdelims++;
advance(subj); // limit to 1 delim for quotes
} else {
while (peek_char(subj) == c) {
numdelims++;
advance(subj);
}
}
// Determine the code point following the run; a decode failure is
// treated as a line feed.
len = cmark_utf8proc_iterate(subj->input.data + subj->pos,
subj->input.len - subj->pos, &after_char);
if (len == -1) {
after_char = 10;
}
// Left-flanking: not followed by whitespace, and not (followed by
// punctuation while preceded by something that is neither whitespace nor
// punctuation).
left_flanking = numdelims > 0 && !cmark_utf8proc_is_space(after_char) &&
!(cmark_utf8proc_is_punctuation(after_char) &&
!cmark_utf8proc_is_space(before_char) &&
!cmark_utf8proc_is_punctuation(before_char));
// Right-flanking: the mirror image of the test above.
right_flanking =
numdelims > 0 && !cmark_utf8proc_is_space(before_char) &&
!(cmark_utf8proc_is_punctuation(before_char) &&
!cmark_utf8proc_is_space(after_char) && !cmark_utf8proc_is_punctuation(after_char));
if (c == '_') {
// Underscores that flank on both sides may open only after punctuation
// and close only before punctuation.
*can_open = left_flanking &&
(!right_flanking || cmark_utf8proc_is_punctuation(before_char));
*can_close = right_flanking &&
(!left_flanking || cmark_utf8proc_is_punctuation(after_char));
} else if (c == '\'' || c == '"') {
// Quotes open only when they are left- but not right-flanking.
*can_open = left_flanking && !right_flanking;
*can_close = right_flanking;
} else {
*can_open = left_flanking;
*can_close = right_flanking;
}
return numdelims;
}
/*
static void print_delimiters(subject *subj)
{
delimiter *delim;
delim = subj->last_delim;
while (delim != NULL) {
printf("Item at stack pos %p, text pos %d: %d %d %d next(%p)
prev(%p)\n",
(void*)delim, delim->position, delim->delim_char,
delim->can_open, delim->can_close,
(void*)delim->next, (void*)delim->previous);
delim = delim->previous;
}
}
*/
// Unlink `delim` from the subject's delimiter list and free it.
// A NULL `delim` is ignored.
static void remove_delimiter(subject *subj, delimiter *delim) {
  if (delim == NULL) {
    return;
  }

  delimiter *before = delim->previous;
  delimiter *after = delim->next;

  if (after != NULL) {
    after->previous = before;
  } else {
    // No successor: this must be the top of the list.
    assert(delim == subj->last_delim);
    subj->last_delim = before;
  }

  if (before != NULL) {
    before->next = after;
  }

  free(delim);
}
// Append a new, active delimiter entry for character `c` to the subject's
// delimiter list, recording the current subject position and the text node
// that holds the delimiter. If the entry cannot be allocated nothing is
// pushed.
static void push_delimiter(subject *subj, unsigned char c, bool can_open,
                           bool can_close, cmark_node *inl_text) {
  delimiter *top = subj->last_delim;
  delimiter *entry = (delimiter *)malloc(sizeof(delimiter));
  if (entry == NULL) {
    return;
  }

  entry->previous = top;
  entry->next = NULL;
  entry->inl_text = inl_text;
  entry->position = subj->pos;
  entry->delim_char = c;
  entry->can_open = can_open;
  entry->can_close = can_close;
  entry->active = true;

  if (top != NULL) {
    top->next = entry;
  }
  subj->last_delim = entry;
}
// Handle a run of delimiter character `c` (one of * _ ' ") found at the
// current position. Returns the text node created for the run; when the
// run can open or close, it is also pushed on the delimiter list. Quote
// characters are pushed only in smart mode, where their text is replaced
// by a curly quote.
static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) {
  bool can_open, can_close;
  const bufsize_t run_len = scan_delims(subj, c, &can_open, &can_close);
  const bool is_quote = (c == '\'' || c == '"');
  cmark_chunk text;

  if (smart && c == '\'') {
    text = cmark_chunk_literal(RIGHTSINGLEQUOTE);
  } else if (smart && c == '"') {
    text = cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE);
  } else {
    // Plain text: the delimiter characters exactly as they were scanned.
    text = cmark_chunk_dup(&subj->input, subj->pos - run_len, run_len);
  }

  cmark_node *node = make_str(text);

  if ((can_open || can_close) && (smart || !is_quote)) {
    push_delimiter(subj, c, can_open, can_close, node);
  }

  return node;
}
// Handle a hyphen at the current position. Outside smart mode, or for a
// lone hyphen, a literal "-" text node is returned. In smart mode a run of
// two or more hyphens is converted into em and en dashes.
static cmark_node *handle_hyphen(subject *subj, bool smart) {
  int run_start = subj->pos;

  advance(subj);
  if (!smart || peek_char(subj) != '-') {
    return make_str(cmark_chunk_literal("-"));
  }

  // Consume the remainder of the hyphen run.
  while (smart && peek_char(subj) == '-') {
    advance(subj);
  }

  int count = subj->pos - run_start;
  int em_dashes = 0;
  int en_dashes = 0;

  if (count % 3 == 0) {
    // divisible by 3: all em dashes
    em_dashes = count / 3;
  } else if (count % 2 == 0) {
    // divisible by 2: all en dashes
    en_dashes = count / 2;
  } else {
    // otherwise em dashes followed by one en dash (remainder 2) or two en
    // dashes (remainder 1)
    en_dashes = (count % 3 == 2) ? 1 : 2;
    em_dashes = (count - 2 * en_dashes) / 3;
  }

  cmark_strbuf dashes = GH_BUF_INIT;
  while (em_dashes-- > 0) {
    cmark_strbuf_puts(&dashes, EMDASH);
  }
  while (en_dashes-- > 0) {
    cmark_strbuf_puts(&dashes, ENDASH);
  }

  return make_str(cmark_chunk_buf_detach(&dashes));
}
// Handle a period at the current position. In smart mode three consecutive
// periods become an ellipsis and exactly two are emitted as ".."; otherwise
// a single "." text node is returned.
static cmark_node *handle_period(subject *subj, bool smart) {
  advance(subj);
  if (!smart || peek_char(subj) != '.') {
    return make_str(cmark_chunk_literal("."));
  }

  advance(subj);
  if (peek_char(subj) != '.') {
    return make_str(cmark_chunk_literal(".."));
  }

  advance(subj);
  return make_str(cmark_chunk_literal(ELLIPSES));
}
// Resolve the delimiters above `stack_bottom` on the delimiter list:
// matched '*' / '_' pairs are turned into emphasis nodes via S_insert_emph,
// and quote delimiters have their text replaced by curly quotes. On return
// every delimiter above `stack_bottom` has been removed from the list.
// Pass NULL as `stack_bottom` to process the whole list.
static void process_emphasis(subject *subj, delimiter *stack_bottom) {
delimiter *closer = subj->last_delim;
delimiter *opener;
delimiter *old_closer;
bool opener_found;
// Per delimiter character, the entry below which no opener can be found.
// Only the four slots initialized below are ever read.
delimiter *openers_bottom[128];
// initialize openers_bottom:
openers_bottom['*'] = stack_bottom;
openers_bottom['_'] = stack_bottom;
openers_bottom['\''] = stack_bottom;
openers_bottom['"'] = stack_bottom;
// move back to first relevant delim.
while (closer != NULL && closer->previous != stack_bottom) {
closer = closer->previous;
}
// now move forward, looking for closers, and handling each
while (closer != NULL) {
if (closer->can_close &&
(closer->delim_char == '*' || closer->delim_char == '_' ||
closer->delim_char == '"' || closer->delim_char == '\'')) {
// Now look backwards for first matching opener:
opener = closer->previous;
opener_found = false;
while (opener != NULL && opener != stack_bottom &&
opener != openers_bottom[closer->delim_char]) {
if (opener->delim_char == closer->delim_char && opener->can_open) {
opener_found = true;
break;
}
opener = opener->previous;
}
// remember the closer: the branches below move `closer` forward
old_closer = closer;
if (closer->delim_char == '*' || closer->delim_char == '_') {
if (opener_found) {
// S_insert_emph returns the delimiter to continue from
closer = S_insert_emph(subj, opener, closer);
} else {
closer = closer->next;
}
} else if (closer->delim_char == '\'') {
// a closing single quote always becomes a right quote; a matching
// opener becomes a left quote
cmark_chunk_free(&closer->inl_text->as.literal);
closer->inl_text->as.literal = cmark_chunk_literal(RIGHTSINGLEQUOTE);
if (opener_found) {
cmark_chunk_free(&opener->inl_text->as.literal);
opener->inl_text->as.literal = cmark_chunk_literal(LEFTSINGLEQUOTE);
}
closer = closer->next;
} else if (closer->delim_char == '"') {
// same treatment for double quotes
cmark_chunk_free(&closer->inl_text->as.literal);
closer->inl_text->as.literal = cmark_chunk_literal(RIGHTDOUBLEQUOTE);
if (opener_found) {
cmark_chunk_free(&opener->inl_text->as.literal);
opener->inl_text->as.literal = cmark_chunk_literal(LEFTDOUBLEQUOTE);
}
closer = closer->next;
}
if (!opener_found) {
// set lower bound for future searches for openers:
openers_bottom[old_closer->delim_char] = old_closer->previous;
if (!old_closer->can_open) {
// we can remove a closer that can't be an
// opener, once we've seen there's no
// matching opener:
remove_delimiter(subj, old_closer);
}
}
} else {
closer = closer->next;
}
}
// free all delimiters in list until stack_bottom:
while (subj->last_delim != stack_bottom) {
remove_delimiter(subj, subj->last_delim);
}
}
// Create an emphasis or strong node from a matched opener/closer pair and
// move the inlines between them underneath it.
//
// Delimiter characters used for the match are removed from the opener's
// and closer's text nodes; delimiters lying between the two are dropped
// from the list. Returns the delimiter from which process_emphasis should
// continue: `closer` itself if it still has characters left, otherwise the
// entry that followed it.
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
delimiter *closer) {
delimiter *delim, *tmp_delim;
bufsize_t use_delims;
cmark_node *opener_inl = opener->inl_text;
cmark_node *closer_inl = closer->inl_text;
bufsize_t opener_num_chars = opener_inl->as.literal.len;
bufsize_t closer_num_chars = closer_inl->as.literal.len;
cmark_node *tmp, *emph, *first_child, *last_child;
// calculate the actual number of characters used from this closer
if (closer_num_chars < 3 || opener_num_chars < 3) {
// use as many as the shorter of the two runs provides
use_delims = closer_num_chars <= opener_num_chars ? closer_num_chars
: opener_num_chars;
} else { // closer and opener both have >= 3 characters
use_delims = closer_num_chars % 2 == 0 ? 2 : 1;
}
// remove used characters from associated inlines.
opener_num_chars -= use_delims;
closer_num_chars -= use_delims;
opener_inl->as.literal.len = opener_num_chars;
closer_inl->as.literal.len = closer_num_chars;
// free delimiters between opener and closer
delim = closer->previous;
while (delim != NULL && delim != opener) {
tmp_delim = delim->previous;
remove_delimiter(subj, delim);
delim = tmp_delim;
}
// the inlines strictly between the two delimiter text nodes become the
// children of the new node
first_child = opener_inl->next;
last_child = closer_inl->prev;
// if opener has 0 characters, remove it and its associated inline
if (opener_num_chars == 0) {
// replace empty opener inline with emph
cmark_chunk_free(&(opener_inl->as.literal));
emph = opener_inl;
emph->type = use_delims == 1 ? CMARK_NODE_EMPH : CMARK_NODE_STRONG;
// remove opener from list
remove_delimiter(subj, opener);
} else {
// create new emph or strong, and splice it in to our inlines
// between the opener and closer
emph = use_delims == 1 ? make_emph() : make_strong();
emph->parent = opener_inl->parent;
emph->prev = opener_inl;
opener_inl->next = emph;
}
// push children below emph
emph->next = closer_inl;
closer_inl->prev = emph;
emph->first_child = first_child;
emph->last_child = last_child;
// fix children pointers
first_child->prev = NULL;
last_child->next = NULL;
for (tmp = first_child; tmp != NULL; tmp = tmp->next) {
tmp->parent = emph;
}
// if closer has 0 characters, remove it and its associated inline
if (closer_num_chars == 0) {
// remove empty closer inline
cmark_node_free(closer_inl);
// remove closer from list
tmp_delim = closer->next;
remove_delimiter(subj, closer);
closer = tmp_delim;
}
return closer;
}
// Parse a backslash at the current position and return an inline:
//  - backslash + ASCII punctuation -> text node with that character,
//  - backslash + line ending       -> hard line break,
//  - anything else                 -> a literal backslash.
static cmark_node *handle_backslash(subject *subj) {
  advance(subj);

  unsigned char escaped = peek_char(subj);
  if (cmark_ispunct(escaped)) {
    // only ascii symbols and newline can be escaped
    advance(subj);
    return make_str(cmark_chunk_dup(&subj->input, subj->pos - 1, 1));
  }

  if (!is_eof(subj) && skip_line_end(subj)) {
    return make_linebreak();
  }

  return make_str(cmark_chunk_literal("\\"));
}
// Parse an entity, or fall back to a literal "&".
// Assumes the subject has an '&' character at the current position.
static cmark_node *handle_entity(subject *subj) {
  cmark_strbuf decoded = GH_BUF_INIT;

  advance(subj);

  bufsize_t consumed =
      houdini_unescape_ent(&decoded, subj->input.data + subj->pos,
                           subj->input.len - subj->pos);
  if (consumed != 0) {
    subj->pos += consumed;
    return make_str(cmark_chunk_buf_detach(&decoded));
  }

  // Nothing was recognized after the ampersand.
  return make_str(cmark_chunk_literal("&"));
}
// Clean a URL: trim surrounding whitespace (in place), drop one pair of
// surrounding <>, pass the rest through houdini_unescape_html_f and then
// cmark_strbuf_unescape. A URL that is empty after trimming yields
// CMARK_CHUNK_EMPTY.
cmark_chunk cmark_clean_url(cmark_chunk *url) {
  cmark_strbuf cleaned = GH_BUF_INIT;

  cmark_chunk_trim(url);

  if (url->len == 0) {
    cmark_chunk empty = CMARK_CHUNK_EMPTY;
    return empty;
  }

  unsigned char *start = url->data;
  bufsize_t length = url->len;

  // Strip the pointy brackets when both are present.
  if (start[0] == '<' && start[length - 1] == '>') {
    start += 1;
    length -= 2;
  }

  houdini_unescape_html_f(&cleaned, start, length);
  cmark_strbuf_unescape(&cleaned);
  return cmark_chunk_buf_detach(&cleaned);
}
// Clean a link title: drop one pair of surrounding delimiters ('...',
// "..." or (...)) if present, pass the rest through
// houdini_unescape_html_f and then cmark_strbuf_unescape. An empty title
// yields CMARK_CHUNK_EMPTY.
cmark_chunk cmark_clean_title(cmark_chunk *title) {
  cmark_strbuf cleaned = GH_BUF_INIT;

  if (title->len == 0) {
    cmark_chunk empty = CMARK_CHUNK_EMPTY;
    return empty;
  }

  unsigned char *start = title->data;
  bufsize_t length = title->len;
  const unsigned char opening = start[0];
  const unsigned char closing = start[length - 1];

  const bool delimited = (opening == '\'' && closing == '\'') ||
                         (opening == '(' && closing == ')') ||
                         (opening == '"' && closing == '"');
  if (delimited) {
    // remove surrounding quotes
    start += 1;
    length -= 2;
  }

  houdini_unescape_html_f(&cleaned, start, length);
  cmark_strbuf_unescape(&cleaned);
  return cmark_chunk_buf_detach(&cleaned);
}
// Parse what follows a '<': a URI autolink, an email autolink, or a raw
// HTML tag, tried in that order. If none matches, a literal "<" text node
// is returned.
// Assumes the subject has a '<' character at the current position.
static cmark_node *handle_pointy_brace(subject *subj) {
  bufsize_t len;

  advance(subj); // step over the '<'

  // 1. URI autolink
  len = scan_autolink_uri(&subj->input, subj->pos);
  if (len > 0) {
    cmark_chunk uri = cmark_chunk_dup(&subj->input, subj->pos, len - 1);
    subj->pos += len;
    return make_autolink(uri, 0);
  }

  // 2. email autolink
  len = scan_autolink_email(&subj->input, subj->pos);
  if (len > 0) {
    cmark_chunk address = cmark_chunk_dup(&subj->input, subj->pos, len - 1);
    subj->pos += len;
    return make_autolink(address, 1);
  }

  // 3. raw HTML tag; the chunk includes the '<' already consumed
  len = scan_html_tag(&subj->input, subj->pos);
  if (len > 0) {
    cmark_chunk tag = cmark_chunk_dup(&subj->input, subj->pos - 1, len + 1);
    subj->pos += len;
    return make_raw_html(tag);
  }

  // 4. nothing matched: just the opening '<'
  return make_str(cmark_chunk_literal("<"));
}
// Parse a link label. Returns 1 if successful, 0 otherwise.
// The label begins with `[` and ends with the first unescaped `]`; an
// unescaped `[` inside the label, or more than MAX_LINK_LABEL_LENGTH
// characters, makes the parse fail. Backticks in labels do not start code
// spans. On success *raw_label receives the trimmed text between the
// brackets and the subject is left after the `]`; on failure the subject
// position is restored.
static int link_label(subject *subj, cmark_chunk *raw_label) {
  const bufsize_t startpos = subj->pos;
  int length = 0;
  unsigned char c;

  if (peek_char(subj) != '[') {
    return 0;
  }
  advance(subj); // step over the '['

  for (;;) {
    c = peek_char(subj);
    if (c == 0 || c == '[' || c == ']') {
      break;
    }

    advance(subj);
    length++;
    // A backslash also takes the punctuation character it escapes.
    if (c == '\\' && cmark_ispunct(peek_char(subj))) {
      advance(subj);
      length++;
    }

    if (length > MAX_LINK_LABEL_LENGTH) {
      subj->pos = startpos; // rewind
      return 0;
    }
  }

  if (c != ']') {
    subj->pos = startpos; // rewind
    return 0;
  }

  *raw_label =
      cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1));
  cmark_chunk_trim(raw_label);
  advance(subj); // step over the ']'
  return 1;
}
// Handle a ']' at the current position.
//
// Looks for the nearest '[' or '![' entry on the delimiter list and tries
// to complete an inline link "(url title)" or a reference link against
// subj->refmap. On success the opener's text node is converted in place
// into a link/image node that adopts the inlines after it as children, and
// NULL is returned (nothing new has to be appended by the caller). On
// failure a literal "]" text node is returned.
static cmark_node *handle_close_bracket(subject *subj, cmark_node *parent) {
bufsize_t initial_pos;
bufsize_t starturl, endurl, starttitle, endtitle, endall;
bufsize_t n;
bufsize_t sps;
cmark_reference *ref;
bool is_image = false;
cmark_chunk url_chunk, title_chunk;
cmark_chunk url, title;
delimiter *opener;
cmark_node *link_text;
cmark_node *inl;
cmark_chunk raw_label;
int found_label;
advance(subj); // advance past ]
initial_pos = subj->pos;
// look through list of delimiters for a [ or !
opener = subj->last_delim;
while (opener) {
if (opener->delim_char == '[' || opener->delim_char == '!') {
break;
}
opener = opener->previous;
}
// no opening bracket at all: the ']' is plain text
if (opener == NULL) {
return make_str(cmark_chunk_literal("]"));
}
// an opener deactivated by an earlier link match cannot start a link
if (!opener->active) {
// take delimiter off stack
remove_delimiter(subj, opener);
return make_str(cmark_chunk_literal("]"));
}
// If we got here, we matched a potential link/image text.
is_image = opener->delim_char == '!';
link_text = opener->inl_text->next;
// Now we check to see if it's a link/image.
// First, look for an inline link.
if (peek_char(subj) == '(' &&
((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
((n = scan_link_url(&subj->input, subj->pos + 1 + sps)) > -1)) {
// try to parse an explicit link:
starturl = subj->pos + 1 + sps; // after (
endurl = starturl + n;
starttitle = endurl + scan_spacechars(&subj->input, endurl);
// ensure there are spaces btw url and title
endtitle = (starttitle == endurl)
? starttitle
: starttitle + scan_link_title(&subj->input, starttitle);
endall = endtitle + scan_spacechars(&subj->input, endtitle);
if (peek_at(subj, endall) == ')') {
subj->pos = endall + 1;
url_chunk = cmark_chunk_dup(&subj->input, starturl, endurl - starturl);
title_chunk =
cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle);
url = cmark_clean_url(&url_chunk);
title = cmark_clean_title(&title_chunk);
cmark_chunk_free(&url_chunk);
cmark_chunk_free(&title_chunk);
goto match;
} else {
goto noMatch;
}
}
// Next, look for a following [link label] that matches in refmap.
// skip spaces
subj->pos = subj->pos + scan_spacechars(&subj->input, subj->pos);
raw_label = cmark_chunk_literal("");
found_label = link_label(subj, &raw_label);
// With no label, or an empty one ("[]"), the text between the brackets
// of the link itself serves as the label.
if (!found_label || raw_label.len == 0) {
cmark_chunk_free(&raw_label);
raw_label = cmark_chunk_dup(&subj->input, opener->position,
initial_pos - opener->position - 1);
}
if (!found_label) {
// If we have a shortcut reference link, back up
// to before the spaces we skipped.
subj->pos = initial_pos;
}
ref = cmark_reference_lookup(subj->refmap, &raw_label);
cmark_chunk_free(&raw_label);
if (ref != NULL) { // found
// copy url and title so the link node does not share the map's buffers
url = chunk_clone(&ref->url);
title = chunk_clone(&ref->title);
goto match;
} else {
goto noMatch;
}
noMatch:
// If we fall through to here, it means we didn't match a link:
remove_delimiter(subj, opener); // remove this opener from delimiter list
subj->pos = initial_pos;
return make_str(cmark_chunk_literal("]"));
match:
// turn the opener's text node into the link/image node
inl = opener->inl_text;
inl->type = is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK;
cmark_chunk_free(&inl->as.literal);
inl->first_child = link_text;
// resolve emphasis inside the link text (delimiters above the opener)
process_emphasis(subj, opener);
inl->as.link.url = url;
inl->as.link.title = title;
inl->next = NULL;
if (link_text) {
cmark_node *tmp;
link_text->prev = NULL;
// re-parent every former sibling; tmp ends on the last one
for (tmp = link_text; tmp->next != NULL; tmp = tmp->next) {
tmp->parent = inl;
}
tmp->parent = inl;
inl->last_child = tmp;
}
parent->last_child = inl;
// Now, if we have a link, we also want to deactivate earlier link
// delimiters. (This code can be removed if we decide to allow links
// inside links.)
remove_delimiter(subj, opener);
if (!is_image) {
opener = subj->last_delim;
while (opener != NULL) {
if (opener->delim_char == '[') {
if (!opener->active) {
// everything below an inactive '[' was deactivated earlier
break;
} else {
opener->active = false;
}
}
opener = opener->previous;
}
}
return NULL;
}
// Parse a line ending and return a hard line break if it was preceded by
// at least two spaces, otherwise a soft break. Leading spaces and tabs on
// the following line are skipped.
// Assumes the subject has a cr or newline at the current position.
static cmark_node *handle_newline(subject *subj) {
  const bufsize_t nlpos = subj->pos;

  // step over cr, crlf, or lf
  if (peek_at(subj, subj->pos) == '\r') {
    advance(subj);
  }
  if (peek_at(subj, subj->pos) == '\n') {
    advance(subj);
  }

  // skip spaces at beginning of line
  skip_spaces(subj);

  const bool two_trailing_spaces = nlpos > 1 &&
                                   peek_at(subj, nlpos - 1) == ' ' &&
                                   peek_at(subj, nlpos - 2) == ' ';
  if (two_trailing_spaces) {
    return make_linebreak();
  }
  return make_softbreak();
}
// Starting one byte after the current position, return the offset of the
// next byte that needs special handling by parse_inline, or the input
// length if there is none. With CMARK_OPT_SMART set in `options`, the
// smart-punctuation characters count as special too.
static bufsize_t subject_find_special_char(subject *subj, int options) {
  // Bytes that always start a special inline: "\r\n\\`&_*[]<!"
  // (character constants are used as indices; the tables assume ASCII).
  static const int8_t SPECIAL_CHARS[256] = {
      ['\n'] = 1, ['\r'] = 1, ['!'] = 1, ['&'] = 1, ['*'] = 1, ['<'] = 1,
      ['['] = 1,  ['\\'] = 1, [']'] = 1, ['_'] = 1, ['`'] = 1,
  };
  // Bytes that are special only in smart mode: " ' . -
  static const char SMART_PUNCT_CHARS[256] = {
      ['"'] = 1, ['\''] = 1, ['-'] = 1, ['.'] = 1,
  };

  const bool smart = (options & CMARK_OPT_SMART) != 0;
  bufsize_t n;

  for (n = subj->pos + 1; n < subj->input.len; n++) {
    const unsigned char byte = subj->input.data[n];
    if (SPECIAL_CHARS[byte] || (smart && SMART_PUNCT_CHARS[byte])) {
      return n;
    }
  }

  return subj->input.len;
}
// Parse one inline at the current position, advancing the subject, and
// append the result (if any) as the last child of `parent`.
// Returns 0 if no inline can be parsed (end of input), 1 otherwise.
static int parse_inline(subject *subj, cmark_node *parent, int options) {
  const bool smart = (options & CMARK_OPT_SMART) != 0;
  cmark_node *node = NULL;
  const unsigned char c = peek_char(subj);

  if (c == 0) {
    return 0;
  }

  switch (c) {
  case '\r':
  case '\n':
    node = handle_newline(subj);
    break;

  case '`':
    node = handle_backticks(subj);
    break;

  case '\\':
    node = handle_backslash(subj);
    break;

  case '&':
    node = handle_entity(subj);
    break;

  case '<':
    node = handle_pointy_brace(subj);
    break;

  case '*':
  case '_':
  case '\'':
  case '"':
    node = handle_delim(subj, c, smart);
    break;

  case '-':
    node = handle_hyphen(subj, smart);
    break;

  case '.':
    node = handle_period(subj, smart);
    break;

  case '[':
    // Potential start of link text: emit "[" and remember it.
    advance(subj);
    node = make_str(cmark_chunk_literal("["));
    push_delimiter(subj, '[', true, false, node);
    break;

  case ']':
    // May return NULL when an existing node was converted into a link.
    node = handle_close_bracket(subj, parent);
    break;

  case '!':
    advance(subj);
    if (peek_char(subj) != '[') {
      node = make_str(cmark_chunk_literal("!"));
    } else {
      // Potential start of an image: emit "![" and remember it.
      advance(subj);
      node = make_str(cmark_chunk_literal("!["));
      push_delimiter(subj, '!', false, true, node);
    }
    break;

  default: {
    // Ordinary text: take everything up to the next special character.
    bufsize_t stop = subject_find_special_char(subj, options);
    cmark_chunk text =
        cmark_chunk_dup(&subj->input, subj->pos, stop - subj->pos);
    subj->pos = stop;

    // if we're at a newline, strip trailing spaces.
    if (S_is_line_end_char(peek_char(subj))) {
      cmark_chunk_rtrim(&text);
    }
    node = make_str(text);
    break;
  }
  }

  if (node != NULL) {
    cmark_node_append_child(parent, node);
  }
  return 1;
}
// Parse inlines from parent's string_content (with trailing whitespace
// trimmed), adding them as children of parent, then resolve all remaining
// emphasis delimiters.
extern void cmark_parse_inlines(cmark_node *parent, cmark_reference_map *refmap,
                                int options) {
  subject subj;

  subject_from_buf(&subj, &parent->string_content, refmap);
  cmark_chunk_rtrim(&subj.input);

  while (!is_eof(&subj)) {
    if (!parse_inline(&subj, parent, options)) {
      break;
    }
  }

  process_emphasis(&subj, NULL);
}
// Skip zero or more spaces/tabs, including at most one line ending, plus
// any spaces/tabs that follow that line ending.
static void spnl(subject *subj) {
  skip_spaces(subj);
  if (!skip_line_end(subj)) {
    return;
  }
  skip_spaces(subj);
}
// Parse a link reference definition ("[label]: url 'title'") at the start
// of `input`. Assumes string begins with '[' character.
// Modify refmap if a reference is encountered.
// Return 0 if no reference found, otherwise position of subject
// after reference is parsed.
//
// The title is optional. If something that looks like a title is found
// but is not followed by the end of the line, the definition is retried
// without it: the position is rewound to just after the URL and the
// reference is stored with an empty title, because the would-be title text
// stays in the input to be parsed as ordinary content.
bufsize_t cmark_parse_reference_inline(cmark_strbuf *input,
                                       cmark_reference_map *refmap) {
  subject subj;
  cmark_chunk lab;
  cmark_chunk url;
  cmark_chunk title;
  bufsize_t matchlen = 0;
  bufsize_t beforetitle;

  subject_from_buf(&subj, input, NULL);

  // parse label:
  if (!link_label(&subj, &lab) || lab.len == 0)
    return 0;

  // colon:
  if (peek_char(&subj) == ':') {
    advance(&subj);
  } else {
    return 0;
  }

  // parse link url:
  spnl(&subj);
  matchlen = scan_link_url(&subj.input, subj.pos);
  if (matchlen) {
    url = cmark_chunk_dup(&subj.input, subj.pos, matchlen);
    subj.pos += matchlen;
  } else {
    return 0;
  }

  // parse optional link_title
  beforetitle = subj.pos;
  spnl(&subj);
  matchlen = scan_link_title(&subj.input, subj.pos);
  if (matchlen) {
    title = cmark_chunk_dup(&subj.input, subj.pos, matchlen);
    subj.pos += matchlen;
  } else {
    subj.pos = beforetitle;
    title = cmark_chunk_literal("");
  }

  // parse final spaces and newline:
  skip_spaces(&subj);
  if (!skip_line_end(&subj)) {
    if (matchlen) { // try rewinding before title
      subj.pos = beforetitle;
      // The candidate title is no longer part of the definition, so it
      // must not be recorded in the reference.
      cmark_chunk_free(&title);
      title = cmark_chunk_literal("");
      skip_spaces(&subj);
      if (!skip_line_end(&subj)) {
        return 0;
      }
    } else {
      return 0;
    }
  }

  // insert reference into refmap
  cmark_reference_create(refmap, &lab, &url, &title);
  return subj.pos;
}