| /* Copyright (c) 2013-2014 Yoran Heling |
| |
| Permission is hereby granted, free of charge, to any person obtaining |
| a copy of this software and associated documentation files (the |
| "Software"), to deal in the Software without restriction, including |
| without limitation the rights to use, copy, modify, merge, publish, |
| distribute, sublicense, and/or sell copies of the Software, and to |
| permit persons to whom the Software is furnished to do so, subject to |
| the following conditions: |
| |
| The above copyright notice and this permission notice shall be included |
| in all copies or substantial portions of the Software. |
| |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
| CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
| TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
| SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| */ |
| |
| #include <yxml.h> |
| #include <string.h> |
| |
/* Parser state identifiers: the value kept in x->state and switched on by
 * yxml_parse(). Only YXMLS_string is spelled out here; other YXMLS_* names
 * used in this file (e.g. YXMLS_init, YXMLS_misc3) are not declared in this
 * list, so the marker comment below is presumably a placeholder that a build
 * step expands into the remaining states -- keep it verbatim. */
typedef enum {
	YXMLS_string = 0,
	/*=STATES=*/
} yxml_state_t;
| |
| |
/* Character-class predicates.
 *
 * All of these expect an unsigned argument: the range checks are written as a
 * single subtraction followed by a compare and rely on unsigned wrap-around
 * to reject values below the range. Every use of the parameter is
 * parenthesized so that a compound expression (e.g. `a & 0xff`) can be passed
 * safely. Note that most of these macros evaluate their argument more than
 * once, so do not pass expressions with side effects. */

/* Every value is accepted as a Char. */
#define yxml_isChar(c) 1
/* 0xd should be part of SP, too, but yxml_parse() already normalizes that into 0xa */
#define yxml_isSP(c) ((c) == 0x20 || (c) == 0x09 || (c) == 0x0a)
/* ASCII letter, either case ("|32" folds upper case onto lower case). */
#define yxml_isAlpha(c) (((c)|32)-'a' < 26)
/* ASCII decimal digit. */
#define yxml_isNum(c) ((c)-'0' < 10)
/* ASCII hexadecimal digit, either case. */
#define yxml_isHex(c) (yxml_isNum(c) || ((c)|32)-'a' < 6)
#define yxml_isEncName(c) (yxml_isAlpha(c) || yxml_isNum(c) || (c) == '.' || (c) == '_' || (c) == '-')
/* Any byte >= 128 is accepted as a name character. */
#define yxml_isNameStart(c) (yxml_isAlpha(c) || (c) == ':' || (c) == '_' || (c) >= 128)
#define yxml_isName(c) (yxml_isNameStart(c) || yxml_isNum(c) || (c) == '-' || (c) == '.')
/* XXX: The valid characters are dependent on the quote char, hence the access to x->quote.
 * This macro can therefore only be used where a parser pointer named `x` is in scope. */
#define yxml_isAttValue(c) (yxml_isChar(c) && (c) != x->quote && (c) != '<' && (c) != '&')
/* Anything between '&' and ';', the yxml_ref* functions will do further
 * validation. Strictly speaking, this is "yxml_isName(c) || c == '#'", but
 * this parser doesn't understand entities with '.', ':', etc, anyway. */
#define yxml_isRef(c) (yxml_isNum(c) || yxml_isAlpha(c) || (c) == '#')

/* Pack five byte values into one 64-bit integer (a is the most significant
 * byte), so that short strings can be compared with a single integer compare. */
#define INTFROM5CHARS(a, b, c, d, e) ((((uint64_t)(a))<<32) | (((uint64_t)(b))<<24) | (((uint64_t)(c))<<16) | (((uint64_t)(d))<<8) | (uint64_t)(e))
| |
| |
/* Store the byte value ch (0<=ch<=255) into *dest.
 * A plain assignment is avoided on purpose: char may be a signed type, and
 * converting an out-of-range unsigned value to a signed type is
 * implementation defined in C. Copying the object representation of an
 * unsigned char sidesteps that. The original author notes that gcc reduces
 * this to a single byte store on x86, even with -O0. */
static inline void yxml_setchar(char *dest, unsigned ch) {
	const unsigned char byte = (unsigned char)ch;
	memcpy(dest, &byte, sizeof byte);
}
| |
| |
/* Encode the code point ch as UTF-8 into dest and terminate the result with
 * a '\0'. The longest encoding produced is four bytes, so dest must have room
 * for at least 5 bytes. Bytes are written through yxml_setchar(). */
static void yxml_setutf8(char *dest, unsigned ch) {
	char *out = dest;
	if(ch <= 0x007F) {
		/* Plain ASCII: one byte, stored as-is */
		yxml_setchar(out++, ch);
	} else {
		/* Pick the number of continuation bytes and the lead-byte marker */
		unsigned trail, lead;
		if(ch <= 0x07FF) {
			trail = 1;
			lead = 0xC0;
		} else if(ch <= 0xFFFF) {
			trail = 2;
			lead = 0xE0;
		} else {
			trail = 3;
			lead = 0xF0;
		}
		/* Lead byte carries the topmost bits... */
		yxml_setchar(out++, lead | (ch >> (6*trail)));
		/* ...and every continuation byte carries the next 6 bits */
		while(trail--)
			yxml_setchar(out++, 0x80 | ((ch >> (6*trail)) & 0x3F));
	}
	*out = 0;
}
| |
| |
| static inline yxml_ret_t yxml_datacontent(yxml_t *x, unsigned ch) { |
| yxml_setchar(x->data, ch); |
| x->data[1] = 0; |
| return YXML_CONTENT; |
| } |
| |
| |
| static inline yxml_ret_t yxml_datapi1(yxml_t *x, unsigned ch) { |
| yxml_setchar(x->data, ch); |
| x->data[1] = 0; |
| return YXML_PICONTENT; |
| } |
| |
| |
| static inline yxml_ret_t yxml_datapi2(yxml_t *x, unsigned ch) { |
| x->data[0] = '?'; |
| yxml_setchar(x->data+1, ch); |
| x->data[2] = 0; |
| return YXML_PICONTENT; |
| } |
| |
| |
| static inline yxml_ret_t yxml_datacd1(yxml_t *x, unsigned ch) { |
| x->data[0] = ']'; |
| yxml_setchar(x->data+1, ch); |
| x->data[2] = 0; |
| return YXML_CONTENT; |
| } |
| |
| |
| static inline yxml_ret_t yxml_datacd2(yxml_t *x, unsigned ch) { |
| x->data[0] = ']'; |
| x->data[1] = ']'; |
| yxml_setchar(x->data+2, ch); |
| x->data[3] = 0; |
| return YXML_CONTENT; |
| } |
| |
| |
| static inline yxml_ret_t yxml_dataattr(yxml_t *x, unsigned ch) { |
| /* Normalize attribute values according to the XML spec section 3.3.3. */ |
| yxml_setchar(x->data, ch == 0x9 || ch == 0xa ? 0x20 : ch); |
| x->data[1] = 0; |
| return YXML_ATTRVAL; |
| } |
| |
| |
| static yxml_ret_t yxml_pushstack(yxml_t *x, char **res, unsigned ch) { |
| if(x->stacklen+2 >= x->stacksize) |
| return YXML_ESTACK; |
| x->stacklen++; |
| *res = (char *)x->stack+x->stacklen; |
| x->stack[x->stacklen] = ch; |
| x->stacklen++; |
| x->stack[x->stacklen] = 0; |
| return YXML_OK; |
| } |
| |
| |
| static yxml_ret_t yxml_pushstackc(yxml_t *x, unsigned ch) { |
| if(x->stacklen+1 >= x->stacksize) |
| return YXML_ESTACK; |
| x->stack[x->stacklen] = ch; |
| x->stacklen++; |
| x->stack[x->stacklen] = 0; |
| return YXML_OK; |
| } |
| |
| |
| static void yxml_popstack(yxml_t *x) { |
| do |
| x->stacklen--; |
| while(x->stack[x->stacklen]); |
| } |
| |
| |
| static inline yxml_ret_t yxml_elemstart (yxml_t *x, unsigned ch) { return yxml_pushstack(x, &x->elem, ch); } |
| static inline yxml_ret_t yxml_elemname (yxml_t *x, unsigned ch) { return yxml_pushstackc(x, ch); } |
| static inline yxml_ret_t yxml_elemnameend(yxml_t *x, unsigned ch) { return YXML_ELEMSTART; } |
| |
| |
| /* Also used in yxml_elemcloseend(), since this function just removes the last |
| * element from the stack and returns ELEMEND. */ |
| static yxml_ret_t yxml_selfclose(yxml_t *x, unsigned ch) { |
| yxml_popstack(x); |
| if(x->stacklen) { |
| x->elem = (char *)x->stack+x->stacklen-1; |
| while(*(x->elem-1)) |
| x->elem--; |
| return YXML_ELEMEND; |
| } |
| x->elem = (char *)x->stack; |
| x->state = YXMLS_misc3; |
| return YXML_ELEMEND; |
| } |
| |
| |
| static inline yxml_ret_t yxml_elemclose(yxml_t *x, unsigned ch) { |
| if(*((unsigned char *)x->elem) != ch) |
| return YXML_ECLOSE; |
| x->elem++; |
| return YXML_OK; |
| } |
| |
| |
| static inline yxml_ret_t yxml_elemcloseend(yxml_t *x, unsigned ch) { |
| if(*x->elem) |
| return YXML_ECLOSE; |
| return yxml_selfclose(x, ch); |
| } |
| |
| |
| static inline yxml_ret_t yxml_attrstart (yxml_t *x, unsigned ch) { return yxml_pushstack(x, &x->attr, ch); } |
| static inline yxml_ret_t yxml_attrname (yxml_t *x, unsigned ch) { return yxml_pushstackc(x, ch); } |
| static inline yxml_ret_t yxml_attrnameend(yxml_t *x, unsigned ch) { return YXML_ATTRSTART; } |
| static inline yxml_ret_t yxml_attrvalend (yxml_t *x, unsigned ch) { yxml_popstack(x); return YXML_ATTREND; } |
| |
| |
| static inline yxml_ret_t yxml_pistart (yxml_t *x, unsigned ch) { return yxml_pushstack(x, &x->pi, ch); } |
| static inline yxml_ret_t yxml_piname (yxml_t *x, unsigned ch) { return yxml_pushstackc(x, ch); } |
| static inline yxml_ret_t yxml_piabort (yxml_t *x, unsigned ch) { yxml_popstack(x); return YXML_OK; } |
| static inline yxml_ret_t yxml_pinameend(yxml_t *x, unsigned ch) { |
| return (x->pi[0]|32) == 'x' && (x->pi[1]|32) == 'm' && (x->pi[2]|32) == 'l' && !x->pi[3] ? YXML_ESYN : YXML_PISTART; |
| } |
| static inline yxml_ret_t yxml_pivalend (yxml_t *x, unsigned ch) { yxml_popstack(x); x->pi = (char *)x->stack; return YXML_PIEND; } |
| |
| |
| static inline yxml_ret_t yxml_refstart(yxml_t *x, unsigned ch) { |
| memset(x->data, 0, sizeof(x->data)); |
| x->reflen = 0; |
| return YXML_OK; |
| } |
| |
| |
| static yxml_ret_t yxml_ref(yxml_t *x, unsigned ch) { |
| if(x->reflen >= sizeof(x->data)-1) |
| return YXML_EREF; |
| yxml_setchar(x->data+x->reflen, ch); |
| x->reflen++; |
| return YXML_OK; |
| } |
| |
| |
| static yxml_ret_t yxml_refend(yxml_t *x, yxml_ret_t ret) { |
| unsigned char *r = (unsigned char *)x->data; |
| unsigned ch = 0; |
| if(*r == '#') { |
| if(r[1] == 'x') |
| for(r += 2; yxml_isHex((unsigned)*r); r++) |
| ch = (ch<<4) + (*r <= '9' ? *r-'0' : (*r|32)-'a' + 10); |
| else |
| for(r++; yxml_isNum((unsigned)*r); r++) |
| ch = (ch*10) + (*r-'0'); |
| if(*r) |
| ch = 0; |
| } else { |
| uint64_t i = INTFROM5CHARS(r[0], r[1], r[2], r[3], r[4]); |
| ch = |
| i == INTFROM5CHARS('l','t', 0, 0, 0) ? '<' : |
| i == INTFROM5CHARS('g','t', 0, 0, 0) ? '>' : |
| i == INTFROM5CHARS('a','m','p', 0, 0) ? '&' : |
| i == INTFROM5CHARS('a','p','o','s',0) ? '\'': |
| i == INTFROM5CHARS('q','u','o','t',0) ? '"' : 0; |
| } |
| |
| /* Codepoints not allowed in the XML 1.1 definition of a Char */ |
| if(!ch || ch > 0x10FFFF || ch == 0xFFFE || ch == 0xFFFF || (ch-0xDFFF) < 0x7FF) |
| return YXML_EREF; |
| yxml_setutf8(x->data, ch); |
| return ret; |
| } |
| |
| |
| static inline yxml_ret_t yxml_refcontent(yxml_t *x, unsigned ch) { return yxml_refend(x, YXML_CONTENT); } |
| static inline yxml_ret_t yxml_refattrval(yxml_t *x, unsigned ch) { return yxml_refend(x, YXML_ATTRVAL); } |
| |
| |
| void yxml_init(yxml_t *x, void *stack, size_t stacksize) { |
| memset(x, 0, sizeof(*x)); |
| x->line = 1; |
| x->stack = (unsigned char*)stack; |
| x->stacksize = stacksize; |
| *x->stack = 0; |
| x->elem = x->pi = x->attr = (char *)x->stack; |
| x->state = YXMLS_init; |
| } |
| |
| |
| yxml_ret_t yxml_parse(yxml_t *x, int _ch) { |
| /* Ensure that characters are in the range of 0..255 rather than -126..125. |
| * All character comparisons are done with positive integers. */ |
| unsigned ch = (unsigned)(_ch+256) & 0xff; |
| if(!ch) |
| return YXML_ESYN; |
| x->total++; |
| |
| /* End-of-Line normalization, "\rX", "\r\n" and "\n" are recognized and |
| * normalized to a single '\n' as per XML 1.0 section 2.11. XML 1.1 adds |
| * some non-ASCII character sequences to this list, but we can only handle |
| * ASCII here without making assumptions about the input encoding. */ |
| if(x->ignore == ch) { |
| x->ignore = 0; |
| return YXML_OK; |
| } |
| x->ignore = (ch == 0xd) * 0xa; |
| if(ch == 0xa || ch == 0xd) { |
| ch = 0xa; |
| x->line++; |
| x->byte = 0; |
| } |
| x->byte++; |
| |
| switch((yxml_state_t)x->state) { |
| case YXMLS_string: |
| if(ch == *x->string) { |
| x->string++; |
| if(!*x->string) |
| x->state = x->nextstate; |
| return YXML_OK; |
| } |
| break; |
| /*=SWITCH=*/ |
| } |
| return YXML_ESYN; |
| } |
| |
| |
| yxml_ret_t yxml_eof(yxml_t *x) { |
| if(x->state != YXMLS_misc3) |
| return YXML_EEOF; |
| return YXML_OK; |
| } |
| |
| |
| /* vim: set noet sw=4 ts=4: */ |