blob: eeede4a30ab12b8092c52f21c28b42642d93b8ca [file] [log] [blame]
# Copyright (c) 2013 Yoran Heling
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Format of this file (informal):
#
# Line = State Desc (';' Desc)*
# Desc = Cond Act* Next
# Cond = FunctionName # yxml_isFunctionName(char)
# | '$' Varname # match character in Varname
# | C-char ('|' C-char)*
# Act = FunctionName # yxml_FunctionName(x, char)
# | '$' Varname # Store current char into Varname
# | "string" # consume string before moving to next state
# Next = State
#
# Basically, it's just a short notation for manually writing a DFA. The script
# that compiles this to C is pretty simple and stupid, which explains the
# somewhat crude syntax of this file. It'd probably be more convenient to
# modify ragel[1] to generate state machine code that can be used in the
# yxml_parse() API, but I haven't really looked into that yet. I'm also not
# sure how much control I'd lose over the size of the resulting state machine.
#
# 1. http://www.complang.org/ragel/
# init: start of the document. The byte '\xef' followed by "\xbb\xbf" (the
# three bytes of a UTF-8 byte order mark) or an SP character leads to misc0;
# a '<' goes directly to le0.
init '\xef' "\xbb\xbf" misc0; SP misc0; '<' le0
# State numbers for the misc/le/lee/leq states:
# 0 = before XMLDecl, (prolog)
# 1 = before first element, (prolog/misc)
# 2 = inside element (content)
# And naming:
# misc = Nothing special seen yet
# le = Seen '<'
# lee = Seen '<!'
# leq = Seen '<?'
# misc0/misc1: skip SP characters until a '<' moves to the matching le state.
misc0 SP misc0; '<' le0
misc1 SP misc1; '<' le1
# misc2: element content. '<' moves to le2, '&' calls refstart and enters
# misc2a, and a Char is handed to the setdata action.
misc2 '<' le2; '&' refstart misc2a; Char setdata misc2
# misc2a: inside a reference. Ref characters are handed to the ref action; the
# character '\x3b' (';') calls refend and returns to misc2.
# NOTE(review): ';' is presumably written as '\x3b' because a literal ';'
# separates Descs in this file - confirm against the compiler script.
misc2a Ref ref misc2a; '\x3b' refend misc2
# le0/le1/le2: dispatch on the character that follows '<'. Only le0 sends '?'
# to leq0 (where '<?xml' is recognised); le1 and le2 send it to pi0. Only le2
# has a '/' transition (to etag0). A NameStart character calls elemstart and
# enters elem0 in all three.
le0 '!' lee1; '?' leq0; NameStart elemstart elem0
le1 '!' lee1; '?' pi0; NameStart elemstart elem0
le2 '!' lee2; '?' pi0; '/' etag0; NameStart elemstart elem0
# lee1 (reached from le0 and le1): '-' starts a comment, 'D' followed by
# "OCTYPE" starts a doctype.
# lee2 (reached from le2): '-' starts a comment, '[' followed by "CDATA["
# starts a CDATA section.
lee1 '-' comment0; 'D' "OCTYPE" dt0
lee2 '-' comment0; '[' "CDATA[" cd0
# leq0: after '<?' in state 0. 'x' followed by "ml" enters the XMLDecl states;
# a NameStart character continues as a PI name in pi1.
# NOTE(review): since 'x' commits to consuming "ml", a PI target that starts
# with 'x' but is not 'xml' looks like it is rejected here - confirm.
leq0 'x' "ml" xmldecl0; NameStart pi1
# XMLDecl, starting from '<?xml', returns to misc1
# xmldecl0: at least one SP must follow '<?xml'.
xmldecl0 SP xmldecl1
# xmldecl1: further SP is skipped; 'v' followed by "ersion" enters VersionInfo
# (ver0), which returns to xmldecl2.
xmldecl1 SP xmldecl1; 'v' "ersion" ver0
# xmldecl2/xmldecl3: after VersionInfo. xmldecl2 requires an SP before another
# declaration; xmldecl3 then accepts 'encoding' (enc0, returns to xmldecl4) or
# 'standalone' (std0, returns to xmldecl6). Both keywords are consumed in full
# here, so enc0 and std0 start right after the keyword.
xmldecl2 SP xmldecl3; '?' xmldecl7
xmldecl3 SP xmldecl3; '?' xmldecl7; 'e' "ncoding" enc0; 's' "tandalone" std0
# xmldecl4/xmldecl5: after EncodingDecl. Same shape as xmldecl2/xmldecl3, but
# only 'standalone' may still follow.
xmldecl4 SP xmldecl5; '?' xmldecl7
xmldecl5 SP xmldecl5; '?' xmldecl7; 's' "tandalone" std0
# xmldecl6: after SDDecl. Only SP and the closing '?' are accepted.
xmldecl6 SP xmldecl6; '?' xmldecl7
# xmldecl7: '?' seen; '>' ends the XMLDecl and continues at misc1.
xmldecl7 '>' misc1
# VersionInfo, after 'version', returns to xmldecl2
# ver0: SP is skipped until '='.
ver0 SP ver0; '=' ver1
# ver1: SP is skipped; a single or double quote is stored in $quote, and the
# literal "1." must follow before entering ver2.
ver1 SP ver1; '\''|'"' $quote "1." ver2
# ver2: the first Num character after "1." is required.
# NOTE(review): the trailing ';' has no Desc after it, which the informal
# grammar at the top of this file does not describe - presumably tolerated by
# the compiler script; confirm.
ver2 Num ver3;
# ver3: further Num characters; the character matching $quote ends the value
# and returns to xmldecl2.
ver3 Num ver3; $quote xmldecl2
# EncodingDecl, after 'encoding', returns to xmldecl4
# TODO: Pass the encoding value to the application?
# enc0: SP is skipped until '='.
enc0 SP enc0; '=' enc1
# enc1: SP is skipped; a single or double quote is stored in $quote.
enc1 SP enc1; '\''|'"' $quote enc2
# enc2: the first character of the value must satisfy Alpha.
enc2 Alpha enc3
# enc3: further EncName characters; the character matching $quote ends the
# value and returns to xmldecl4. No action is attached, so the value itself is
# not passed on.
enc3 EncName enc3; $quote xmldecl4
# SDDecl, after 'standalone', returns to xmldecl6
# TODO: Pass the standalone flag to the application?
# std0: SP is skipped until '='.
std0 SP std0; '=' std1
# std1: SP is skipped; a single or double quote is stored in $quote.
std1 SP std1; '\''|'"' $quote std2
# std2: the value must be exactly "yes" ('y' then "es") or "no" ('n' then "o").
std2 'y' "es" std3; 'n' "o" std3
# std3: the character matching $quote must follow immediately; returns to
# xmldecl6.
std3 $quote xmldecl6
# Comment, after '<!-', returns to misc1 or misc2
# comment0: the second '-' of '<!--' is required.
comment0 '-' comment1
# comment1: the first character after '<!--' must satisfy CommentStart
# (yxml_isCommentStart(), defined outside this file).
comment1 CommentStart comment2
# comment2: comment body; a '-' moves to comment3.
# comment3: one '-' seen; a second '-' moves to comment4, any other Char goes
# back to the body.
comment2 '-' comment3; Char comment2
comment3 '-' comment4; Char comment2
# comment4: "--" seen; only '>' is accepted, and it calls retmisc.
# NOTE(review): the Next state is written as comment4 itself; presumably
# retmisc picks the misc state to return to - confirm in yxml_retmisc().
comment4 '>' retmisc comment4
# PI, starting from '<?', returns to misc1 or misc2
# TODO: Verify that the PI name isn't /xml/i
# TODO: Pass the name and contents to the application
# pi0: the first character of the PI name must satisfy NameStart.
pi0 NameStart pi1
# pi1: further Name characters; an SP ends the name.
# NOTE(review): there is no '?' transition here, so a PI with no whitespace
# after its name (e.g. '<?x?>') looks like it is rejected - confirm.
pi1 Name pi1; SP pi2
# pi2: PI contents; a '?' moves to pi3, any other Char stays here.
pi2 '?' pi3; Char pi2
# pi3: '?' seen; '>' ends the PI and calls retmisc, any other Char goes back
# to pi2.
# NOTE(review): the Next state is written as pi3 itself; presumably retmisc
# picks the misc state to return to - confirm in yxml_retmisc().
pi3 '>' retmisc pi3; Char pi2
# CDSect, starting from '<![CDATA[', returns to misc2
# cd0: CDATA contents; each Char is handed to setdata, a ']' moves to cd1.
cd0 ']' cd1; Char setdata cd0
# cd1: one ']' seen; a second ']' moves to cd2, any other Char is handed to
# setdata and returns to cd0.
# NOTE(review): setdata is only called for the current character here, so the
# preceding ']' does not appear to reach the application - confirm.
cd1 ']' cd2; Char setdata cd0
# cd2: "]]" seen; only '>' is accepted, and it returns to misc2.
cd2 '>' misc2
# Doctype, starting from '<!DOCTYPE', returns to misc1
# TODO: This is a hack, all we do is read until we find a '>', not
# validating its content. This hack fails if the DTD contains a '>'
# character, which is very possible. Unfortunately, just figuring out where a
# DTD ends already requires a rather elaborate parser. :-(
# dt0: the first '>' ends the doctype and continues at misc1; every other
# Char is skipped without calling any action.
dt0 '>' misc1; Char dt0
# End tag, after '</', returns to misc2
# XXX: It's not actually necessary to validate the characters, since the
# 'elemclose' function already verifies (incrementally) that the name is
# equivalent to the corresponding <Open ..> tag. The only difference is that
# with the code below, </x/z> will result in ESYN, whereas a non-validating
# version would give ECLOSE.
# etag0: the first character of the end tag name must satisfy NameStart and is
# handed to elemclose.
etag0 NameStart elemclose etag1
# etag1: further Name characters are handed to elemclose; SP or '>' ends the
# name and calls elemcloseend.
etag1 Name elemclose etag1; SP elemcloseend etag2; '>' elemcloseend misc2
# etag2: SP after the name is skipped until '>' returns to misc2.
etag2 SP etag2; '>' misc2
# Element, after '<X', returns to misc2
# elem0: further Name characters are handed to elemname. SP, '/' and '>' all
# end the name (elemnameend); '/' and '>' also call attrsend, since no
# attributes can follow.
elem0 Name elemname elem0; SP elemnameend elem1; '/' elemnameend attrsend elem3; '>' elemnameend attrsend misc2
# elem1: after SP inside the start tag. A NameStart character calls attrstart
# and enters attr0; '/' or '>' calls attrsend and ends the attribute list.
elem1 SP elem1; '/' attrsend elem3; '>' attrsend misc2; NameStart attrstart attr0
# elem2: directly after an attribute value. It has no NameStart transition, so
# another attribute can only start after an SP (via elem1).
elem2 SP elem1; '/' attrsend elem3; '>' attrsend misc2
# elem3: '/' seen; only '>' is accepted, and it calls selfclose before
# returning to misc2.
elem3 '>' selfclose misc2
# Attribute, after NameStart, returns to elem2
# attr0: further Name characters are handed to attrname; SP or '=' ends the
# name and calls attrnameend.
attr0 Name attrname attr0; SP attrnameend attr1; '=' attrnameend attr2
# attr1: SP after the name is skipped until '='.
attr1 SP attr1; '=' attr2
# attr2: SP after '=' is skipped; a single or double quote is stored in
# $quote.
attr2 SP attr2; '\''|'"' $quote attr3
# attr3: attribute value. AttValue characters are handed to setdata, '&' calls
# refstart and enters attr4, and the character matching $quote ends the value
# and returns to elem2.
attr3 AttValue setdata attr3; '&' refstart attr4; $quote elem2
# attr4: inside a reference in an attribute value. It has the same shape as
# misc2a: Ref characters go to ref, and '\x3b' (';') calls refend and returns
# to attr3.
attr4 Ref ref attr4; '\x3b' refend attr3