#  Copyright (c) 2013 Yoran Heling
#
#  Permission is hereby granted, free of charge, to any person obtaining
#  a copy of this software and associated documentation files (the
#  "Software"), to deal in the Software without restriction, including
#  without limitation the rights to use, copy, modify, merge, publish,
#  distribute, sublicense, and/or sell copies of the Software, and to
#  permit persons to whom the Software is furnished to do so, subject to
#  the following conditions:
#
#  The above copyright notice and this permission notice shall be included
#  in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
#  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
#  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


# Format of this file (informal):
#
#  Line = State Desc (';' Desc)*
#  Desc = Cond Act* Next
#  Cond = FunctionName              # yxml_isFunctionName(char)
#       | '$' Varname               # match character in Varname
#       | C-char ('|' C-char)*
#  Act  = FunctionName              # yxml_FunctionName(x, char)
#       | '$' Varname               # Store current char into Varname
#       | '@' State                 # Remember given state as future next state
#       | "string"                  # consume string before moving to next state
#  Next = State                     # Go to the given state
#       | '@'                       # Go to a previously remembered state
#
# Basically, it's just a short notation for manually writing a DFA. The script
# that compiles this to C is pretty simple and stupid, which explains the
# somewhat crude syntax of this file. It'd probably be more convenient to
# modify ragel[1] to generate state machine code that can be used in the
# yxml_parse() API, but I haven't really looked into that yet. I'm also not
# sure how much control I'd lose over the size of the resulting state machine.
#
# 1. http://www.complang.org/ragel/
#
# Note that the '@' state remembering functionality and "string" consuming
# action use the same variable to store the next state. This means that string
# consuming should not be used when the last @ state still needs to be
# remembered.

# Start state. Accepts the same input as misc0, plus a leading '\xef' that
# must be followed by "\xbb\xbf" (together the UTF-8 byte order mark).
init        '\xef' "\xbb\xbf" misc0; SP misc0; '<' le0

# State numbers for the misc/le/lee/leq states:
#   0 = before XMLDecl, (prolog)
#   1 = before first element, (prolog/misc)
#   2 = inside element (content)
#   3 = after root element (misc)
# And naming:
#   misc = Nothing special seen yet
#   le   = Seen '<'
#   lee  = Seen '<!'
#   leq  = Seen '<?'
# The 'misc3' state is entered automatically when the root element has been
# closed, yxml_selfclose() does this.
#
# misc0, misc1 and misc3 only accept SP and '<'. misc2 (element content) has
# no SP rule: '&' starts a reference that is handled in misc2a until the
# terminating '\x3b' (';'), and any other Char is passed to dataset.
# NOTE(review): ';' is presumably written as '\x3b' because a literal ';'
#   separates the Descs on a line - confirm against the compile script.
misc0      SP misc0; '<' le0
misc1      SP misc1; '<' le1
misc2      '<' le2; '&' refstart misc2a; Char dataset misc2
misc2a     Ref ref misc2a; '\x3b' refend misc2
misc3      SP misc3; '<' le3

# After '<'. le0 and le1 both send '!' to lee1 (there is no lee0 state); le0
# sends '?' to leq0, which checks for an XML declaration. Only le2 accepts '/'
# (end tag). le3 accepts neither an element nor an end tag, only a comment or
# a PI, and both return to misc3.
le0        '!' lee1; '?' leq0; NameStart elemstart elem0
le1        '!' lee1; '?' @misc1 pi0; NameStart elemstart elem0
le2        '!' lee2; '?' @misc2 pi0; '/' etag0; NameStart elemstart elem0
le3        '!' @misc3 comment0; '?' @misc3 pi0

# After '<!' (lee1, lee2) or '<?' (leq0). The DOCTYPE is only reachable from
# lee1 and CDATA only from lee2. The '-' consumed here is the first dash of
# '<!--', which is why these lines jump to comment1 rather than comment0.
# leq0 has already consumed the first character of the PI target, so it calls
# pistart itself and continues in pi1 instead of pi0.
# NOTE(review): 'x' is listed before NameStart in leq0, so a PI whose target
#   starts with 'x' and that appears before any XMLDecl seems to be forced down
#   the "ml" path - confirm how a string mismatch is handled.
lee1       '-' @misc1 comment1; 'D' "OCTYPE" dt0
lee2       '-' @misc2 comment1; '[' "CDATA[" cd0
leq0       'x' "ml" xmldecl0; NameStart @misc1 pistart pi1


# XMLDecl, starting from '<?xml', returns to misc1
#   xmldecl0/1  after '<?xml': at least one SP, then 'version'
#   xmldecl2/3  after VersionInfo: '?', or SP followed by encoding/standalone
#   xmldecl4/5  after EncodingDecl: '?', or SP followed by standalone
#   xmldecl6    after SDDecl: only SP and '?' are accepted
#   xmldecl7    after '?': expects '>'
xmldecl0    SP xmldecl1
xmldecl1    SP xmldecl1; 'v' "ersion" ver0
xmldecl2    SP xmldecl3; '?' xmldecl7
xmldecl3    SP xmldecl3; '?' xmldecl7; 'e' "ncoding" enc0; 's' "tandalone" std0
xmldecl4    SP xmldecl5; '?' xmldecl7
xmldecl5    SP xmldecl5; '?' xmldecl7; 's' "tandalone" std0
xmldecl6    SP xmldecl6; '?' xmldecl7
xmldecl7    '>' misc1

# VersionInfo, after 'version', returns to xmldecl2
# ver1 stores the opening quote in $quote and consumes "1."; ver2 requires at
# least one Num and ver3 accepts further Nums until the matching $quote.
ver0       SP ver0; '=' ver1
ver1       SP ver1; '\''|'"' $quote "1." ver2
ver2       Num ver3
ver3       Num ver3; $quote xmldecl2

# EncodingDecl, after 'encoding', returns to xmldecl4
# TODO: Pass the encoding value to the application?
# enc1 stores the opening quote in $quote. enc2 requires the first character
# of the value to be Alpha; enc3 then accepts EncName characters until the
# stored quote is seen again.
enc0       SP enc0; '=' enc1
enc1       SP enc1; '\''|'"' $quote enc2
enc2       Alpha enc3
enc3       EncName enc3; $quote xmldecl4

# SDDecl, after 'standalone', returns to xmldecl6
# TODO: Pass the standalone flag to the application?
# Only the literal values "yes" and "no" are accepted, and the closing quote
# must match the opening one stored in $quote.
std0       SP std0; '=' std1
std1       SP std1; '\''|'"' $quote std2
std2       'y' "es" std3; 'n' "o" std3
std3       $quote xmldecl6


# Comment, after '<!', returns to @
# In this file comment0 is only entered from le3; lee1, lee2 and dt3 have
# already consumed the first '-' and jump straight to comment1.
# comment2 = inside the comment, comment3 = one '-' seen, comment4 = "--"
# seen, after which only '>' is accepted.
comment0   '-' comment1
comment1   '-' comment2
comment2   '-' comment3; Char comment2
comment3   '-' comment4; Char comment2
comment4   '>' @


# PI, starting from '<?', returns to @
# pi1 reads the target name: '?' right after the name goes to pi4 (no
# contents), SP goes to pi2 (contents).
# pi3 is entered after a '?' inside the contents. A further '?' stays in pi3,
# so that contents ending in '?' (as in "??>") still terminate on '>'; this
# mirrors how cd2 handles a further ']'. Any other Char resumes the contents
# in pi2 via datapi.
# NOTE(review): the '?' rule in pi3 assumes dataset emits just the current
#   character, as its use in pi2 suggests - confirm against yxml_dataset().
pi0        NameStart pistart pi1
pi1        Name piname pi1; '?' pinameend pi4; SP pinameend pi2
pi2        '?' pi3; Char dataset pi2
pi3        '>' pivalend @; '?' dataset pi3; Char datapi pi2
pi4        '>' pivalend @


# CDSect, starting from '<![CDATA[', returns to misc2
# cd0 = inside the section, cd1 = one ']' seen, cd2 = "]]" seen. A further ']'
# in cd2 stays in cd2 and '>' returns to misc2. datacd1/datacd2 are invoked
# when the pending ']' characters turn out not to end the section.
# NOTE(review): presumably datacd1/datacd2 emit the held-back ']' characters
#   along with the current one - confirm against their C implementations.
cd0        ']' cd1; Char dataset cd0
cd1        ']' cd2; Char datacd1 cd0
cd2        ']' dataset cd2; '>' misc2; Char datacd2 cd0

# Doctype, starting from '<!DOCTYPE', returns to misc1
# XXX: The state machine below only attempts to figure out where the doctype
#   declaration ends, its contents are not actually parsed or validated.
#   Basically, it allows the following nesting of tags/quotes/PIs/comments:
#
#     <!DOCTYPE ".." '..' <?PI ..?> <!--..--> <!.. ".." '..'> >
#
#   Only the last '>' is correctly recognized as the end of the declaration.
#   Any other '>' found to end a tag/PI/comment, or found within quotes,
#   comments or a PI, is ignored.
# TODO: This still fails on conditional sections, which may nest.
#
# dt0 = top level of the declaration, dt1 = inside a quoted string,
# dt2 = seen '<', dt3 = seen '<!', dt4 = inside a nested '<!..>' tag.
# dt1 is shared between dt0 and dt4: each remembers itself (@dt0 / @dt4)
# before entering dt1, and dt1 returns there on the matching $quote.
# PIs and comments started from dt2/dt3 return to dt0 in the same way.
dt0        '>' misc1; '\''|'"' $quote @dt0 dt1; '<' dt2; Char dt0
dt1        $quote @; Char dt1
dt2        '?' @dt0 pi0; '!' dt3
dt3        '-' @dt0 comment1; Char dt4
dt4        '\''|'"' $quote @dt4 dt1; '>' dt0; Char dt4


# End tag, after '</', returns to misc2
# XXX: It's not actually necessary to validate the characters, since the
#   'elemclose' function already verifies (incrementally) that the name is
#   equivalent to the corresponding <Open ..> tag. The only difference is that
#   with the code below, </x/z> will result in ESYN, whereas a non-validating
#   version would give ECLOSE.
#
# elemclose is called for every character of the name; elemcloseend is called
# once the name is complete (on SP or '>'). etag2 allows trailing SP before
# the closing '>'.
etag0      NameStart elemclose etag1
etag1      Name elemclose etag1; SP elemcloseend etag2; '>' elemcloseend misc2
etag2      SP etag2; '>' misc2


# Element, after '<X', returns to misc2
# elem0 = reading the element name, elem1 = after SP (an attribute or the end
# of the tag may follow), elem2 = directly after an attribute value (SP is
# required before the next attribute), elem3 = after '/', expects '>'.
# The content action runs on every '/' or '>' that ends the start tag.
elem0      Name elemname elem0; SP elemnameend elem1; '/' elemnameend content elem3; '>' elemnameend content misc2
elem1      SP elem1; '/' content elem3; '>' content misc2; NameStart attrstart attr0
elem2      SP elem1; '/' content elem3; '>' content misc2
elem3      '>' selfclose misc2

# Attribute, after NameStart, returns to elem2
# attr0 = reading the name, attr1 = SP after the name, attr2 = after '=',
# attr3 = inside the quoted value, attr4 = inside a '&' reference, which ends
# at '\x3b' (';'). The value ends at the quote character stored in $quote.
attr0      Name attrname attr0; SP attrnameend attr1; '=' attrnameend attr2
attr1      SP attr1; '=' attr2
attr2      SP attr2; '\''|'"' $quote attr3
attr3      AttValue dataattr attr3; '&' refstart attr4; $quote attrvalend elem2
attr4      Ref ref attr4; '\x3b' refend attr3

