yxml-states - third_party/yxml - Git at Google

 #  Copyright (c) 2013 Yoran Heling
 #
 #  Permission is hereby granted, free of charge, to any person obtaining
 #  a copy of this software and associated documentation files (the
 #  "Software"), to deal in the Software without restriction, including
 #  without limitation the rights to use, copy, modify, merge, publish,
 #  distribute, sublicense, and/or sell copies of the Software, and to
 #  permit persons to whom the Software is furnished to do so, subject to
 #  the following conditions:
 #
 #  The above copyright notice and this permission notice shall be included
 #  in all copies or substantial portions of the Software.
 #
 #  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 #  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 #  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 #  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 #  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 #  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 #  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


 # Format of this file (informal):
 #
 #  Line = State Desc (';' Desc)*
 #  Desc = Cond Act* Next
 #  Cond = FunctionName              # yxml_isFunctionName(char)
 #       | '$' Varname               # match character in Varname
 #       | C-char ('|' C-char)*
 #  Act  = FunctionName              # yxml_FunctionName(x, char)
 #       | '$' Varname               # Store current char into Varname
 #       | "string"                  # consume string before moving to next state
 #  Next = State
 #
 # Basically, it's just a short notation for manually writing a DFA. The script
 # that compiles this to C is pretty simple and stupid, which explains the
 # somewhat crude syntax of this file. It'd probably be more convenient to
 # modify ragel[1] to generate state machine code that can be used in the
 # yxml_parse() API, but I haven't really looked into that yet. I'm also not
 # sure how much control I'd lose over the size of the resulting state machine.
 #
 # 1. http://www.complang.org/ragel/

 # Entry state. Accepts the three-byte sequence EF BB BF (a UTF-8 byte order
 # mark), an SP-class character, or the first '<' of the document.
 init        '\xef' "\xbb\xbf" misc0; SP misc0; '<' le0

 # State numbers for the misc/le/lee/leq states:
 #   0 = before XMLDecl, (prolog)
 #   1 = before first element, (prolog/misc)
 #   2 = inside element (content)
 # And naming:
 #   misc = Nothing special seen yet
 #   le   = Seen '<'
 #   lee  = Seen '<!'
 #   leq  = Seen '<?'
 # misc0/misc1 only skip SP and wait for '<'. misc2 (element content) passes
 # every Char other than '<' and '&' to setdata. misc2a collects the body of a
 # reference opened by '&' until '\x3b' (the ';' character, presumably written
 # in hex because a literal ';' separates Descs in this file).
 misc0      SP misc0; '<' le0
 misc1      SP misc1; '<' le1
 misc2      '<' le2; '&' refstart misc2a; Char setdata misc2
 misc2a     Ref ref misc2a; '\x3b' refend misc2

 # After '<': dispatch on the next character. Only le0 sends '?' to leq0,
 # where an XMLDecl is still possible; le1 and le2 go straight to pi0.
 # Only le2 accepts '/' (end tag). A NameStart character always opens an
 # element via elemstart.
 le0        '!' lee1; '?' leq0; NameStart elemstart elem0
 le1        '!' lee1; '?' pi0; NameStart elemstart elem0
 le2        '!' lee2; '?' pi0; '/' etag0; NameStart elemstart elem0

 # After '<!': lee1 (prolog) allows a comment or DOCTYPE, lee2 (content)
 # allows a comment or CDATA. After '<?' at the very start: leq0 lists 'x'
 # ahead of NameStart, so an 'x' here commits to the literal "xml".
 lee1       '-' comment0; 'D' "OCTYPE" dt0
 lee2       '-' comment0; '[' "CDATA[" cd0
 leq0       'x' "ml" xmldecl0; NameStart pi1


 # XMLDecl, starting from '<?xml', returns to misc1
 # The state number tracks how much of the declaration has been seen, so the
 # pseudo-attributes can only appear in the order version, encoding,
 # standalone:
 #   xmldecl0/1  after '<?xml': SP is required, then 'version'
 #   xmldecl2/3  after VersionInfo: 'encoding' or 'standalone' may follow
 #   xmldecl4/5  after EncodingDecl: only 'standalone' may follow
 #   xmldecl6    after SDDecl: only the closing '?>' remains
 #   xmldecl7    seen '?', requires '>'
 # std0 must be entered after the complete word 'standalone', so every
 # transition into it consumes "tandalone" after matching the 's'.
 xmldecl0    SP xmldecl1
 xmldecl1    SP xmldecl1; 'v' "ersion" ver0
 xmldecl2    SP xmldecl3; '?' xmldecl7
 xmldecl3    SP xmldecl3; '?' xmldecl7; 'e' "ncoding" enc0; 's' "tandalone" std0
 xmldecl4    SP xmldecl5; '?' xmldecl7
 xmldecl5    SP xmldecl5; '?' xmldecl7; 's' "tandalone" std0
 xmldecl6    SP xmldecl6; '?' xmldecl7
 xmldecl7    '>' misc1

 # VersionInfo, after 'version', returns to xmldecl2
 # ver0/ver1 allow SP around '='. The opening quote is stored in $quote and
 # the literal "1." is consumed. ver2 requires at least one Num; ver3 takes
 # further Nums until the character stored in $quote closes the value.
 ver0       SP ver0; '=' ver1
 ver1       SP ver1; '\''|'"' $quote "1." ver2
 ver2       Num ver3
 ver3       Num ver3; $quote xmldecl2

 # EncodingDecl, after 'encoding', returns to xmldecl4
 # TODO: Pass the encoding value to the application?
 # enc0/enc1 allow SP around '='; the opening quote is stored in $quote.
 # enc2 requires an Alpha first character; enc3 takes EncName characters
 # until the character stored in $quote closes the value.
 enc0       SP enc0; '=' enc1
 enc1       SP enc1; '\''|'"' $quote enc2
 enc2       Alpha enc3
 enc3       EncName enc3; $quote xmldecl4

 # SDDecl, after 'standalone', returns to xmldecl6
 # TODO: Pass the standalone flag to the application?
 # std0/std1 allow SP around '='; the opening quote is stored in $quote.
 # std2 accepts exactly "yes" or "no"; std3 requires the character stored
 # in $quote as the closing quote.
 std0       SP std0; '=' std1
 std1       SP std1; '\''|'"' $quote std2
 std2       'y' "es" std3; 'n' "o" std3
 std3       $quote xmldecl6


 # Comment, after '<!-', returns to misc1 or misc2
 # comment0 requires the second '-' of '<!--'. comment1 accepts one
 # CommentStart character. comment2 is the comment body, comment3 has seen
 # one '-', and comment4 has seen '--' and accepts only '>'.
 # NOTE(review): retmisc presumably selects the misc state to resume in,
 # overriding the nominal next state 'comment4' - confirm in the C source.
 comment0   '-' comment1
 comment1   CommentStart comment2
 comment2   '-' comment3; Char comment2
 comment3   '-' comment4; Char comment2
 comment4   '>' retmisc comment4


 # PI, starting from '<?', returns to misc1 or misc2
 # TODO: Verify that the PI name isn't /xml/i
 # TODO: Pass the name and contents to the application
 # pi0 requires a NameStart character; pi1 reads the rest of the PI target.
 # A PI may end directly after its target ('<?target?>'), so pi1 accepts '?'
 # and pi4 then requires '>'. Otherwise SP starts the PI body (pi2).
 # pi3 has seen '?' inside the body: '>' closes the PI, another '?' stays in
 # pi3 (so a body ending in '?' followed by '?>' still terminates), and any
 # other Char resumes the body. The '?' Desc must stay ahead of Char.
 pi0        NameStart pi1
 pi1        Name pi1; '?' pi4; SP pi2
 pi2        '?' pi3; Char pi2
 pi3        '>' retmisc pi3; '?' pi3; Char pi2
 pi4        '>' retmisc pi4


 # CDSect, starting from '<![CDATA[', returns to misc2
 # cd0 is the CDATA body: every Char except ']' goes to setdata.
 # cd1 has seen ']'; cd2 has seen ']]' and accepts only '>'.
 # NOTE(review): the ']' that moves cd0 -> cd1 has no action attached, so a
 # lone ']' inside a CDATA section appears to be dropped from the data, and
 # cd2 has no transition for ']]' followed by anything but '>' (e.g. ']]]>').
 # Confirm; a fix would need a helper that can emit the pending ']' characters.
 cd0        ']' cd1; Char setdata cd0
 cd1        ']' cd2; Char setdata cd0
 cd2        '>' misc2


 # Doctype, starting from '<!DOCTYPE', returns to misc1
 # TODO: This is a hack, all we do is read until we find a '>', not
 #   validating its content. This hack fails if the DTD contains a '>'
 #   character, which is very possible. Unfortunately, just figuring out where a
 #   DTD ends already requires a rather elaborate parser. :-(
 # Consume every Char up to the first '>', then continue in misc1.
 dt0        '>' misc1; Char dt0


 # End tag, after '</', returns to misc2
 # XXX: It's not actually necessary to validate the characters, since the
 #   'elemclose' function already verifies (incrementally) that the name is
 #   equivalent to the corresponding <Open ..> tag. The only difference is that
 #   with the code below, </x/z> will result in ESYN, whereas a non-validating
 #   version would give ECLOSE.
 # etag0/etag1 pass each name character to elemclose. elemcloseend runs on
 # the first SP or '>' after the name. etag2 allows further SP before '>'.
 etag0      NameStart elemclose etag1
 etag1      Name elemclose etag1; SP elemcloseend etag2; '>' elemcloseend misc2
 etag2      SP etag2; '>' misc2


 # Element, after '<X', returns to misc2
 # elem0 reads the rest of the element name (elemname per character,
 # elemnameend when it stops). elem1 is "after SP": a NameStart character may
 # begin an attribute here. elem2 is "directly after an attribute value": SP
 # is needed (-> elem1) before another attribute can start. attrsend runs
 # before the tag ends, either via '/' (elem3, then selfclose on '>') or
 # directly on '>'.
 elem0      Name elemname elem0; SP elemnameend elem1; '/' elemnameend attrsend elem3; '>' elemnameend attrsend misc2
 elem1      SP elem1; '/' attrsend elem3; '>' attrsend misc2; NameStart attrstart attr0
 elem2      SP elem1; '/' attrsend elem3; '>' attrsend misc2
 elem3      '>' selfclose misc2

 # Attribute, after NameStart, returns to elem2
 # attr0 reads the rest of the attribute name; attr1/attr2 allow SP around
 # '='. The opening quote is stored in $quote. attr3 is the value body:
 # AttValue characters go to setdata, '&' opens a reference (attr4, closed by
 # '\x3b', i.e. ';'), and the character stored in $quote ends the value.
 attr0      Name attrname attr0; SP attrnameend attr1; '=' attrnameend attr2
 attr1      SP attr1; '=' attr2
 attr2      SP attr2; '\''|'"' $quote attr3
 attr3      AttValue setdata attr3; '&' refstart attr4; $quote elem2
 attr4      Ref ref attr4; '\x3b' refend attr3
	# Copyright (c) 2013 Yoran Heling
	#
	# Permission is hereby granted, free of charge, to any person obtaining
	# a copy of this software and associated documentation files (the
	# "Software"), to deal in the Software without restriction, including
	# without limitation the rights to use, copy, modify, merge, publish,
	# distribute, sublicense, and/or sell copies of the Software, and to
	# permit persons to whom the Software is furnished to do so, subject to
	# the following conditions:
	#
	# The above copyright notice and this permission notice shall be included
	# in all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
	# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
	# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
	# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


	# Format of this file (informal):
	#
	# Line = State Desc (';' Desc)*
	# Desc = Cond Act* Next
	# Cond = FunctionName              # yxml_isFunctionName(char)
	#       | '$' Varname               # match character in Varname
	#       | C-char ('|' C-char)*
	# Act  = FunctionName              # yxml_FunctionName(x, char)
	#       | '$' Varname               # Store current char into Varname
	#       | "string"                  # consume string before moving to next state
	# Next = State
	#
	# Basically, it's just a short notation for manually writing a DFA. The script
	# that compiles this to C is pretty simple and stupid, which explains the
	# somewhat crude syntax of this file. It'd probably be more convenient to
	# modify ragel[1] to generate state machine code that can be used in the
	# yxml_parse() API, but I haven't really looked into that yet. I'm also not
	# sure how much control I'd lose over the size of the resulting state machine.
	#
	# 1. http://www.complang.org/ragel/

	# Entry state. Accepts the three-byte sequence EF BB BF (a UTF-8 byte order
	# mark), an SP-class character, or the first '<' of the document.
	init '\xef' "\xbb\xbf" misc0; SP misc0; '<' le0

	# State numbers for the misc/le/lee/leq states:
	# 0 = before XMLDecl, (prolog)
	# 1 = before first element, (prolog/misc)
	# 2 = inside element (content)
	# And naming:
	# misc = Nothing special seen yet
	# le = Seen '<'
	# lee = Seen '<!'
	# leq = Seen '<?'
	# misc0/misc1 only skip SP and wait for '<'. misc2 (element content) passes
	# every Char other than '<' and '&' to setdata. misc2a collects the body of a
	# reference opened by '&' until '\x3b' (the ';' character, presumably written
	# in hex because a literal ';' separates Descs in this file).
	misc0 SP misc0; '<' le0
	misc1 SP misc1; '<' le1
	misc2 '<' le2; '&' refstart misc2a; Char setdata misc2
	misc2a Ref ref misc2a; '\x3b' refend misc2

	# After '<': dispatch on the next character. Only le0 sends '?' to leq0,
	# where an XMLDecl is still possible; le1 and le2 go straight to pi0.
	# Only le2 accepts '/' (end tag). A NameStart character always opens an
	# element via elemstart.
	le0 '!' lee1; '?' leq0; NameStart elemstart elem0
	le1 '!' lee1; '?' pi0; NameStart elemstart elem0
	le2 '!' lee2; '?' pi0; '/' etag0; NameStart elemstart elem0

	# After '<!': lee1 (prolog) allows a comment or DOCTYPE, lee2 (content)
	# allows a comment or CDATA. After '<?' at the very start: leq0 lists 'x'
	# ahead of NameStart, so an 'x' here commits to the literal "xml".
	lee1 '-' comment0; 'D' "OCTYPE" dt0
	lee2 '-' comment0; '[' "CDATA[" cd0
	leq0 'x' "ml" xmldecl0; NameStart pi1


	# XMLDecl, starting from '<?xml', returns to misc1
	# The state number tracks how much of the declaration has been seen, so the
	# pseudo-attributes can only appear in the order version, encoding,
	# standalone:
	#   xmldecl0/1  after '<?xml': SP is required, then 'version'
	#   xmldecl2/3  after VersionInfo: 'encoding' or 'standalone' may follow
	#   xmldecl4/5  after EncodingDecl: only 'standalone' may follow
	#   xmldecl6    after SDDecl: only the closing '?>' remains
	#   xmldecl7    seen '?', requires '>'
	# std0 must be entered after the complete word 'standalone', so every
	# transition into it consumes "tandalone" after matching the 's'.
	xmldecl0 SP xmldecl1
	xmldecl1 SP xmldecl1; 'v' "ersion" ver0
	xmldecl2 SP xmldecl3; '?' xmldecl7
	xmldecl3 SP xmldecl3; '?' xmldecl7; 'e' "ncoding" enc0; 's' "tandalone" std0
	xmldecl4 SP xmldecl5; '?' xmldecl7
	xmldecl5 SP xmldecl5; '?' xmldecl7; 's' "tandalone" std0
	xmldecl6 SP xmldecl6; '?' xmldecl7
	xmldecl7 '>' misc1

	# VersionInfo, after 'version', returns to xmldecl2
	# ver0/ver1 allow SP around '='. The opening quote (either ' or ") is stored
	# in $quote and the literal "1." is consumed. ver2 requires at least one
	# Num; ver3 takes further Nums until the character stored in $quote closes
	# the value.
	ver0 SP ver0; '=' ver1
	ver1 SP ver1; '\''|'"' $quote "1." ver2
	ver2 Num ver3
	ver3 Num ver3; $quote xmldecl2

	# EncodingDecl, after 'encoding', returns to xmldecl4
	# TODO: Pass the encoding value to the application?
	# enc0/enc1 allow SP around '='; the opening quote (either ' or ") is stored
	# in $quote. enc2 requires an Alpha first character; enc3 takes EncName
	# characters until the character stored in $quote closes the value.
	enc0 SP enc0; '=' enc1
	enc1 SP enc1; '\''|'"' $quote enc2
	enc2 Alpha enc3
	enc3 EncName enc3; $quote xmldecl4

	# SDDecl, after 'standalone', returns to xmldecl6
	# TODO: Pass the standalone flag to the application?
	# std0/std1 allow SP around '='; the opening quote (either ' or ") is stored
	# in $quote. std2 accepts exactly "yes" or "no"; std3 requires the character
	# stored in $quote as the closing quote.
	std0 SP std0; '=' std1
	std1 SP std1; '\''|'"' $quote std2
	std2 'y' "es" std3; 'n' "o" std3
	std3 $quote xmldecl6


	# Comment, after '<!-', returns to misc1 or misc2
	# comment0 requires the second '-' of '<!--'. comment1 accepts one
	# CommentStart character. comment2 is the comment body, comment3 has seen
	# one '-', and comment4 has seen '--' and accepts only '>'.
	# NOTE(review): retmisc presumably selects the misc state to resume in,
	# overriding the nominal next state 'comment4' - confirm in the C source.
	comment0 '-' comment1
	comment1 CommentStart comment2
	comment2 '-' comment3; Char comment2
	comment3 '-' comment4; Char comment2
	comment4 '>' retmisc comment4


	# PI, starting from '<?', returns to misc1 or misc2
	# TODO: Verify that the PI name isn't /xml/i
	# TODO: Pass the name and contents to the application
	# pi0 requires a NameStart character; pi1 reads the rest of the PI target.
	# A PI may end directly after its target ('<?target?>'), so pi1 accepts '?'
	# and pi4 then requires '>'. Otherwise SP starts the PI body (pi2).
	# pi3 has seen '?' inside the body: '>' closes the PI, another '?' stays in
	# pi3 (so a body ending in '?' followed by '?>' still terminates), and any
	# other Char resumes the body. The '?' Desc must stay ahead of Char.
	pi0 NameStart pi1
	pi1 Name pi1; '?' pi4; SP pi2
	pi2 '?' pi3; Char pi2
	pi3 '>' retmisc pi3; '?' pi3; Char pi2
	pi4 '>' retmisc pi4


	# CDSect, starting from '<![CDATA[', returns to misc2
	# cd0 is the CDATA body: every Char except ']' goes to setdata.
	# cd1 has seen ']'; cd2 has seen ']]' and accepts only '>'.
	# NOTE(review): the ']' that moves cd0 -> cd1 has no action attached, so a
	# lone ']' inside a CDATA section appears to be dropped from the data, and
	# cd2 has no transition for ']]' followed by anything but '>' (e.g. ']]]>').
	# Confirm; a fix would need a helper that can emit the pending ']' characters.
	cd0 ']' cd1; Char setdata cd0
	cd1 ']' cd2; Char setdata cd0
	cd2 '>' misc2


	# Doctype, starting from '<!DOCTYPE', returns to misc1
	# TODO: This is a hack, all we do is read until we find a '>', not
	# validating its content. This hack fails if the DTD contains a '>'
	# character, which is very possible. Unfortunately, just figuring out where a
	# DTD ends already requires a rather elaborate parser. :-(
	# Consume every Char up to the first '>', then continue in misc1.
	dt0 '>' misc1; Char dt0


	# End tag, after '</', returns to misc2
	# XXX: It's not actually necessary to validate the characters, since the
	# 'elemclose' function already verifies (incrementally) that the name is
	# equivalent to the corresponding <Open ..> tag. The only difference is that
	# with the code below, </x/z> will result in ESYN, whereas a non-validating
	# version would give ECLOSE.
	# etag0/etag1 pass each name character to elemclose. elemcloseend runs on
	# the first SP or '>' after the name. etag2 allows further SP before '>'.
	etag0 NameStart elemclose etag1
	etag1 Name elemclose etag1; SP elemcloseend etag2; '>' elemcloseend misc2
	etag2 SP etag2; '>' misc2


	# Element, after '<X', returns to misc2
	# elem0 reads the rest of the element name (elemname per character,
	# elemnameend when it stops). elem1 is "after SP": a NameStart character may
	# begin an attribute here. elem2 is "directly after an attribute value": SP
	# is needed (-> elem1) before another attribute can start. attrsend runs
	# before the tag ends, either via '/' (elem3, then selfclose on '>') or
	# directly on '>'.
	elem0 Name elemname elem0; SP elemnameend elem1; '/' elemnameend attrsend elem3; '>' elemnameend attrsend misc2
	elem1 SP elem1; '/' attrsend elem3; '>' attrsend misc2; NameStart attrstart attr0
	elem2 SP elem1; '/' attrsend elem3; '>' attrsend misc2
	elem3 '>' selfclose misc2

	# Attribute, after NameStart, returns to elem2
	# attr0 reads the rest of the attribute name; attr1/attr2 allow SP around
	# '='. The opening quote (either ' or ") is stored in $quote. attr3 is the
	# value body: AttValue characters go to setdata, '&' opens a reference
	# (attr4, closed by '\x3b', i.e. ';'), and the character stored in $quote
	# ends the value.
	attr0 Name attrname attr0; SP attrnameend attr1; '=' attrnameend attr2
	attr1 SP attr1; '=' attr2
	attr2 SP attr2; '\''|'"' $quote attr3
	attr3 AttValue setdata attr3; '&' refstart attr4; $quote elem2
	attr4 Ref ref attr4; '\x3b' refend attr3