| # Copyright (c) 2013 Yoran Heling |
| # |
| # Permission is hereby granted, free of charge, to any person obtaining |
| # a copy of this software and associated documentation files (the |
| # "Software"), to deal in the Software without restriction, including |
| # without limitation the rights to use, copy, modify, merge, publish, |
| # distribute, sublicense, and/or sell copies of the Software, and to |
| # permit persons to whom the Software is furnished to do so, subject to |
| # the following conditions: |
| # |
| # The above copyright notice and this permission notice shall be included |
| # in all copies or substantial portions of the Software. |
| # |
| # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
| # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
| # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
| # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
| # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
| # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
| # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
| |
| |
| # Format of this file (informal): |
| # |
| # Line = State Desc (';' Desc)* |
| # Desc = Cond Act* Next |
| # Cond = FunctionName # yxml_isFunctionName(char) |
| # | '$' Varname # match character in Varname |
| # | C-char ('|' C-char)* |
| # Act = FunctionName # yxml_FunctionName(x, char) |
| # | '$' Varname # Store current char into Varname |
| # | "string" # consume string before moving to next state |
| # Next = State |
| # |
| # Basically, it's just a short notation for manually writing a DFA. The script |
| # that compiles this to C is pretty simple and stupid, which explains the |
| # somewhat crude syntax of this file. It'd probably be more convenient to |
| # modify ragel[1] to generate state machine code that can be used in the |
| # yxml_parse() API, but I haven't really looked into that yet. I'm also not |
| # sure how much control I'd lose over the size of the resulting state machine. |
| # |
| # 1. http://www.complang.org/ragel/ |
| |
| # Initial state: accepts an optional UTF-8 byte order mark (bytes EF BB BF), |
| # an SP character, or the '<' that opens the first markup construct. |
| init '\xef' "\xbb\xbf" misc0; SP misc0; '<' le0 |
| |
| # State numbers for the misc/le/lee/leq states: |
| # 0 = before XMLDecl (prolog) |
| # 1 = before first element (prolog/misc) |
| # 2 = inside element (content) |
| # And naming: |
| # misc = Nothing special seen yet |
| # le = Seen '<' |
| # lee = Seen '<!' |
| # leq = Seen '<?' |
| # misc2a = Inside a reference in content: entered on '&' (refstart), left on |
| # '\x3b', which is ';' (presumably escaped because a bare ';' separates Descs). |
| # In misc2 the specific characters '<' and '&' are listed before the generic |
| # Char condition, so they take precedence over plain character data. |
| misc0 SP misc0; '<' le0 |
| misc1 SP misc1; '<' le1 |
| misc2 '<' le2; '&' refstart misc2a; Char setdata misc2 |
| misc2a Ref ref misc2a; '\x3b' refend misc2 |
| |
| # After '<': '!' leads to a comment, DOCTYPE or CDATA section (lee states), |
| # '?' leads to the XMLDecl (only from le0) or a PI, '/' (only from le2, i.e. in |
| # content) leads to an end tag, and a NameStart character opens an element. |
| # Note that le0 and le1 both continue in lee1; there is no lee0 state. |
| le0 '!' lee1; '?' leq0; NameStart elemstart elem0 |
| le1 '!' lee1; '?' pi0; NameStart elemstart elem0 |
| le2 '!' lee2; '?' pi0; '/' etag0; NameStart elemstart elem0 |
| |
| # After '<!': lee1 (prolog) allows a comment or a DOCTYPE, lee2 (content) |
| # allows a comment or a CDATA section. |
| # After '<?' at the very start of the document (leq0): 'x' followed by "ml" |
| # begins the XMLDecl, any other NameStart character begins a PI name. |
| # NOTE(review): because 'x' is tested first and must be followed by "ml", a |
| # leading PI whose target starts with 'x' but is not "xml" looks like it is |
| # rejected here -- confirm whether that is intended. |
| lee1 '-' comment0; 'D' "OCTYPE" dt0 |
| lee2 '-' comment0; '[' "CDATA[" cd0 |
| leq0 'x' "ml" xmldecl0; NameStart pi1 |
| |
| |
| # XMLDecl, starting from '<?xml', returns to misc1 |
| # xmldecl0/1 = before 'version' (at least one SP is required after '<?xml') |
| # xmldecl2/3 = after VersionInfo; 'encoding' or 'standalone' may follow |
| # xmldecl4/5 = after EncodingDecl; only 'standalone' may follow |
| # xmldecl6 = after SDDecl; only optional SP and '?>' remain |
| # xmldecl7 = seen '?', expecting '>' |
| # In states 2 and 4 an SP is required before the next pseudo-attribute; states |
| # 3 and 5 are reached after that SP. |
| # Both paths into std0 consume the full word "standalone", matching the SDDecl |
| # states, which expect to start right after that keyword. |
| xmldecl0 SP xmldecl1 |
| xmldecl1 SP xmldecl1; 'v' "ersion" ver0 |
| xmldecl2 SP xmldecl3; '?' xmldecl7 |
| xmldecl3 SP xmldecl3; '?' xmldecl7; 'e' "ncoding" enc0; 's' "tandalone" std0 |
| xmldecl4 SP xmldecl5; '?' xmldecl7 |
| xmldecl5 SP xmldecl5; '?' xmldecl7; 's' "tandalone" std0 |
| xmldecl6 SP xmldecl6; '?' xmldecl7 |
| xmldecl7 '>' misc1 |
| |
| # VersionInfo, after 'version', returns to xmldecl2 |
| # The opening quote is stored in $quote and the literal "1." is consumed right |
| # after it, so only version values beginning with "1." are accepted. |
| # ver2 requires one Num character; ver3 accepts further Num characters until |
| # the closing quote that matches $quote. |
| ver0 SP ver0; '=' ver1 |
| ver1 SP ver1; '\''|'"' $quote "1." ver2 |
| ver2 Num ver3 |
| ver3 Num ver3; $quote xmldecl2 |
| |
| # EncodingDecl, after 'encoding', returns to xmldecl4 |
| # TODO: Pass the encoding value to the application? |
| # The opening quote is stored in $quote. The value must start with an Alpha |
| # character (enc2), followed by any number of EncName characters up to the |
| # closing quote that matches $quote (enc3). |
| enc0 SP enc0; '=' enc1 |
| enc1 SP enc1; '\''|'"' $quote enc2 |
| enc2 Alpha enc3 |
| enc3 EncName enc3; $quote xmldecl4 |
| |
| # SDDecl, after 'standalone', returns to xmldecl6 |
| # TODO: Pass the standalone flag to the application? |
| # The opening quote is stored in $quote. The value must be exactly "yes" or |
| # "no" (std2), immediately followed by the closing quote that matches $quote |
| # (std3). |
| std0 SP std0; '=' std1 |
| std1 SP std1; '\''|'"' $quote std2 |
| std2 'y' "es" std3; 'n' "o" std3 |
| std3 $quote xmldecl6 |
| |
| |
| # Comment, after '<!-', returns to misc1 or misc2 |
| # comment0 = expecting the second '-' of '<!--' |
| # comment1 = first character of the comment body, checked with CommentStart |
| # comment2 = inside the comment body |
| # comment3 = seen one '-' in the body; any other Char returns to comment2 |
| # comment4 = seen '--'; only '>' is accepted here |
| # NOTE(review): the Next after retmisc repeats the current state; retmisc |
| # presumably selects the real return state (misc1 or misc2) -- confirm in the |
| # C source. |
| comment0 '-' comment1 |
| comment1 CommentStart comment2 |
| comment2 '-' comment3; Char comment2 |
| comment3 '-' comment4; Char comment2 |
| comment4 '>' retmisc comment4 |
| |
| |
| # PI, starting from '<?', returns to misc1 or misc2 |
| # TODO: Verify that the PI name isn't /xml/i |
| # TODO: Pass the name and contents to the application |
| # pi0 = expecting the first character of the target name |
| # pi1 = inside the target name |
| # pi2 = inside the PI contents |
| # pi3 = seen '?' inside the contents; a further '?' stays here so that |
| #       contents ending in '?' (e.g. "<?a b??>") still terminate at '?>' |
| # pi4 = seen '?' directly after the target name; only '>' may follow, which |
| #       accepts a PI without contents (e.g. "<?a?>") |
| # NOTE(review): the Next after retmisc repeats the current state; retmisc |
| # presumably selects the real return state (misc1 or misc2) -- confirm in the |
| # C source. |
| pi0 NameStart pi1 |
| pi1 Name pi1; SP pi2; '?' pi4 |
| pi2 '?' pi3; Char pi2 |
| pi3 '>' retmisc pi3; '?' pi3; Char pi2 |
| pi4 '>' retmisc pi4 |
| |
| |
| # CDSect, starting from '<![CDATA[', returns to misc2 |
| # cd0 = inside the section, cd1 = seen ']', cd2 = seen ']]' |
| # XXX: As written, a ']' or ']]' that turns out not to be part of ']]>' is not |
| # handled completely: the cd1 fallback hands only the current character to |
| # setdata (this table does not pass on the pending ']'), and cd2 accepts |
| # nothing but '>', so input such as ']]]>' or ']]x' has no transition. Fixing |
| # this presumably needs an action that can emit the pending ']' characters. |
| cd0 ']' cd1; Char setdata cd0 |
| cd1 ']' cd2; Char setdata cd0 |
| cd2 '>' misc2 |
| |
| |
| # Doctype, starting from '<!DOCTYPE', returns to misc1 |
| # TODO: This is a hack, all we do is read until we find a '>', not |
| # validating its content. This hack fails if the DTD contains a '>' |
| # character, which is very possible. Unfortunately, just figuring out where a |
| # DTD ends already requires a rather elaborate parser. :-( |
| # dt0 is the only state: '>' is tested before Char, so the first '>' ends the |
| # declaration and every other Char is skipped without any action. |
| dt0 '>' misc1; Char dt0 |
| |
| |
| # End tag, after '</', returns to misc2 |
| # XXX: It's not actually necessary to validate the characters, since the |
| # 'elemclose' function already verifies (incrementally) that the name is |
| # equivalent to the corresponding <Open ..> tag. The only difference is that |
| # with the code below, </x/z> will result in ESYN, whereas a non-validating |
| # version would give ECLOSE. |
| # etag0 = first character of the name, etag1 = inside the name, |
| # etag2 = SP after the name; only more SP or '>' may follow. |
| # elemclose receives every name character; elemcloseend is invoked once, on |
| # the SP or '>' that ends the name in etag1. |
| etag0 NameStart elemclose etag1 |
| etag1 Name elemclose etag1; SP elemcloseend etag2; '>' elemcloseend misc2 |
| etag2 SP etag2; '>' misc2 |
| |
| |
| # Element, after '<X', returns to misc2 |
| # elem0 = inside the element name |
| # elem1 = after SP; an attribute, '/' or '>' may follow |
| # elem2 = directly after an attribute value; SP is required before another |
| #         attribute can start |
| # elem3 = seen '/' of a self-closing tag, expecting '>' (selfclose) |
| # elemnameend is invoked on the character that ends the name in elem0; |
| # attrsend is invoked on every '/' or '>' that leaves the start tag. |
| elem0 Name elemname elem0; SP elemnameend elem1; '/' elemnameend attrsend elem3; '>' elemnameend attrsend misc2 |
| elem1 SP elem1; '/' attrsend elem3; '>' attrsend misc2; NameStart attrstart attr0 |
| elem2 SP elem1; '/' attrsend elem3; '>' attrsend misc2 |
| elem3 '>' selfclose misc2 |
| |
| # Attribute, after NameStart, returns to elem2 |
| # attr0 = inside the attribute name |
| # attr1 = SP after the name, expecting '=' (attrnameend was already invoked |
| #         on the SP that left attr0, so it is not repeated here) |
| # attr2 = after '=', expecting the opening quote, which is stored in $quote |
| # attr3 = inside the value, until the character matching $quote |
| # attr4 = inside a reference in the value: entered on '&' (refstart), left on |
| #         '\x3b', which is ';' |
| attr0 Name attrname attr0; SP attrnameend attr1; '=' attrnameend attr2 |
| attr1 SP attr1; '=' attr2 |
| attr2 SP attr2; '\''|'"' $quote attr3 |
| attr3 AttValue setdata attr3; '&' refstart attr4; $quote elem2 |
| attr4 Ref ref attr4; '\x3b' refend attr3 |
| |