Add proper Unicode handling
diff --git a/json5/json5.g b/json5/json5.g index 8da7a65..c4b98f8 100644 --- a/json5/json5.g +++ b/json5/json5.g
@@ -1,94 +1,130 @@ -grammar = sp value:v sp end -> v +grammar = sp value:v sp end -> v -sp = ws* +sp = ws* -ws = ' ' | '\t' | eol | comment +ws = '\u0020' | eol | comment + | '\u0009' | '\u000B' | '\u000C' | '\u00A0' | '\uFEFF' + | anything:x ?( is_unicat(x, 'Zs') ) -> x -eol = '\r\n' | '\r' | '\n' +eol = '\u000D' '\u000A' | '\u000D' | '\u000A' + | '\u2028' | '\u2029' -comment = '//' (~(eol | end) anything)* (end | eol) - | '/*' (~'*/' anything)* '*/' +comment = '//' (~eol anything)* + | '/*' (~'*/' anything)* '*/' -value = 'null' -> 'None' - | 'true' -> 'True' - | 'false' -> 'False' - | object:v -> ['object', v] - | array:v -> ['array', v] - | string:v -> ['string', v] - | num_literal:v -> ['number', v] +value = 'null' -> 'None' + | 'true' -> 'True' + | 'false' -> 'False' + | object:v -> ['object', v] + | array:v -> ['array', v] + | string:v -> ['string', v] + | num_literal:v -> ['number', v] -object = '{' sp member_list:v sp '}' -> v - | '{' sp '}' -> [] +object = '{' sp member_list:v sp '}' -> v + | '{' sp '}' -> [] -array = '[' sp element_list:v sp ']' -> v - | '[' sp ']' -> [] +array = '[' sp element_list:v sp ']' -> v + | '[' sp ']' -> [] -string = squote sqchars:s squote -> s - | dquote dqchars:s dquote -> s +string = squote sqchar*:cs squote -> join('', cs) + | dquote dqchar*:cs dquote -> join('', cs) -sqchars = (~(squote | eol) sqchar)*:cs -> ''.join(cs) +sqchar = bslash esc_char:c -> c + | bslash eol -> '' + | ~squote ~eol anything:c -> c -sqchar = bslash squote -> '\x5C\x27' - | bslash '\n' -> '\n' - | bslash '\r\n' -> '\r\n' - | bslash '\r' -> '\r' - | bslash bslash -> '\x5C\x5C' - | anything +dqchar = bslash esc_char:c -> c + | bslash eol -> '' + | ~dquote ~eol anything:c -> c -dqchars = (~(dquote | eol) dqchar)*:cs -> ''.join(cs) +bslash = '\u005C' -dqchar = bslash dquote -> '\x5C\x22' - | bslash '\n' -> '\n' - | bslash '\r\n' -> '\r\n' - | bslash '\r' -> '\r' - | bslash bslash -> '\x5C\x5C' - | anything +squote = '\u0027' -bslash = '\x5C' +dquote = 
'\u0022' -squote = '\x27' +esc_char = 'b' -> '\u0008' + | 'f' -> '\u000C' + | 'n' -> '\u000A' + | 'r' -> '\u000D' + | 't' -> '\u0009' + | 'v' -> '\u000B' + | squote -> '\u0027' + | dquote -> '\u0022' + | bslash -> '\u005C' + | hex_esc:c -> c + | unicode_esc:c -> c -dquote = '\x22' +hex_esc = 'x' hex:h1 hex:h2 -> xtou(h1 + h2) -element_list = value:v sp ',' sp element_list:vs -> [v] + vs - | value:v sp ',' -> [v] - | value:v -> [v] - -member_list = member:m sp ',' sp member_list:ms -> [m] + ms - | member:m sp ',' -> [m] - | member:m -> [m] +unicode_esc = 'u' hex:a hex:b hex:c hex:d -> xtou(a + b + c + d) -member = string:k sp ':' sp value:v -> [k, v] - | ident:k sp ':' sp value:v -> [k, v] +element_list = value:v sp ',' sp element_list:vs -> [v] + vs + | value:v sp ',' -> [v] + | value:v -> [v] -ident = id_start:hd id_continue*:tl -> ''.join([hd] + tl) +member_list = member:m sp ',' sp member_list:ms -> [m] + ms + | member:m sp ',' -> [m] + | member:m -> [m] -id_start = letter | '$' | '_' -id_continue = id_start | digit +member = string:k sp ':' sp value:v -> [k, v] + | ident:k sp ':' sp value:v -> [k, v] -num_literal = '-' num_literal:n -> '-' + n - | '+'? 
dec_literal:d ~(id_start | digit) -> d - | hex_literal - | 'Infinity' - | 'NaN' - -dec_literal = dec_int_lit:d frac:f exp:e -> d + f + e - | dec_int_lit:d frac:f -> d + f - | dec_int_lit:d exp:e -> d + e - | dec_int_lit:d -> d - | frac:f exp:e -> f + e - | frac:f -> f +ident = id_start:hd id_continue*:tl -> join('', [hd] + tl) -dec_int_lit = '0' ~digit -> '0' - | nonzerodigit:n digit*:ds -> n + ''.join(ds) +id_start = ascii_id_start + | other_id_start + | bslash unicode_esc -nonzerodigit = '1'..'9' +ascii_id_start = 'a'..'z' + | 'A'..'Z' + | digit + | '$' + | '_' -hex_literal = ('0x' | '0X') hex_digit+:hs -> '0x' + ''.join(hs) +other_id_start = anything:x ?(is_unicat(x, 'Ll')) -> x + | anything:x ?(is_unicat(x, 'Lm')) -> x + | anything:x ?(is_unicat(x, 'Lo')) -> x + | anything:x ?(is_unicat(x, 'Lt')) -> x + | anything:x ?(is_unicat(x, 'Lu')) -> x + | anything:x ?(is_unicat(x, 'Nl')) -> x -hex_digit = 'a'..'f' | 'A'..'F' | digit +id_continue = ascii_id_start + | digit + | other_id_start + | anything:x ?(is_unicat(x, 'Mn')) -> x + | anything:x ?(is_unicat(x, 'Mc')) -> x + | anything:x ?(is_unicat(x, 'Nd')) -> x + | anything:x ?(is_unicat(x, 'Pc')) -> x + | bslash unicode_esc + | '\u200C' + | '\u200D' -frac = '.' digit*:ds -> '.' + ''.join(ds) +num_literal = '-' num_literal:n -> '-' + n + | '+'? dec_literal:d ~id_start -> d + | hex_literal + | 'Infinity' + | 'NaN' -exp = ('e' | 'E') ('+' | '-'):s digit*:ds -> 'e' + s + ''.join(ds) - | ('e' | 'E') digit*:ds -> 'e' + ''.join(ds) +dec_literal = dec_int_lit:d frac:f exp:e -> d + f + e + | dec_int_lit:d frac:f -> d + f + | dec_int_lit:d exp:e -> d + e + | dec_int_lit:d -> d + | frac:f exp:e -> f + e + | frac:f -> f + +dec_int_lit = '0' ~digit -> '0' + | nonzerodigit:d digit*:ds -> d + join('', ds) + +digit = '0'..'9' + +nonzerodigit = '1'..'9' + +hex_literal = ('0x' | '0X') hex+:hs -> '0x' + join('', hs) + +hex = 'a'..'f' | 'A'..'F' | digit + +frac = '.' digit*:ds -> '.' 
+ join('', ds) + +exp = ('e' | 'E') ('+' | '-'):s digit*:ds -> 'e' + s + join('', ds) + | ('e' | 'E') digit*:ds -> 'e' + join('', ds)
diff --git a/json5/parser.py b/json5/parser.py index b6fc0b9..bb07626 100755 --- a/json5/parser.py +++ b/json5/parser.py
@@ -2,7 +2,7 @@ class Parser(object): def __init__(self, msg, fname, starting_rule='grammar'): - self.msg = msg + self.msg = unicode(msg) self.end = len(msg) self.fname = fname self.starting_rule = starting_rule @@ -37,14 +37,16 @@ def _err_str(self): lineno, colno, _ = self._err_offsets() - prefix = '%s:%d' % (self.fname, lineno) + prefix = u'%s:%d' % (self.fname, lineno) if isinstance(self.err, basestring): - return '%s %s' % (prefix, self.err) + return u'%s %s' % (prefix, self.err) exps = list(self.errset) if len(exps) > 1: - return '%s Unexpected "%s" at column %d' % ( + return u'%s Unexpected "%s" at column %d' % ( prefix, self.msg[self.errpos], colno) - return '%s Expecting a "%s" at column %d, got a "%s"' % ( + if self.errpos == 0 and len(self.msg) == 0: + return u'input must not be empty' + return u'%s Expecting a "%s" at column %d, got a "%s"' % ( prefix, exps[0], colno, self.msg[self.errpos]) def _err_offsets(self): @@ -53,7 +55,7 @@ i = 0 begpos = 0 while i < self.errpos: - if self.msg[i] == '\n': + if self.msg[i] == u'\n': lineno += 1 colno = 1 begpos = i @@ -63,7 +65,7 @@ return lineno, colno, begpos def _esc(self, val): - return str(val) + return unicode(val) def _expect(self, expr): p = self.pos @@ -121,7 +123,7 @@ def _ws_(self): p = self.pos def choice_0(): - self._expect(' ') + self._expect(u' ') choice_0() if not self.err: return @@ -129,7 +131,7 @@ self.err = False self.pos = p def choice_1(): - self._expect('\t') + self._eol_() choice_1() if not self.err: return @@ -137,7 +139,7 @@ self.err = False self.pos = p def choice_2(): - self._eol_() + self._comment_() choice_2() if not self.err: return @@ -145,13 +147,75 @@ self.err = False self.pos = p def choice_3(): - self._comment_() + self._expect(u'\t') choice_3() + if not self.err: + return + + self.err = False + self.pos = p + def choice_4(): + self._expect(u'\x0b') + choice_4() + if not self.err: + return + + self.err = False + self.pos = p + def choice_5(): + self._expect(u'\x0c') + 
choice_5() + if not self.err: + return + + self.err = False + self.pos = p + def choice_6(): + self._expect(u'\xa0') + choice_6() + if not self.err: + return + + self.err = False + self.pos = p + def choice_7(): + self._expect(u'\ufeff') + choice_7() + if not self.err: + return + + self.err = False + self.pos = p + def choice_8(): + self._push('ws_8') + self._anything_() + if not self.err: + self._set('x', self.val) + if self.err: + self._pop('ws_8') + return + v = self._is_unicat(self._get('x'), u'Zs') + if v: + self.val = v + self.err = None + else: + self.err = "pred check failed" + self.val = None + if self.err: + self._pop('ws_8') + return + self.val = self._get('x') + self.err = None + self._pop('ws_8') + choice_8() def _eol_(self): p = self.pos def choice_0(): - self._expect('\r\n') + self._expect(u'\r') + if self.err: + return + self._expect(u'\n') choice_0() if not self.err: return @@ -159,7 +223,7 @@ self.err = False self.pos = p def choice_1(): - self._expect('\r') + self._expect(u'\r') choice_1() if not self.err: return @@ -167,13 +231,29 @@ self.err = False self.pos = p def choice_2(): - self._expect('\n') + self._expect(u'\n') choice_2() + if not self.err: + return + + self.err = False + self.pos = p + def choice_3(): + self._expect(u'\u2028') + choice_3() + if not self.err: + return + + self.err = False + self.pos = p + def choice_4(): + self._expect(u'\u2029') + choice_4() def _comment_(self): p = self.pos def choice_0(): - self._expect('//') + self._expect(u'//') if self.err: return vs = [] @@ -181,20 +261,7 @@ p = self.pos def group(): p = self.pos - def group(): - p = self.pos - def choice_0(): - self._eol_() - choice_0() - if not self.err: - return - - self.err = False - self.pos = p - def choice_1(): - self._end_() - choice_1() - group() + self._eol_() self.pos = p if not self.err: self.err = "not" @@ -211,22 +278,6 @@ self.pos = p self.val = vs self.err = None - if self.err: - return - def group(): - p = self.pos - def choice_0(): - 
self._end_() - choice_0() - if not self.err: - return - - self.err = False - self.pos = p - def choice_1(): - self._eol_() - choice_1() - group() choice_0() if not self.err: return @@ -234,7 +285,7 @@ self.err = False self.pos = p def choice_1(): - self._expect('/*') + self._expect(u'/*') if self.err: return vs = [] @@ -242,7 +293,7 @@ p = self.pos def group(): p = self.pos - self._expect('*/') + self._expect(u'*/') self.pos = p if not self.err: self.err = "not" @@ -261,16 +312,16 @@ self.err = None if self.err: return - self._expect('*/') + self._expect(u'*/') choice_1() def _value_(self): p = self.pos def choice_0(): - self._expect('null') + self._expect(u'null') if self.err: return - self.val = 'None' + self.val = u'None' self.err = None choice_0() if not self.err: @@ -279,10 +330,10 @@ self.err = False self.pos = p def choice_1(): - self._expect('true') + self._expect(u'true') if self.err: return - self.val = 'True' + self.val = u'True' self.err = None choice_1() if not self.err: @@ -291,10 +342,10 @@ self.err = False self.pos = p def choice_2(): - self._expect('false') + self._expect(u'false') if self.err: return - self.val = 'False' + self.val = u'False' self.err = None choice_2() if not self.err: @@ -310,7 +361,7 @@ if self.err: self._pop('value_3') return - self.val = ['object', self._get('v')] + self.val = [u'object', self._get('v')] self.err = None self._pop('value_3') choice_3() @@ -327,7 +378,7 @@ if self.err: self._pop('value_4') return - self.val = ['array', self._get('v')] + self.val = [u'array', self._get('v')] self.err = None self._pop('value_4') choice_4() @@ -344,7 +395,7 @@ if self.err: self._pop('value_5') return - self.val = ['string', self._get('v')] + self.val = [u'string', self._get('v')] self.err = None self._pop('value_5') choice_5() @@ -361,7 +412,7 @@ if self.err: self._pop('value_6') return - self.val = ['number', self._get('v')] + self.val = [u'number', self._get('v')] self.err = None self._pop('value_6') choice_6() @@ -370,7 +421,7 
@@ p = self.pos def choice_0(): self._push('object_0') - self._expect('{') + self._expect(u'{') if self.err: self._pop('object_0') return @@ -388,7 +439,7 @@ if self.err: self._pop('object_0') return - self._expect('}') + self._expect(u'}') if self.err: self._pop('object_0') return @@ -402,13 +453,13 @@ self.err = False self.pos = p def choice_1(): - self._expect('{') + self._expect(u'{') if self.err: return self._sp_() if self.err: return - self._expect('}') + self._expect(u'}') if self.err: return self.val = [] @@ -419,7 +470,7 @@ p = self.pos def choice_0(): self._push('array_0') - self._expect('[') + self._expect(u'[') if self.err: self._pop('array_0') return @@ -437,7 +488,7 @@ if self.err: self._pop('array_0') return - self._expect(']') + self._expect(u']') if self.err: self._pop('array_0') return @@ -451,13 +502,13 @@ self.err = False self.pos = p def choice_1(): - self._expect('[') + self._expect(u'[') if self.err: return self._sp_() if self.err: return - self._expect(']') + self._expect(u']') if self.err: return self.val = [] @@ -472,9 +523,18 @@ if self.err: self._pop('string_0') return - self._sqchars_() + vs = [] + while not self.err: + p = self.pos + self._sqchar_() + if not self.err: + vs.append(self.val) + else: + self.pos = p + self.val = vs + self.err = None if not self.err: - self._set('s', self.val) + self._set('cs', self.val) if self.err: self._pop('string_0') return @@ -482,7 +542,7 @@ if self.err: self._pop('string_0') return - self.val = self._get('s') + self.val = self._join(u'', self._get('cs')) self.err = None self._pop('string_0') choice_0() @@ -497,9 +557,18 @@ if self.err: self._pop('string_1') return - self._dqchars_() + vs = [] + while not self.err: + p = self.pos + self._dqchar_() + if not self.err: + vs.append(self.val) + else: + self.pos = p + self.val = vs + self.err = None if not self.err: - self._set('s', self.val) + self._set('cs', self.val) if self.err: self._pop('string_1') return @@ -507,68 +576,28 @@ if self.err: 
self._pop('string_1') return - self.val = self._get('s') + self.val = self._join(u'', self._get('cs')) self.err = None self._pop('string_1') choice_1() - def _sqchars_(self): - self._push('sqchars') - vs = [] - while not self.err: - p = self.pos - def group(): - p = self.pos - def group(): - p = self.pos - def choice_0(): - self._squote_() - choice_0() - if not self.err: - return - - self.err = False - self.pos = p - def choice_1(): - self._eol_() - choice_1() - group() - self.pos = p - if not self.err: - self.err = "not" - self.val = None - return - self.err = None - if self.err: - return - self._sqchar_() - group() - if not self.err: - vs.append(self.val) - else: - self.pos = p - self.val = vs - self.err = None - if not self.err: - self._set('cs', self.val) - if self.err: - self._pop('sqchars') - return - self.val = ''.join(self._get('cs')) - self.err = None - self._pop('sqchars') - def _sqchar_(self): p = self.pos def choice_0(): + self._push('sqchar_0') self._bslash_() if self.err: + self._pop('sqchar_0') return - self._squote_() + self._esc_char_() + if not self.err: + self._set('c', self.val) if self.err: + self._pop('sqchar_0') return - self.val = '\x5C\x27' + self.val = self._get('c') self.err = None + self._pop('sqchar_0') choice_0() if not self.err: return @@ -579,10 +608,10 @@ self._bslash_() if self.err: return - self._expect('\n') + self._eol_() if self.err: return - self.val = '\n' + self.val = u'' self.err = None choice_1() if not self.err: @@ -591,111 +620,59 @@ self.err = False self.pos = p def choice_2(): - self._bslash_() - if self.err: - return - self._expect('\r\n') - if self.err: - return - self.val = '\r\n' - self.err = None - choice_2() - if not self.err: - return - - self.err = False - self.pos = p - def choice_3(): - self._bslash_() - if self.err: - return - self._expect('\r') - if self.err: - return - self.val = '\r' - self.err = None - choice_3() - if not self.err: - return - - self.err = False - self.pos = p - def choice_4(): - 
self._bslash_() - if self.err: - return - self._bslash_() - if self.err: - return - self.val = '\x5C\x5C' - self.err = None - choice_4() - if not self.err: - return - - self.err = False - self.pos = p - def choice_5(): - self._anything_() - choice_5() - - def _dqchars_(self): - self._push('dqchars') - vs = [] - while not self.err: + self._push('sqchar_2') p = self.pos - def group(): - p = self.pos - def group(): - p = self.pos - def choice_0(): - self._dquote_() - choice_0() - if not self.err: - return - - self.err = False - self.pos = p - def choice_1(): - self._eol_() - choice_1() - group() - self.pos = p - if not self.err: - self.err = "not" - self.val = None - return - self.err = None - if self.err: - return - self._dqchar_() - group() + self._squote_() + self.pos = p if not self.err: - vs.append(self.val) - else: - self.pos = p - self.val = vs - self.err = None - if not self.err: - self._set('cs', self.val) - if self.err: - self._pop('dqchars') - return - self.val = ''.join(self._get('cs')) - self.err = None - self._pop('dqchars') + self.err = "not" + self.val = None + self._pop('sqchar_2') + return + self.err = None + if self.err: + self._pop('sqchar_2') + return + p = self.pos + self._eol_() + self.pos = p + if not self.err: + self.err = "not" + self.val = None + self._pop('sqchar_2') + return + self.err = None + if self.err: + self._pop('sqchar_2') + return + self._anything_() + if not self.err: + self._set('c', self.val) + if self.err: + self._pop('sqchar_2') + return + self.val = self._get('c') + self.err = None + self._pop('sqchar_2') + choice_2() def _dqchar_(self): p = self.pos def choice_0(): + self._push('dqchar_0') self._bslash_() if self.err: + self._pop('dqchar_0') return - self._dquote_() + self._esc_char_() + if not self.err: + self._set('c', self.val) if self.err: + self._pop('dqchar_0') return - self.val = '\x5C\x22' + self.val = self._get('c') self.err = None + self._pop('dqchar_0') choice_0() if not self.err: return @@ -706,10 +683,10 @@ 
self._bslash_() if self.err: return - self._expect('\n') + self._eol_() if self.err: return - self.val = '\n' + self.val = u'' self.err = None choice_1() if not self.err: @@ -718,13 +695,82 @@ self.err = False self.pos = p def choice_2(): - self._bslash_() + self._push('dqchar_2') + p = self.pos + self._dquote_() + self.pos = p + if not self.err: + self.err = "not" + self.val = None + self._pop('dqchar_2') + return + self.err = None + if self.err: + self._pop('dqchar_2') + return + p = self.pos + self._eol_() + self.pos = p + if not self.err: + self.err = "not" + self.val = None + self._pop('dqchar_2') + return + self.err = None + if self.err: + self._pop('dqchar_2') + return + self._anything_() + if not self.err: + self._set('c', self.val) + if self.err: + self._pop('dqchar_2') + return + self.val = self._get('c') + self.err = None + self._pop('dqchar_2') + choice_2() + + def _bslash_(self): + self._expect(u'\\') + + def _squote_(self): + self._expect(u"'") + + def _dquote_(self): + self._expect(u'"') + + def _esc_char_(self): + p = self.pos + def choice_0(): + self._expect(u'b') if self.err: return - self._expect('\r\n') + self.val = u'\x08' + self.err = None + choice_0() + if not self.err: + return + + self.err = False + self.pos = p + def choice_1(): + self._expect(u'f') if self.err: return - self.val = '\r\n' + self.val = u'\x0c' + self.err = None + choice_1() + if not self.err: + return + + self.err = False + self.pos = p + def choice_2(): + self._expect(u'n') + if self.err: + return + self.val = u'\n' self.err = None choice_2() if not self.err: @@ -733,13 +779,10 @@ self.err = False self.pos = p def choice_3(): - self._bslash_() + self._expect(u'r') if self.err: return - self._expect('\r') - if self.err: - return - self.val = '\r' + self.val = u'\r' self.err = None choice_3() if not self.err: @@ -748,13 +791,10 @@ self.err = False self.pos = p def choice_4(): - self._bslash_() + self._expect(u't') if self.err: return - self._bslash_() - if self.err: - return 
- self.val = '\x5C\x5C' + self.val = u'\t' self.err = None choice_4() if not self.err: @@ -763,17 +803,138 @@ self.err = False self.pos = p def choice_5(): - self._anything_() + self._expect(u'v') + if self.err: + return + self.val = u'\x0b' + self.err = None choice_5() + if not self.err: + return - def _bslash_(self): - self._expect('\x5C') + self.err = False + self.pos = p + def choice_6(): + self._squote_() + if self.err: + return + self.val = u"'" + self.err = None + choice_6() + if not self.err: + return - def _squote_(self): - self._expect('\x27') + self.err = False + self.pos = p + def choice_7(): + self._dquote_() + if self.err: + return + self.val = u'"' + self.err = None + choice_7() + if not self.err: + return - def _dquote_(self): - self._expect('\x22') + self.err = False + self.pos = p + def choice_8(): + self._bslash_() + if self.err: + return + self.val = u'\\' + self.err = None + choice_8() + if not self.err: + return + + self.err = False + self.pos = p + def choice_9(): + self._push('esc_char_9') + self._hex_esc_() + if not self.err: + self._set('c', self.val) + if self.err: + self._pop('esc_char_9') + return + self.val = self._get('c') + self.err = None + self._pop('esc_char_9') + choice_9() + if not self.err: + return + + self.err = False + self.pos = p + def choice_10(): + self._push('esc_char_10') + self._unicode_esc_() + if not self.err: + self._set('c', self.val) + if self.err: + self._pop('esc_char_10') + return + self.val = self._get('c') + self.err = None + self._pop('esc_char_10') + choice_10() + + def _hex_esc_(self): + self._push('hex_esc') + self._expect(u'x') + if self.err: + self._pop('hex_esc') + return + self._hex_() + if not self.err: + self._set('h1', self.val) + if self.err: + self._pop('hex_esc') + return + self._hex_() + if not self.err: + self._set('h2', self.val) + if self.err: + self._pop('hex_esc') + return + self.val = self._xtou(self._get('h1') + self._get('h2')) + self.err = None + self._pop('hex_esc') + + def 
_unicode_esc_(self): + self._push('unicode_esc') + self._expect(u'u') + if self.err: + self._pop('unicode_esc') + return + self._hex_() + if not self.err: + self._set('a', self.val) + if self.err: + self._pop('unicode_esc') + return + self._hex_() + if not self.err: + self._set('b', self.val) + if self.err: + self._pop('unicode_esc') + return + self._hex_() + if not self.err: + self._set('c', self.val) + if self.err: + self._pop('unicode_esc') + return + self._hex_() + if not self.err: + self._set('d', self.val) + if self.err: + self._pop('unicode_esc') + return + self.val = self._xtou(self._get('a') + self._get('b') + self._get('c') + self._get('d')) + self.err = None + self._pop('unicode_esc') def _element_list_(self): p = self.pos @@ -789,7 +950,7 @@ if self.err: self._pop('element_list_0') return - self._expect(',') + self._expect(u',') if self.err: self._pop('element_list_0') return @@ -824,7 +985,7 @@ if self.err: self._pop('element_list_1') return - self._expect(',') + self._expect(u',') if self.err: self._pop('element_list_1') return @@ -864,7 +1025,7 @@ if self.err: self._pop('member_list_0') return - self._expect(',') + self._expect(u',') if self.err: self._pop('member_list_0') return @@ -899,7 +1060,7 @@ if self.err: self._pop('member_list_1') return - self._expect(',') + self._expect(u',') if self.err: self._pop('member_list_1') return @@ -939,7 +1100,7 @@ if self.err: self._pop('member_0') return - self._expect(':') + self._expect(u':') if self.err: self._pop('member_0') return @@ -974,7 +1135,7 @@ if self.err: self._pop('member_1') return - self._expect(':') + self._expect(u':') if self.err: self._pop('member_1') return @@ -1016,14 +1177,14 @@ if self.err: self._pop('ident') return - self.val = ''.join([self._get('hd')] + self._get('tl')) + self.val = self._join(u'', [self._get('hd')] + self._get('tl')) self.err = None self._pop('ident') def _id_start_(self): p = self.pos def choice_0(): - self._letter_() + self._ascii_id_start_() choice_0() if not 
self.err: return @@ -1031,7 +1192,7 @@ self.err = False self.pos = p def choice_1(): - self._expect('$') + self._other_id_start_() choice_1() if not self.err: return @@ -1039,13 +1200,246 @@ self.err = False self.pos = p def choice_2(): - self._expect('_') + self._bslash_() + if self.err: + return + self._unicode_esc_() choice_2() + def _ascii_id_start_(self): + p = self.pos + def choice_0(): + i = u'a' + j = u'z' + if (self.pos == self.end or + ord(self.msg[self.pos]) < ord(i) or + ord(self.msg[self.pos]) > ord(j)): + self.val = None + self.err = True + if self.pos >= self.errpos: + if self.pos > self.errpos: + self.errset = set() + self.errset.add('something between %s and %s' % (i, j)) + self.errpos = self.pos + else: + self.val = self.msg[self.pos] + self.err = False + self.pos += 1 + return + choice_0() + if not self.err: + return + + self.err = False + self.pos = p + def choice_1(): + i = u'A' + j = u'Z' + if (self.pos == self.end or + ord(self.msg[self.pos]) < ord(i) or + ord(self.msg[self.pos]) > ord(j)): + self.val = None + self.err = True + if self.pos >= self.errpos: + if self.pos > self.errpos: + self.errset = set() + self.errset.add('something between %s and %s' % (i, j)) + self.errpos = self.pos + else: + self.val = self.msg[self.pos] + self.err = False + self.pos += 1 + return + choice_1() + if not self.err: + return + + self.err = False + self.pos = p + def choice_2(): + self._digit_() + choice_2() + if not self.err: + return + + self.err = False + self.pos = p + def choice_3(): + self._expect(u'$') + choice_3() + if not self.err: + return + + self.err = False + self.pos = p + def choice_4(): + self._expect(u'_') + choice_4() + + def _other_id_start_(self): + p = self.pos + def choice_0(): + self._push('other_id_start_0') + self._anything_() + if not self.err: + self._set('x', self.val) + if self.err: + self._pop('other_id_start_0') + return + v = self._is_unicat(self._get('x'), u'Ll') + if v: + self.val = v + self.err = None + else: + self.err = 
"pred check failed" + self.val = None + if self.err: + self._pop('other_id_start_0') + return + self.val = self._get('x') + self.err = None + self._pop('other_id_start_0') + choice_0() + if not self.err: + return + + self.err = False + self.pos = p + def choice_1(): + self._push('other_id_start_1') + self._anything_() + if not self.err: + self._set('x', self.val) + if self.err: + self._pop('other_id_start_1') + return + v = self._is_unicat(self._get('x'), u'Lm') + if v: + self.val = v + self.err = None + else: + self.err = "pred check failed" + self.val = None + if self.err: + self._pop('other_id_start_1') + return + self.val = self._get('x') + self.err = None + self._pop('other_id_start_1') + choice_1() + if not self.err: + return + + self.err = False + self.pos = p + def choice_2(): + self._push('other_id_start_2') + self._anything_() + if not self.err: + self._set('x', self.val) + if self.err: + self._pop('other_id_start_2') + return + v = self._is_unicat(self._get('x'), u'Lo') + if v: + self.val = v + self.err = None + else: + self.err = "pred check failed" + self.val = None + if self.err: + self._pop('other_id_start_2') + return + self.val = self._get('x') + self.err = None + self._pop('other_id_start_2') + choice_2() + if not self.err: + return + + self.err = False + self.pos = p + def choice_3(): + self._push('other_id_start_3') + self._anything_() + if not self.err: + self._set('x', self.val) + if self.err: + self._pop('other_id_start_3') + return + v = self._is_unicat(self._get('x'), u'Lt') + if v: + self.val = v + self.err = None + else: + self.err = "pred check failed" + self.val = None + if self.err: + self._pop('other_id_start_3') + return + self.val = self._get('x') + self.err = None + self._pop('other_id_start_3') + choice_3() + if not self.err: + return + + self.err = False + self.pos = p + def choice_4(): + self._push('other_id_start_4') + self._anything_() + if not self.err: + self._set('x', self.val) + if self.err: + self._pop('other_id_start_4') 
+ return + v = self._is_unicat(self._get('x'), u'Lu') + if v: + self.val = v + self.err = None + else: + self.err = "pred check failed" + self.val = None + if self.err: + self._pop('other_id_start_4') + return + self.val = self._get('x') + self.err = None + self._pop('other_id_start_4') + choice_4() + if not self.err: + return + + self.err = False + self.pos = p + def choice_5(): + self._push('other_id_start_5') + self._anything_() + if not self.err: + self._set('x', self.val) + if self.err: + self._pop('other_id_start_5') + return + v = self._is_unicat(self._get('x'), u'Nl') + if v: + self.val = v + self.err = None + else: + self.err = "pred check failed" + self.val = None + if self.err: + self._pop('other_id_start_5') + return + self.val = self._get('x') + self.err = None + self._pop('other_id_start_5') + choice_5() + def _id_continue_(self): p = self.pos def choice_0(): - self._id_start_() + self._ascii_id_start_() choice_0() if not self.err: return @@ -1055,12 +1449,155 @@ def choice_1(): self._digit_() choice_1() + if not self.err: + return + + self.err = False + self.pos = p + def choice_2(): + self._other_id_start_() + choice_2() + if not self.err: + return + + self.err = False + self.pos = p + def choice_3(): + self._push('id_continue_3') + self._anything_() + if not self.err: + self._set('x', self.val) + if self.err: + self._pop('id_continue_3') + return + v = self._is_unicat(self._get('x'), u'Mn') + if v: + self.val = v + self.err = None + else: + self.err = "pred check failed" + self.val = None + if self.err: + self._pop('id_continue_3') + return + self.val = self._get('x') + self.err = None + self._pop('id_continue_3') + choice_3() + if not self.err: + return + + self.err = False + self.pos = p + def choice_4(): + self._push('id_continue_4') + self._anything_() + if not self.err: + self._set('x', self.val) + if self.err: + self._pop('id_continue_4') + return + v = self._is_unicat(self._get('x'), u'Mc') + if v: + self.val = v + self.err = None + else: + 
self.err = "pred check failed" + self.val = None + if self.err: + self._pop('id_continue_4') + return + self.val = self._get('x') + self.err = None + self._pop('id_continue_4') + choice_4() + if not self.err: + return + + self.err = False + self.pos = p + def choice_5(): + self._push('id_continue_5') + self._anything_() + if not self.err: + self._set('x', self.val) + if self.err: + self._pop('id_continue_5') + return + v = self._is_unicat(self._get('x'), u'Nd') + if v: + self.val = v + self.err = None + else: + self.err = "pred check failed" + self.val = None + if self.err: + self._pop('id_continue_5') + return + self.val = self._get('x') + self.err = None + self._pop('id_continue_5') + choice_5() + if not self.err: + return + + self.err = False + self.pos = p + def choice_6(): + self._push('id_continue_6') + self._anything_() + if not self.err: + self._set('x', self.val) + if self.err: + self._pop('id_continue_6') + return + v = self._is_unicat(self._get('x'), u'Pc') + if v: + self.val = v + self.err = None + else: + self.err = "pred check failed" + self.val = None + if self.err: + self._pop('id_continue_6') + return + self.val = self._get('x') + self.err = None + self._pop('id_continue_6') + choice_6() + if not self.err: + return + + self.err = False + self.pos = p + def choice_7(): + self._bslash_() + if self.err: + return + self._unicode_esc_() + choice_7() + if not self.err: + return + + self.err = False + self.pos = p + def choice_8(): + self._expect(u'\u200c') + choice_8() + if not self.err: + return + + self.err = False + self.pos = p + def choice_9(): + self._expect(u'\u200d') + choice_9() def _num_literal_(self): p = self.pos def choice_0(): self._push('num_literal_0') - self._expect('-') + self._expect(u'-') if self.err: self._pop('num_literal_0') return @@ -1070,7 +1607,7 @@ if self.err: self._pop('num_literal_0') return - self.val = '-' + self._get('n') + self.val = u'-' + self._get('n') self.err = None self._pop('num_literal_0') choice_0() @@ -1082,7 
+1619,7 @@ def choice_1(): self._push('num_literal_1') p = self.pos - self._expect('+') + self._expect(u'+') if self.err: self.val = [] self.err = None @@ -1099,20 +1636,7 @@ self._pop('num_literal_1') return p = self.pos - def group(): - p = self.pos - def choice_0(): - self._id_start_() - choice_0() - if not self.err: - return - - self.err = False - self.pos = p - def choice_1(): - self._digit_() - choice_1() - group() + self._id_start_() self.pos = p if not self.err: self.err = "not" @@ -1141,7 +1665,7 @@ self.err = False self.pos = p def choice_3(): - self._expect('Infinity') + self._expect(u'Infinity') choice_3() if not self.err: return @@ -1149,7 +1673,7 @@ self.err = False self.pos = p def choice_4(): - self._expect('NaN') + self._expect(u'NaN') choice_4() def _dec_literal_(self): @@ -1285,7 +1809,7 @@ def _dec_int_lit_(self): p = self.pos def choice_0(): - self._expect('0') + self._expect(u'0') if self.err: return p = self.pos @@ -1298,7 +1822,7 @@ self.err = None if self.err: return - self.val = '0' + self.val = u'0' self.err = None choice_0() if not self.err: @@ -1310,7 +1834,7 @@ self._push('dec_int_lit_1') self._nonzerodigit_() if not self.err: - self._set('n', self.val) + self._set('d', self.val) if self.err: self._pop('dec_int_lit_1') return @@ -1329,11 +1853,30 @@ if self.err: self._pop('dec_int_lit_1') return - self.val = self._get('n') + ''.join(self._get('ds')) + self.val = self._get('d') + self._join(u'', self._get('ds')) self.err = None self._pop('dec_int_lit_1') choice_1() + def _digit_(self): + i = u'0' + j = u'9' + if (self.pos == self.end or + ord(self.msg[self.pos]) < ord(i) or + ord(self.msg[self.pos]) > ord(j)): + self.val = None + self.err = True + if self.pos >= self.errpos: + if self.pos > self.errpos: + self.errset = set() + self.errset.add('something between %s and %s' % (i, j)) + self.errpos = self.pos + else: + self.val = self.msg[self.pos] + self.err = False + self.pos += 1 + return + def _nonzerodigit_(self): i = u'1' j = u'9' @@ 
-1358,7 +1901,7 @@ def group(): p = self.pos def choice_0(): - self._expect('0x') + self._expect(u'0x') choice_0() if not self.err: return @@ -1366,21 +1909,21 @@ self.err = False self.pos = p def choice_1(): - self._expect('0X') + self._expect(u'0X') choice_1() group() if self.err: self._pop('hex_literal') return vs = [] - self._hex_digit_() + self._hex_() if self.err: self._pop('hex_literal') return vs.append(self.val) while not self.err: p = self.pos - self._hex_digit_() + self._hex_() if not self.err: vs.append(self.val) else: @@ -1392,11 +1935,11 @@ if self.err: self._pop('hex_literal') return - self.val = '0x' + ''.join(self._get('hs')) + self.val = u'0x' + self._join(u'', self._get('hs')) self.err = None self._pop('hex_literal') - def _hex_digit_(self): + def _hex_(self): p = self.pos def choice_0(): i = u'a' @@ -1452,7 +1995,7 @@ def _frac_(self): self._push('frac') - self._expect('.') + self._expect(u'.') if self.err: self._pop('frac') return @@ -1471,7 +2014,7 @@ if self.err: self._pop('frac') return - self.val = '.' + ''.join(self._get('ds')) + self.val = u'.' 
+ self._join(u'', self._get('ds')) self.err = None self._pop('frac') @@ -1482,7 +2025,7 @@ def group(): p = self.pos def choice_0(): - self._expect('e') + self._expect(u'e') choice_0() if not self.err: return @@ -1490,7 +2033,7 @@ self.err = False self.pos = p def choice_1(): - self._expect('E') + self._expect(u'E') choice_1() group() if self.err: @@ -1499,7 +2042,7 @@ def group(): p = self.pos def choice_0(): - self._expect('+') + self._expect(u'+') choice_0() if not self.err: return @@ -1507,7 +2050,7 @@ self.err = False self.pos = p def choice_1(): - self._expect('-') + self._expect(u'-') choice_1() group() if not self.err: @@ -1530,7 +2073,7 @@ if self.err: self._pop('exp_0') return - self.val = 'e' + self._get('s') + ''.join(self._get('ds')) + self.val = u'e' + self._get('s') + self._join(u'', self._get('ds')) self.err = None self._pop('exp_0') choice_0() @@ -1544,7 +2087,7 @@ def group(): p = self.pos def choice_0(): - self._expect('e') + self._expect(u'e') choice_0() if not self.err: return @@ -1552,7 +2095,7 @@ self.err = False self.pos = p def choice_1(): - self._expect('E') + self._expect(u'E') choice_1() group() if self.err: @@ -1573,7 +2116,7 @@ if self.err: self._pop('exp_1') return - self.val = 'e' + ''.join(self._get('ds')) + self.val = u'e' + self._join(u'', self._get('ds')) self.err = None self._pop('exp_1') choice_1() @@ -1585,7 +2128,7 @@ self.pos += 1 else: self.val = None - self.err = "anything" + self.err = u'anything' def _digit_(self): if self.pos < self.end and self.msg[self.pos].isdigit(): @@ -1594,7 +2137,7 @@ self.pos += 1 else: self.val = None - self.err = "a digit" + self.err = u'a digit' return def _end_(self): @@ -1603,15 +2146,21 @@ self.err = None else: self.val = None - self.err = "the end" + self.err = u'the end' return - def _letter_(self): - if self.pos < self.end and self.msg[self.pos].isalpha(): - self.val = self.msg[self.pos] + def _is_unicat(self, var, cat): + import unicodedata + if unicodedata.category(var) == cat: + 
self.val = True self.err = None self.pos += 1 else: - self.val = None - self.err = "a letter" - return + self.val = False + self.err = u'unicode cat %s' % cat + + def _join(self, s, vs): + return s.join(vs) + + def _xtou(self, s): + return unichr(int(s, base=16))
diff --git a/json5/tests/json5_test.py b/json5/tests/json5_test.py index b349151..e201bc7 100644 --- a/json5/tests/json5_test.py +++ b/json5/tests/json5_test.py
@@ -20,6 +20,8 @@ class Tests(unittest.TestCase): + maxDiff = None + def check(self, s, obj): self.assertEqual(json5.loads(s), obj) @@ -62,28 +64,28 @@ with open(path) as fp: obj = json5.load(fp) self.assertEqual({ - "oh": [ - "we shouldn't forget", - "arrays can have", - "trailing commas too", + u'oh': [ + u"we shouldn't forget", + u"arrays can have", + u"trailing commas too", ], - "this": "is a \nmulti-line string", - "delta": 10, - "hex": 3735928559, - "finally": "a trailing comma", - "here": "is another", - "to": float("inf"), - "while": True, - "half": 0.5, - "foo": "bar" + u"this": u"is a multi-line string", + u"delta": 10, + u"hex": 3735928559, + u"finally": "a trailing comma", + u"here": "is another", + u"to": float("inf"), + u"while": True, + u"half": 0.5, + u"foo": u"bar" }, obj) def test_strings(self): self.check('"foo"', 'foo') self.check("'foo'", 'foo') self.check("'\x66oo'", 'foo') - self.check('"foo\\\nbar"', 'foo\nbar') - self.check("'foo\\\nbar'", 'foo\nbar') + self.check('"foo\\\nbar"', 'foobar') + self.check("'foo\\\nbar'", 'foobar') self.assertRaises(Exception, self.check, '"\n', None) self.assertRaises(Exception, self.check, "'\n", None)