blob: bf187dc4fc57d8c9a6c58e901a6b16d4126f015b [file] [log] [blame]
# Bookkeeping shared by every reader variant; each concrete reader struct
# embeds it as its `base` member.
cdef struct ReaderUCS:
    # Units still available to read; the reader is "good" while this is > 0.
    Py_ssize_t remaining
    # Units consumed so far; incremented by one on every read.
    Py_ssize_t position
    # NOTE(review): not referenced by the reader helpers in this section;
    # presumably a nesting-depth limit used by the decoder -- confirm.
    Py_ssize_t maxdepth
# Reader over a buffer of Py_UCS1 units.
cdef struct ReaderUCS1:
    # Shared counters (remaining / position / maxdepth).
    ReaderUCS base
    # Points at the next unread unit; advanced by one on every read.
    const Py_UCS1 *string
# Reader over a buffer of Py_UCS2 units.
cdef struct ReaderUCS2:
    # Shared counters (remaining / position / maxdepth).
    ReaderUCS base
    # Points at the next unread unit; advanced by one on every read.
    const Py_UCS2 *string
# Reader over a buffer of Py_UCS4 units.
cdef struct ReaderUCS4:
    # Shared counters (remaining / position / maxdepth).
    ReaderUCS base
    # Points at the next unread unit; advanced by one on every read.
    const Py_UCS4 *string
# Reader over a buffer of single-byte (Py_UCS1) units. Same layout as
# ReaderUCS1, but declared as a distinct type.
# NOTE(review): presumably the buffer holds UTF-8 bytes that are decoded with
# _reader_utf8_get -- confirm against the call sites.
cdef struct ReaderUTF8:
    # Shared counters (remaining / position / maxdepth).
    ReaderUCS base
    # Points at the next unread byte; advanced by one on every read.
    const Py_UCS1 *string
# Reference aliases for the reader structs. The reader helpers take their
# `self` argument through these, so the pointer and counter updates they make
# are applied to the caller's reader object.
ctypedef ReaderUCS1 &ReaderUCS1Ref
ctypedef ReaderUCS2 &ReaderUCS2Ref
ctypedef ReaderUCS4 &ReaderUCS4Ref
ctypedef ReaderUTF8 &ReaderUTF8Ref
# Pointer aliases for (non-const) buffers of each unit width; these are the
# members of the UCSString fused type.
ctypedef Py_UCS1 *UCS1String
ctypedef Py_UCS2 *UCS2String
ctypedef Py_UCS4 *UCS4String
# Fused type covering a reference to any of the four reader structs, so that
# a helper written once is specialized for each reader variant.
ctypedef fused ReaderUCSRef:
    ReaderUCS1Ref
    ReaderUCS2Ref
    ReaderUCS4Ref
    ReaderUTF8Ref
# Fused type covering a pointer to a buffer of any of the three unit widths.
# NOTE(review): not used by the reader helpers in this section; presumably
# consumed elsewhere in the module -- confirm.
ctypedef fused UCSString:
    UCS1String
    UCS2String
    UCS4String
cdef inline int32_t _reader_ucs_good(ReaderUCSRef self):
    # Tell whether the reader still has input: nonzero while at least one
    # unit is left (base.remaining is positive), zero otherwise.
    return 0 < self.base.remaining
cdef inline uint32_t _reader_ucs_get(ReaderUCSRef self):
    # Consume one unit from the reader and return its value as uint32_t.
    #
    # The unit under the cursor is fetched first; afterwards the cursor moves
    # forward by one unit, `position` grows by one and `remaining` shrinks by
    # one. This function does not itself check `remaining` before reading.
    cdef int32_t unit = self.string[0]

    self.base.position += 1
    self.base.remaining -= 1
    self.string += 1

    return cast_to_uint32(unit)
cdef inline uint32_t _reader_utf8_get(ReaderUCSRef self):
    # Read one UTF-8 encoded code point from the reader, one unit at a time.
    #
    # The lead unit decides how many continuation units follow:
    #   0xxxxxxx -> none (ASCII), returned as is
    #   110xxxxx -> 1,  1110xxxx -> 2,  11110xxx -> 3
    # A stray continuation unit (10xxxxxx) or a lead unit announcing more
    # than four bytes is returned unchanged. If the input runs out in the
    # middle of a sequence, the bits gathered so far are returned.
    # Continuation units are not checked for the 10xxxxxx prefix; only their
    # low six bits are used.
    cdef uint32_t acc = _reader_ucs_get(self)
    cdef unsigned int pending

    if (acc & 0x80) == 0x00:  # ASCII
        return acc
    if (acc & 0xC0) == 0x80:  # continuation unit without a lead unit
        return acc

    if (acc & 0xE0) == 0xC0:  # 2-byte sequence
        pending = 1
        acc &= 0x1F
    elif (acc & 0xF0) == 0xE0:  # 3-byte sequence
        pending = 2
        acc &= 0x0F
    elif (acc & 0xF8) == 0xF0:  # 4-byte sequence
        pending = 3
        acc &= 0x07
    else:  # 5+ byte lead, invalid
        return acc

    # Fold in six payload bits per continuation unit.
    while pending > 0:
        if not _reader_ucs_good(self):
            return acc
        acc = (acc << 6) | (_reader_ucs_get(self) & 0x3F)
        pending -= 1

    return acc