// blob: 2acafe70fe395dda46d3943cbb0389705535837d [file] [log] [blame]
//! UTF-8 Parse Transition Table
/// Transition table for parsing UTF-8. This is built from the grammar described
/// at https://tools.ietf.org/html/rfc3629#section-4, which is copied and
/// formatted below.
///
/// The table holds eight rows of 256 `u8` entries, and eight states are
/// defined below, so presumably there is one row per state and one entry per
/// possible input byte. How each `(State, Action)` pair is packed into a `u8`
/// is decided by the `utf8_state_table!` macro, which is defined elsewhere.
///
/// # UTF-8 Grammar
///
/// ```ignore
/// UTF8-octets = *( UTF8-char )
/// UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
/// UTF8-1      = %x00-7F
/// UTF8-2      = %xC2-DF UTF8-tail
/// UTF8-3      = %xE0 %xA0-BF UTF8-tail /
///               %xE1-EC 2( UTF8-tail ) /
///               %xED %x80-9F UTF8-tail /
///               %xEE-EF 2( UTF8-tail )
/// UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
///               %xF1-F3 3( UTF8-tail ) /
///               %xF4 %x80-8F 2( UTF8-tail )
/// UTF8-tail   = %x80-BF
/// ```
///
/// Not specifying an action in this table is equivalent to specifying
/// `Action::InvalidSequence`. Not specifying a state is equivalent to
/// specifying `State::Ground`.
pub static TRANSITIONS: [[u8; 256]; 8] = utf8_state_table! {
// Start of a character: dispatch on the lead byte. Lead bytes that the grammar
// never allows (0x80-0xc1 and 0xf5-0xff) are intentionally absent, so they
// take the defaults described in the doc comment above.
// NOTE(review): the action names suggest which byte of the sequence is being
// recorded; their implementation is not visible here -- confirm against the
// `Action` definition.
State::Ground => {
// UTF8-1: a single-byte (ASCII) character, complete on its own.
0x00...0x7f => (State::Ground, Action::EmitByte),
// UTF8-2: lead byte followed by exactly one tail byte.
0xc2...0xdf => (State::Tail1, Action::SetByte2Top),
// UTF8-3: 0xe0 and 0xed restrict their second byte, so each gets its own
// state; the other three-byte leads accept any two tail bytes.
0xe0 => (State::U3_2_e0, Action::SetByte3Top),
0xe1...0xec => (State::Tail2, Action::SetByte3Top),
0xed => (State::U3_2_ed, Action::SetByte3Top),
0xee...0xef => (State::Tail2, Action::SetByte3Top),
// UTF8-4: 0xf0 and 0xf4 restrict their second byte, so each gets its own
// state; 0xf1-0xf3 accept any three tail bytes.
0xf0 => (State::Utf8_4_3_f0, Action::SetByte4),
0xf1...0xf3 => (State::Tail3, Action::SetByte4),
0xf4 => (State::Utf8_4_3_f4, Action::SetByte4),
},
// Second byte after 0xe0: only 0xa0-0xbf is allowed (the grammar excludes
// 0x80-0x9f, which would be an overlong encoding).
State::U3_2_e0 => {
0xa0...0xbf => (State::Tail1, Action::SetByte2),
},
// Second byte after 0xed: only 0x80-0x9f is allowed (the grammar excludes
// 0xa0-0xbf, which would encode a UTF-16 surrogate).
State::U3_2_ed => {
0x80...0x9f => (State::Tail1, Action::SetByte2),
},
// Second byte after 0xf0: only 0x90-0xbf is allowed (the grammar excludes
// 0x80-0x8f, which would be an overlong encoding).
State::Utf8_4_3_f0 => {
0x90...0xbf => (State::Tail2, Action::SetByte3),
},
// Second byte after 0xf4: only 0x80-0x8f is allowed (anything higher would
// encode a value above U+10FFFF).
State::Utf8_4_3_f4 => {
0x80...0x8f => (State::Tail2, Action::SetByte3),
},
// Generic tail states: TailN means N tail bytes (0x80-0xbf) are still
// expected. Each accepted byte steps Tail3 -> Tail2 -> Tail1 -> Ground.
State::Tail3 => {
0x80...0xbf => (State::Tail2, Action::SetByte3),
},
State::Tail2 => {
0x80...0xbf => (State::Tail1, Action::SetByte2),
},
// Final tail byte: the sequence is complete and parsing returns to Ground.
State::Tail1 => {
0x80...0xbf => (State::Ground, Action::SetByte1),
},
};