| use super::*; |
| use unicode_width::UnicodeWidthChar; |
| |
| #[cfg(test)] |
| mod tests; |
| |
| /// Finds all newlines, multi-byte characters, and non-narrow characters in a |
| /// SourceFile. |
| /// |
| /// This function will use an SSE2 enhanced implementation if hardware support |
| /// is detected at runtime. |
| pub fn analyze_source_file( |
| src: &str, |
| ) -> (Vec<RelativeBytePos>, Vec<MultiByteChar>, Vec<NonNarrowChar>) { |
| let mut lines = vec![RelativeBytePos::from_u32(0)]; |
| let mut multi_byte_chars = vec![]; |
| let mut non_narrow_chars = vec![]; |
| |
| // Calls the right implementation, depending on hardware support available. |
| analyze_source_file_dispatch(src, &mut lines, &mut multi_byte_chars, &mut non_narrow_chars); |
| |
| // The code above optimistically registers a new line *after* each \n |
| // it encounters. If that point is already outside the source_file, remove |
| // it again. |
| if let Some(&last_line_start) = lines.last() { |
| let source_file_end = RelativeBytePos::from_usize(src.len()); |
| assert!(source_file_end >= last_line_start); |
| if last_line_start == source_file_end { |
| lines.pop(); |
| } |
| } |
| |
| (lines, multi_byte_chars, non_narrow_chars) |
| } |
| |
| cfg_match! { |
| cfg(any(target_arch = "x86", target_arch = "x86_64")) => { |
| fn analyze_source_file_dispatch( |
| src: &str, |
| lines: &mut Vec<RelativeBytePos>, |
| multi_byte_chars: &mut Vec<MultiByteChar>, |
| non_narrow_chars: &mut Vec<NonNarrowChar>, |
| ) { |
| if is_x86_feature_detected!("sse2") { |
| unsafe { |
| analyze_source_file_sse2(src, lines, multi_byte_chars, non_narrow_chars); |
| } |
| } else { |
| analyze_source_file_generic( |
| src, |
| src.len(), |
| RelativeBytePos::from_u32(0), |
| lines, |
| multi_byte_chars, |
| non_narrow_chars, |
| ); |
| } |
| } |
| |
| /// Checks 16 byte chunks of text at a time. If the chunk contains |
| /// something other than printable ASCII characters and newlines, the |
| /// function falls back to the generic implementation. Otherwise it uses |
| /// SSE2 intrinsics to quickly find all newlines. |
| #[target_feature(enable = "sse2")] |
| unsafe fn analyze_source_file_sse2( |
| src: &str, |
| lines: &mut Vec<RelativeBytePos>, |
| multi_byte_chars: &mut Vec<MultiByteChar>, |
| non_narrow_chars: &mut Vec<NonNarrowChar>, |
| ) { |
| #[cfg(target_arch = "x86")] |
| use std::arch::x86::*; |
| #[cfg(target_arch = "x86_64")] |
| use std::arch::x86_64::*; |
| |
| const CHUNK_SIZE: usize = 16; |
| |
| let src_bytes = src.as_bytes(); |
| |
| let chunk_count = src.len() / CHUNK_SIZE; |
| |
| // This variable keeps track of where we should start decoding a |
| // chunk. If a multi-byte character spans across chunk boundaries, |
| // we need to skip that part in the next chunk because we already |
| // handled it. |
| let mut intra_chunk_offset = 0; |
| |
| for chunk_index in 0..chunk_count { |
| let ptr = src_bytes.as_ptr() as *const __m128i; |
| // We don't know if the pointer is aligned to 16 bytes, so we |
| // use `loadu`, which supports unaligned loading. |
| let chunk = unsafe { _mm_loadu_si128(ptr.add(chunk_index)) }; |
| |
| // For character in the chunk, see if its byte value is < 0, which |
| // indicates that it's part of a UTF-8 char. |
| let multibyte_test = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(0)) }; |
| // Create a bit mask from the comparison results. |
| let multibyte_mask = unsafe { _mm_movemask_epi8(multibyte_test) }; |
| |
| // If the bit mask is all zero, we only have ASCII chars here: |
| if multibyte_mask == 0 { |
| assert!(intra_chunk_offset == 0); |
| |
| // Check if there are any control characters in the chunk. All |
| // control characters that we can encounter at this point have a |
| // byte value less than 32 or ... |
| let control_char_test0 = unsafe { _mm_cmplt_epi8(chunk, _mm_set1_epi8(32)) }; |
| let control_char_mask0 = unsafe { _mm_movemask_epi8(control_char_test0) }; |
| |
| // ... it's the ASCII 'DEL' character with a value of 127. |
| let control_char_test1 = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(127)) }; |
| let control_char_mask1 = unsafe { _mm_movemask_epi8(control_char_test1) }; |
| |
| let control_char_mask = control_char_mask0 | control_char_mask1; |
| |
| if control_char_mask != 0 { |
| // Check for newlines in the chunk |
| let newlines_test = unsafe { _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\n' as i8)) }; |
| let newlines_mask = unsafe { _mm_movemask_epi8(newlines_test) }; |
| |
| if control_char_mask == newlines_mask { |
| // All control characters are newlines, record them |
| let mut newlines_mask = 0xFFFF0000 | newlines_mask as u32; |
| let output_offset = RelativeBytePos::from_usize(chunk_index * CHUNK_SIZE + 1); |
| |
| loop { |
| let index = newlines_mask.trailing_zeros(); |
| |
| if index >= CHUNK_SIZE as u32 { |
| // We have arrived at the end of the chunk. |
| break; |
| } |
| |
| lines.push(RelativeBytePos(index) + output_offset); |
| |
| // Clear the bit, so we can find the next one. |
| newlines_mask &= (!1) << index; |
| } |
| |
| // We are done for this chunk. All control characters were |
| // newlines and we took care of those. |
| continue; |
| } else { |
| // Some of the control characters are not newlines, |
| // fall through to the slow path below. |
| } |
| } else { |
| // No control characters, nothing to record for this chunk |
| continue; |
| } |
| } |
| |
| // The slow path. |
| // There are control chars in here, fallback to generic decoding. |
| let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset; |
| intra_chunk_offset = analyze_source_file_generic( |
| &src[scan_start..], |
| CHUNK_SIZE - intra_chunk_offset, |
| RelativeBytePos::from_usize(scan_start), |
| lines, |
| multi_byte_chars, |
| non_narrow_chars, |
| ); |
| } |
| |
| // There might still be a tail left to analyze |
| let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset; |
| if tail_start < src.len() { |
| analyze_source_file_generic( |
| &src[tail_start..], |
| src.len() - tail_start, |
| RelativeBytePos::from_usize(tail_start), |
| lines, |
| multi_byte_chars, |
| non_narrow_chars, |
| ); |
| } |
| } |
| } |
| _ => { |
| // The target (or compiler version) does not support SSE2 ... |
| fn analyze_source_file_dispatch( |
| src: &str, |
| lines: &mut Vec<RelativeBytePos>, |
| multi_byte_chars: &mut Vec<MultiByteChar>, |
| non_narrow_chars: &mut Vec<NonNarrowChar>, |
| ) { |
| analyze_source_file_generic( |
| src, |
| src.len(), |
| RelativeBytePos::from_u32(0), |
| lines, |
| multi_byte_chars, |
| non_narrow_chars, |
| ); |
| } |
| } |
| } |
| // `scan_len` determines the number of bytes in `src` to scan. Note that the |
| // function can read past `scan_len` if a multi-byte character start within the |
| // range but extends past it. The overflow is returned by the function. |
| fn analyze_source_file_generic( |
| src: &str, |
| scan_len: usize, |
| output_offset: RelativeBytePos, |
| lines: &mut Vec<RelativeBytePos>, |
| multi_byte_chars: &mut Vec<MultiByteChar>, |
| non_narrow_chars: &mut Vec<NonNarrowChar>, |
| ) -> usize { |
| assert!(src.len() >= scan_len); |
| let mut i = 0; |
| let src_bytes = src.as_bytes(); |
| |
| while i < scan_len { |
| let byte = unsafe { |
| // We verified that i < scan_len <= src.len() |
| *src_bytes.get_unchecked(i) |
| }; |
| |
| // How much to advance in order to get to the next UTF-8 char in the |
| // string. |
| let mut char_len = 1; |
| |
| if byte < 32 { |
| // This is an ASCII control character, it could be one of the cases |
| // that are interesting to us. |
| |
| let pos = RelativeBytePos::from_usize(i) + output_offset; |
| |
| match byte { |
| b'\n' => { |
| lines.push(pos + RelativeBytePos(1)); |
| } |
| b'\t' => { |
| non_narrow_chars.push(NonNarrowChar::Tab(pos)); |
| } |
| _ => { |
| non_narrow_chars.push(NonNarrowChar::ZeroWidth(pos)); |
| } |
| } |
| } else if byte >= 127 { |
| // The slow path: |
| // This is either ASCII control character "DEL" or the beginning of |
| // a multibyte char. Just decode to `char`. |
| let c = src[i..].chars().next().unwrap(); |
| char_len = c.len_utf8(); |
| |
| let pos = RelativeBytePos::from_usize(i) + output_offset; |
| |
| if char_len > 1 { |
| assert!((2..=4).contains(&char_len)); |
| let mbc = MultiByteChar { pos, bytes: char_len as u8 }; |
| multi_byte_chars.push(mbc); |
| } |
| |
| // Assume control characters are zero width. |
| // FIXME: How can we decide between `width` and `width_cjk`? |
| let char_width = UnicodeWidthChar::width(c).unwrap_or(0); |
| |
| if char_width != 1 { |
| non_narrow_chars.push(NonNarrowChar::new(pos, char_width)); |
| } |
| } |
| |
| i += char_len; |
| } |
| |
| i - scan_len |
| } |