| /* Copyright 2016 The encode_unicode Developers |
| * |
| * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or |
| * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or |
| * http://opensource.org/licenses/MIT>, at your option. This file may not be |
| * copied, modified, or distributed except according to those terms. |
| */ |
| |
| use utf8_char::Utf8Char; |
| use errors::EmptyStrError; |
| extern crate core; |
| use self::core::{mem, u32, u64}; |
| use self::core::ops::Not; |
| use self::core::fmt; |
| use self::core::borrow::Borrow; |
| #[cfg(feature="std")] |
| use std::io::{Read, Error as ioError}; |
| |
| |
| |
/// Iterator over the bytes of the UTF-8 representation of a single codepoint.
///
/// The bytes can be consumed through `Iterator`, and also through `Read`
/// when the `std` feature is enabled.
// The bytes not yet returned are packed into the `u32` with the next byte in
// the lowest eight bits; every slot that holds no byte is set to `0xff`.
#[derive(Clone)]
pub struct Utf8Iterator(u32);
| |
| impl From<Utf8Char> for Utf8Iterator { |
| fn from(uc: Utf8Char) -> Self { |
| let used = u32::from_le(unsafe{ mem::transmute(uc.to_array().0) }); |
| // uses u64 because shifting an u32 by 32 bits is a no-op. |
| let unused_set = (u64::MAX << uc.len() as u64*8) as u32; |
| Utf8Iterator(used | unused_set) |
| } |
| } |
| impl From<char> for Utf8Iterator { |
| fn from(c: char) -> Self { |
| Self::from(Utf8Char::from(c)) |
| } |
| } |
| impl Iterator for Utf8Iterator { |
| type Item=u8; |
| fn next(&mut self) -> Option<u8> { |
| let next = self.0 as u8; |
| if next == 0xff { |
| None |
| } else { |
| self.0 = (self.0 >> 8) | 0xff_00_00_00; |
| Some(next) |
| } |
| } |
| fn size_hint(&self) -> (usize, Option<usize>) { |
| (self.len(), Some(self.len())) |
| } |
| } |
| impl ExactSizeIterator for Utf8Iterator { |
| fn len(&self) -> usize {// not straightforward, but possible |
| let unused_bytes = self.0.not().leading_zeros() / 8; |
| 4 - unused_bytes as usize |
| } |
| } |
#[cfg(feature="std")]
impl Read for Utf8Iterator {
    /// Copies as many of the remaining bytes as fit into `buf` and returns
    /// how many were copied.
    ///
    /// Always returns `Ok`.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, ioError> {
        let mut copied = 0;
        // Walk the destination slots first: self.next() must not be called
        // before it is known that its result can be written somewhere.
        for dst in buf.iter_mut() {
            match self.next() {
                Some(b) => {
                    *dst = b;
                    copied += 1;
                }
                None => break,
            }
        }
        Ok(copied)
    }
}
| impl fmt::Debug for Utf8Iterator { |
| fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { |
| let mut content = [0; 4]; |
| let mut i = 0; |
| for b in self.clone() { |
| content[i] = b; |
| i += 1; |
| } |
| write!(fmtr, "{:?}", &content[..i]) |
| } |
| } |
| |
| |
| |
| /// Converts an iterator of `Utf8Char` (or `&Utf8Char`) |
| /// to an iterator of `u8`s. |
| /// Is equivalent to calling `.flat_map()` on the original iterator, |
| /// but the returned iterator is ~40% faster. |
| /// |
| /// The iterator also implements `Read` (if the `std` feature isn't disabled). |
| /// Reading will never produce an error, and calls to `.read()` and `.next()` |
| /// can be mixed. |
| /// |
| /// The exact number of bytes cannot be known in advance, but `size_hint()` |
| /// gives the possible range. |
| /// (min: all remaining characters are ASCII, max: all require four bytes) |
| /// |
| /// # Examples |
| /// |
| /// From iterator of values: |
| /// |
| /// ``` |
| /// use encode_unicode::{iter_bytes, CharExt}; |
| /// |
| /// let iterator = "foo".chars().map(|c| c.to_utf8() ); |
| /// let mut bytes = [0; 4]; |
| /// for (u,dst) in iter_bytes(iterator).zip(&mut bytes) {*dst=u;} |
| /// assert_eq!(&bytes, b"foo\0"); |
| /// ``` |
| /// |
| /// From iterator of references: |
| /// |
| #[cfg_attr(feature="std", doc=" ```")] |
| #[cfg_attr(not(feature="std"), doc=" ```no_compile")] |
| /// use encode_unicode::{iter_bytes, CharExt, Utf8Char}; |
| /// |
| /// let chars: Vec<Utf8Char> = "💣 bomb 💣".chars().map(|c| c.to_utf8() ).collect(); |
| /// let bytes: Vec<u8> = iter_bytes(&chars).collect(); |
| /// let flat_map: Vec<u8> = chars.iter().flat_map(|u8c| *u8c ).collect(); |
| /// assert_eq!(bytes, flat_map); |
| /// ``` |
| /// |
| /// `Read`ing from it: |
| /// |
| #[cfg_attr(feature="std", doc=" ```")] |
| #[cfg_attr(not(feature="std"), doc=" ```no_compile")] |
| /// use encode_unicode::{iter_bytes, CharExt}; |
| /// use std::io::Read; |
| /// |
| /// let s = "Ååh‽"; |
| /// assert_eq!(s.len(), 8); |
| /// let mut buf = [b'E'; 9]; |
| /// let mut reader = iter_bytes(s.chars().map(|c| c.to_utf8() )); |
| /// assert_eq!(reader.read(&mut buf[..]).unwrap(), 8); |
| /// assert_eq!(reader.read(&mut buf[..]).unwrap(), 0); |
| /// assert_eq!(&buf[..8], s.as_bytes()); |
| /// assert_eq!(buf[8], b'E'); |
| /// ``` |
| pub fn iter_bytes<U:Borrow<Utf8Char>, I:IntoIterator<Item=U>> |
| (iterable: I) -> Utf8CharSplitter<U, I::IntoIter> { |
| Utf8CharSplitter{ inner: iterable.into_iter(), prev: 0 } |
| } |
| |
| /// The iterator type returned by `iter_bytes()` |
| /// |
| /// See its documentation for details. |
| #[derive(Clone)] |
| pub struct Utf8CharSplitter<U:Borrow<Utf8Char>, I:Iterator<Item=U>> { |
| inner: I, |
| prev: u32, |
| } |
| impl<I:Iterator<Item=Utf8Char>> From<I> for Utf8CharSplitter<Utf8Char,I> { |
| /// A less generic constructor than `iter_bytes()` |
| fn from(iter: I) -> Self { |
| iter_bytes(iter) |
| } |
| } |
| impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Utf8CharSplitter<U,I> { |
| /// Extracts the source iterator. |
| /// |
| /// Note that `iter_bytes(iter.into_inner())` is not a no-op: |
| /// If the last returned byte from `next()` was not an ASCII by, |
| /// the remaining bytes of that codepoint is lost. |
| pub fn into_inner(self) -> I { |
| self.inner |
| } |
| } |
| impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Iterator for Utf8CharSplitter<U,I> { |
| type Item = u8; |
| fn next(&mut self) -> Option<Self::Item> { |
| if self.prev == 0 { |
| self.inner.next().map(|u8c| { |
| let array = u8c.borrow().to_array().0; |
| self.prev = unsafe{ u32::from_le(mem::transmute(array)) } >> 8; |
| array[0] |
| }) |
| } else { |
| let next = self.prev as u8; |
| self.prev >>= 8; |
| Some(next) |
| } |
| } |
| fn size_hint(&self) -> (usize,Option<usize>) { |
| // Doesn't need to handle unlikely overflows correctly because |
| // size_hint() cannot be relied upon anyway. (the trait isn't unsafe) |
| let (min, max) = self.inner.size_hint(); |
| let add = 4 - (self.prev.leading_zeros() / 8) as usize; |
| (min.wrapping_add(add), max.map(|max| max.wrapping_mul(4).wrapping_add(add) )) |
| } |
| } |
#[cfg(feature="std")]
impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Read for Utf8CharSplitter<U,I> {
    /// Fills `buf` with as many bytes as are available and fit, and returns
    /// how many were written.
    ///
    /// A codepoint that only partially fits is split: the bytes that did not
    /// fit are kept and returned by the next call to `read()` or `next()`.
    ///
    /// Always returns `Ok`
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, ioError> {
        let capacity = buf.len();
        let mut filled = 0;

        // First flush whatever is left of a previously split codepoint.
        while filled < capacity && self.prev != 0 {
            buf[filled] = self.prev as u8;
            self.prev >>= 8;
            filled += 1;
        }

        // Then copy codepoints for as long as there is room for a first byte.
        while filled < capacity {
            let bytes = match self.inner.next() {
                None => break,
                Some(u8c) => u8c.borrow().to_array().0,
            };
            buf[filled] = bytes[0];
            filled += 1;
            if bytes[1] == 0 {
                // Nothing follows the first byte.
                continue;
            }
            // The number of leading one-bits in the first byte of a
            // multi-byte sequence is the length of the sequence.
            let len = bytes[0].not().leading_zeros() as usize;
            for written in 1..len {
                if filled == capacity {
                    // Out of room: stash the bytes not yet written, with the
                    // next one to return in the lowest eight bits.
                    let packed = unsafe { u32::from_le(mem::transmute(bytes)) };
                    self.prev = packed >> (8 * written);
                    return Ok(filled);
                }
                buf[filled] = bytes[written];
                filled += 1;
            }
        }
        Ok(filled)
    }
}
| |
| |
| |
/// An iterator over the `Utf8Char`s of a string slice together with their
/// byte positions.
///
/// This struct is created by the `utf8char_indices()` method from the
/// [`StrExt`] trait. See its documentation for more.
#[derive(Clone)]
pub struct Utf8CharIndices<'a> {
    // The source slice; it is shortened from the back by `next_back()`.
    str: &'a str,
    // Byte offset into `str` of the next codepoint `next()` will return.
    index: usize,
}
| impl<'a> From<&'a str> for Utf8CharIndices<'a> { |
| fn from(s: &str) -> Utf8CharIndices { |
| Utf8CharIndices{str: s, index: 0} |
| } |
| } |
| impl<'a> Utf8CharIndices<'a> { |
| /// Extract the remainder of the source `str`. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use encode_unicode::{StrExt, Utf8Char}; |
| /// let mut iter = "abc".utf8char_indices(); |
| /// assert_eq!(iter.next_back(), Some((2, Utf8Char::from('c')))); |
| /// assert_eq!(iter.next(), Some((0, Utf8Char::from('a')))); |
| /// assert_eq!(iter.as_str(), "b"); |
| /// ``` |
| pub fn as_str(&self) -> &'a str { |
| &self.str[self.index..] |
| } |
| } |
| impl<'a> Iterator for Utf8CharIndices<'a> { |
| type Item = (usize,Utf8Char); |
| fn next(&mut self) -> Option<(usize,Utf8Char)> { |
| match Utf8Char::from_str_start(&self.str[self.index..]) { |
| Ok((u8c, len)) => { |
| let item = (self.index, u8c); |
| self.index += len; |
| Some(item) |
| }, |
| Err(EmptyStrError) => None |
| } |
| } |
| fn size_hint(&self) -> (usize,Option<usize>) { |
| let len = self.str.len() - self.index; |
| // For len+3 to overflow, the slice must fill all but two bytes of |
| // addressable memory, and size_hint() doesn't need to be correct. |
| (len.wrapping_add(3)/4, Some(len)) |
| } |
| } |
| impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> { |
| fn next_back(&mut self) -> Option<(usize,Utf8Char)> { |
| // Cannot refactor out the unwrap without switching to ::from_slice() |
| // since slicing the str panics if not on a boundary. |
| if self.index < self.str.len() { |
| let rev = self.str.bytes().rev(); |
| let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count(); |
| let starts = self.str.len() - len; |
| let (u8c,_) = Utf8Char::from_str_start(&self.str[starts..]).unwrap(); |
| self.str = &self.str[..starts]; |
| Some((starts, u8c)) |
| } else { |
| None |
| } |
| } |
| } |
| impl<'a> fmt::Debug for Utf8CharIndices<'a> { |
| fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { |
| fmtr.debug_tuple("Utf8CharIndices") |
| .field(&self.index) |
| .field(&self.as_str()) |
| .finish() |
| } |
| } |
| |
| |
| /// An iterator over the codepoints in a `str` represented as `Utf8Char`. |
| #[derive(Clone)] |
| pub struct Utf8Chars<'a>(Utf8CharIndices<'a>); |
| impl<'a> From<&'a str> for Utf8Chars<'a> { |
| fn from(s: &str) -> Utf8Chars { |
| Utf8Chars(Utf8CharIndices::from(s)) |
| } |
| } |
| impl<'a> Utf8Chars<'a> { |
| /// Extract the remainder of the source `str`. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// use encode_unicode::{StrExt, Utf8Char}; |
| /// let mut iter = "abc".utf8chars(); |
| /// assert_eq!(iter.next(), Some(Utf8Char::from('a'))); |
| /// assert_eq!(iter.next_back(), Some(Utf8Char::from('c'))); |
| /// assert_eq!(iter.as_str(), "b"); |
| /// ``` |
| pub fn as_str(&self) -> &'a str { |
| self.0.as_str() |
| } |
| } |
| impl<'a> Iterator for Utf8Chars<'a> { |
| type Item = Utf8Char; |
| fn next(&mut self) -> Option<Utf8Char> { |
| self.0.next().map(|(_,u8c)| u8c ) |
| } |
| fn size_hint(&self) -> (usize,Option<usize>) { |
| self.0.size_hint() |
| } |
| } |
| impl<'a> DoubleEndedIterator for Utf8Chars<'a> { |
| fn next_back(&mut self) -> Option<Utf8Char> { |
| self.0.next_back().map(|(_,u8c)| u8c ) |
| } |
| } |
| impl<'a> fmt::Debug for Utf8Chars<'a> { |
| fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { |
| fmtr.debug_tuple("Utf8CharIndices") |
| .field(&self.as_str()) |
| .finish() |
| } |
| } |