blob: 8f7a0e8f17699588ddb41f8469dad8b5a0dd2bda [file] [log] [blame]
/* Copyright 2016 The encode_unicode Developers
*
* Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
* http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
* http://opensource.org/licenses/MIT>, at your option. This file may not be
* copied, modified, or distributed except according to those terms.
*/
use traits::CharExt;
use utf16_char::Utf16Char;
use errors::EmptyStrError;
extern crate core;
use self::core::fmt;
use self::core::borrow::Borrow;
// Invalid values that says the field is consumed or empty.
const FIRST_USED: u16 = 0x_dc_00;
const SECOND_USED: u16 = 0;
/// Iterate over the units of the UTF-16 representation of a codepoint.
#[derive(Clone)]
pub struct Utf16Iterator {
first: u16,
second: u16,
}
impl From<char> for Utf16Iterator {
fn from(c: char) -> Self {
let (first, second) = c.to_utf16_tuple();
Utf16Iterator{ first: first, second: second.unwrap_or(SECOND_USED) }
}
}
impl From<Utf16Char> for Utf16Iterator {
fn from(uc: Utf16Char) -> Self {
let (first, second) = uc.to_tuple();
Utf16Iterator{ first: first, second: second.unwrap_or(SECOND_USED) }
}
}
impl Iterator for Utf16Iterator {
type Item=u16;
fn next(&mut self) -> Option<u16> {
match (self.first, self.second) {
(FIRST_USED, SECOND_USED) => { None },
(FIRST_USED, second ) => {self.second = SECOND_USED; Some(second)},
(first , _ ) => {self.first = FIRST_USED; Some(first )},
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
(self.len(), Some(self.len()))
}
}
impl ExactSizeIterator for Utf16Iterator {
fn len(&self) -> usize {
(if self.first == FIRST_USED {0} else {1}) +
(if self.second == SECOND_USED {0} else {1})
}
}
impl fmt::Debug for Utf16Iterator {
fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
let mut clone = self.clone();
match (clone.next(), clone.next()) {
(Some(one), None) => write!(fmtr, "[{}]", one),
(Some(a), Some(b)) => write!(fmtr, "[{}, {}]", a, b),
(None, _) => write!(fmtr, "[]"),
}
}
}
/// Converts an iterator of `Utf16Char` (or `&Utf16Char`)
/// to an iterator of `u16`s.
/// Is equivalent to calling `.flat_map()` on the original iterator,
/// but the returned iterator is about twice as fast.
///
/// The exact number of units cannot be known in advance, but `size_hint()`
/// gives the possible range.
///
/// # Examples
///
/// From iterator of values:
///
/// ```
/// use encode_unicode::{iter_units, CharExt};
///
/// let iterator = "foo".chars().map(|c| c.to_utf16() );
/// let mut units = [0; 4];
/// for (u,dst) in iter_units(iterator).zip(&mut units) {*dst=u;}
/// assert_eq!(units, ['f' as u16, 'o' as u16, 'o' as u16, 0]);
/// ```
///
/// From iterator of references:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{iter_units, CharExt, Utf16Char};
///
/// // (💣 takes two units)
/// let chars: Vec<Utf16Char> = "💣 bomb 💣".chars().map(|c| c.to_utf16() ).collect();
/// let units: Vec<u16> = iter_units(&chars).collect();
/// let flat_map: Vec<u16> = chars.iter().flat_map(|u16c| *u16c ).collect();
/// assert_eq!(units, flat_map);
/// ```
pub fn iter_units<U:Borrow<Utf16Char>, I:IntoIterator<Item=U>>
(iterable: I) -> Utf16CharSplitter<U, I::IntoIter> {
Utf16CharSplitter{ inner: iterable.into_iter(), prev_second: 0 }
}
/// The iterator type returned by `iter_units()`
#[derive(Clone)]
pub struct Utf16CharSplitter<U:Borrow<Utf16Char>, I:Iterator<Item=U>> {
inner: I,
prev_second: u16,
}
impl<I:Iterator<Item=Utf16Char>> From<I> for Utf16CharSplitter<Utf16Char,I> {
/// A less generic constructor than `iter_units()`
fn from(iter: I) -> Self {
iter_units(iter)
}
}
impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Utf16CharSplitter<U,I> {
/// Extracts the source iterator.
///
/// Note that `iter_units(iter.into_inner())` is not a no-op:
/// If the last returned unit from `next()` was a leading surrogate,
/// the trailing surrogate is lost.
pub fn into_inner(self) -> I {
self.inner
}
}
impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Iterator for Utf16CharSplitter<U,I> {
type Item = u16;
fn next(&mut self) -> Option<Self::Item> {
if self.prev_second == 0 {
self.inner.next().map(|u16c| {
let (first, second) = u16c.borrow().to_tuple();
self.prev_second = second.unwrap_or(0);
first
})
} else {
let prev_second = self.prev_second;
self.prev_second = 0;
Some(prev_second)
}
}
fn size_hint(&self) -> (usize,Option<usize>) {
// Doesn't need to handle unlikely overflows correctly because
// size_hint() cannot be relied upon anyway. (the trait isn't unsafe)
let (min, max) = self.inner.size_hint();
let add = if self.prev_second == 0 {0} else {1};
(min.wrapping_add(add), max.map(|max| max.wrapping_mul(2).wrapping_add(add) ))
}
}
/// An iterator over the codepoints in a `str` represented as `Utf16Char`.
#[derive(Clone)]
pub struct Utf16CharIndices<'a>{
str: &'a str,
index: usize,
}
impl<'a> From<&'a str> for Utf16CharIndices<'a> {
fn from(s: &str) -> Utf16CharIndices {
Utf16CharIndices{str: s, index: 0}
}
}
impl<'a> Utf16CharIndices<'a> {
/// Extract the remainder of the source `str`.
///
/// # Examples
///
/// ```
/// use encode_unicode::{StrExt, Utf16Char};
/// let mut iter = "abc".utf16char_indices();
/// assert_eq!(iter.next_back(), Some((2, Utf16Char::from('c'))));
/// assert_eq!(iter.next(), Some((0, Utf16Char::from('a'))));
/// assert_eq!(iter.as_str(), "b");
/// ```
pub fn as_str(&self) -> &'a str {
&self.str[self.index..]
}
}
impl<'a> Iterator for Utf16CharIndices<'a> {
type Item = (usize,Utf16Char);
fn next(&mut self) -> Option<(usize,Utf16Char)> {
match Utf16Char::from_str_start(&self.str[self.index..]) {
Ok((u16c, bytes)) => {
let item = (self.index, u16c);
self.index += bytes;
Some(item)
},
Err(EmptyStrError) => None
}
}
fn size_hint(&self) -> (usize,Option<usize>) {
let len = self.str.len() - self.index;
// For len+3 to overflow, the slice must fill all but two bytes of
// addressable memory, and size_hint() doesn't need to be correct.
(len.wrapping_add(3)/4, Some(len))
}
}
impl<'a> DoubleEndedIterator for Utf16CharIndices<'a> {
fn next_back(&mut self) -> Option<(usize,Utf16Char)> {
if self.index < self.str.len() {
let rev = self.str.bytes().rev();
let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count();
let starts = self.str.len() - len;
let (u16c,_) = Utf16Char::from_str_start(&self.str[starts..]).unwrap();
self.str = &self.str[..starts];
Some((starts, u16c))
} else {
None
}
}
}
impl<'a> fmt::Debug for Utf16CharIndices<'a> {
fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
fmtr.debug_tuple("Utf16CharIndices")
.field(&self.index)
.field(&self.as_str())
.finish()
}
}
/// An iterator over the codepoints in a `str` represented as `Utf16Char`.
#[derive(Clone)]
pub struct Utf16Chars<'a>(Utf16CharIndices<'a>);
impl<'a> From<&'a str> for Utf16Chars<'a> {
fn from(s: &str) -> Utf16Chars {
Utf16Chars(Utf16CharIndices::from(s))
}
}
impl<'a> Utf16Chars<'a> {
/// Extract the remainder of the source `str`.
///
/// # Examples
///
/// ```
/// use encode_unicode::{StrExt, Utf16Char};
/// let mut iter = "abc".utf16chars();
/// assert_eq!(iter.next(), Some(Utf16Char::from('a')));
/// assert_eq!(iter.next_back(), Some(Utf16Char::from('c')));
/// assert_eq!(iter.as_str(), "b");
/// ```
pub fn as_str(&self) -> &'a str {
self.0.as_str()
}
}
impl<'a> Iterator for Utf16Chars<'a> {
type Item = Utf16Char;
fn next(&mut self) -> Option<Utf16Char> {
self.0.next().map(|(_,u16c)| u16c )
}
fn size_hint(&self) -> (usize,Option<usize>) {
self.0.size_hint()
}
}
impl<'a> DoubleEndedIterator for Utf16Chars<'a> {
fn next_back(&mut self) -> Option<Utf16Char> {
self.0.next_back().map(|(_,u16c)| u16c )
}
}
impl<'a> fmt::Debug for Utf16Chars<'a> {
fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
fmtr.debug_tuple("Utf16Chars")
.field(&self.as_str())
.finish()
}
}