src/librustc_unicode/u_str.rs - third_party/rust - Git at Google

 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
 // file at the top-level directory of this distribution and at
 // http://rust-lang.org/COPYRIGHT.
 //
 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.

 //! Unicode-intensive string manipulations.
 //!
 //! This module provides functionality to `str` that requires the Unicode
 //! methods provided by the unicode parts of the CharExt trait.

 use core::char;
 use core::iter::Filter;
 use core::str::Split;

 /// An iterator over the non-whitespace substrings of a string,
 /// separated by any amount of whitespace.
 ///
 /// Values of this type are produced by `UnicodeStr::split_whitespace`.
 #[stable(feature = "split_whitespace", since = "1.1.0")]
 pub struct SplitWhitespace<'a> {
     // The string is split at every whitespace `char`, and the empty pieces
     // that show up between adjacent separators are filtered out. Both
     // callbacks are stored as plain `fn` pointers so that the complete
     // iterator type can be spelled out in this field.
     inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
 }

 /// Methods for Unicode string slices
 ///
 /// The short notes on each method describe the `str` implementation in this
 /// file; the user-facing documentation lives in libcollections.
 #[allow(missing_docs)] // docs in libcollections
 pub trait UnicodeStr {
     // Iterates over the pieces of `self` that lie between whitespace
     // characters, skipping empty pieces.
     fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>;
     // `true` when every `char` of `self` is whitespace (so also for "").
     fn is_whitespace(&self) -> bool;
     // `true` when every `char` of `self` is alphanumeric (so also for "").
     fn is_alphanumeric(&self) -> bool;
     // `self` with whitespace removed from both ends.
     fn trim(&self) -> &str;
     // `self` with whitespace removed from the start.
     fn trim_left(&self) -> &str;
     // `self` with whitespace removed from the end.
     fn trim_right(&self) -> &str;
 }

 impl UnicodeStr for str {
     /// Yields the maximal runs of non-whitespace characters in `self`.
     #[inline]
     fn split_whitespace(&self) -> SplitWhitespace {
         // Named functions rather than closures, so that they coerce to the
         // plain `fn` pointer types stored inside `SplitWhitespace`.
         fn splits_here(ch: char) -> bool {
             ch.is_whitespace()
         }
         fn has_content(piece: &&str) -> bool {
             !piece.is_empty()
         }
         let separator: fn(char) -> bool = splits_here;
         let keep: fn(&&str) -> bool = has_content;

         // Adjacent separators produce empty pieces; `keep` drops them.
         let pieces = self.split(separator).filter(keep);
         SplitWhitespace { inner: pieces }
     }

     /// Returns `true` if no character of `self` is non-whitespace.
     #[inline]
     fn is_whitespace(&self) -> bool {
         !self.chars().any(|ch| !ch.is_whitespace())
     }

     /// Returns `true` if no character of `self` is non-alphanumeric.
     #[inline]
     fn is_alphanumeric(&self) -> bool {
         !self.chars().any(|ch| !ch.is_alphanumeric())
     }

     /// Returns `self` without leading and trailing whitespace.
     #[inline]
     fn trim(&self) -> &str {
         let is_space = |ch: char| ch.is_whitespace();
         self.trim_matches(is_space)
     }

     /// Returns `self` without leading whitespace.
     #[inline]
     fn trim_left(&self) -> &str {
         let is_space = |ch: char| ch.is_whitespace();
         self.trim_left_matches(is_space)
     }

     /// Returns `self` without trailing whitespace.
     #[inline]
     fn trim_right(&self) -> &str {
         let is_space = |ch: char| ch.is_whitespace();
         self.trim_right_matches(is_space)
     }
 }

 // Width table for UTF-8 sequences, indexed by the first byte of the sequence.
 // A value of 0 marks a byte that cannot start a character.
 // https://tools.ietf.org/html/rfc3629
 static UTF8_CHAR_WIDTH: [u8; 256] = [
     // 0x00 - 0x7F: single-byte characters
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     // 0x80 - 0xBF: never a first byte
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     // 0xC0 - 0xDF: two-byte sequences (0xC0 and 0xC1 are excluded)
     0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     // 0xE0 - 0xEF: three-byte sequences
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
     // 0xF0 - 0xFF: four-byte sequences up to 0xF4, nothing above
     4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,
 ];

 /// Given a first byte, determine how many bytes are in this UTF-8 character.
 ///
 /// The answer is read from `UTF8_CHAR_WIDTH`; it is `0` when `b` cannot be
 /// the first byte of a character.
 #[inline]
 pub fn utf8_char_width(b: u8) -> usize {
     let width = UTF8_CHAR_WIDTH[b as usize];
     width as usize
 }

 /// Determines if a slice of `u16` code units is valid UTF-16.
 ///
 /// Returns `true` when every surrogate code unit in `v` belongs to a
 /// well-formed pair: a leading surrogate (`0xD800` to `0xDBFF`) immediately
 /// followed by a trailing surrogate (`0xDC00` to `0xDFFF`). An empty slice is
 /// valid.
 pub fn is_utf16(v: &[u16]) -> bool {
     let mut units = v.iter();
     while let Some(&lead) = units.next() {
         // For a 16-bit value `from_u32` fails only on surrogates, so anything
         // it accepts is a complete character on its own.
         if char::from_u32(lead as u32).is_some() {
             continue;
         }
         // `lead` is a surrogate (0xD800 or above). A trailing surrogate may
         // not open a pair.
         if lead > 0xDBFF {
             return false;
         }
         // A leading surrogate must be followed by a trailing one; running
         // out of input here leaves the pair unfinished.
         match units.next() {
             Some(&trail) if trail >= 0xDC00 && trail <= 0xDFFF => {}
             _ => return false,
         }
     }
     true
 }

 /// Iterator adaptor for encoding `char`s to UTF-16.
 ///
 /// Wraps an iterator of `char`s and yields, in order, the `u16` code units
 /// of each character.
 #[derive(Clone)]
 pub struct Utf16Encoder<I> {
     // Source of the characters that still have to be encoded.
     chars: I,
     // Second code unit of the most recently encoded character, held back
     // when that character needed two units and only the first has been
     // yielded so far. `0` means that nothing is held back.
     extra: u16,
 }

 impl<I> Utf16Encoder<I> {
     /// Create a UTF-16 encoder from any `char` iterator.
     ///
     /// The encoder starts out with no buffered code unit.
     pub fn new(chars: I) -> Utf16Encoder<I>
         where I: Iterator<Item = char>
     {
         // `extra == 0` is the "nothing buffered" state.
         let nothing_buffered = 0;
         Utf16Encoder { extra: nothing_buffered, chars: chars }
     }
 }

 impl<I> Iterator for Utf16Encoder<I>
     where I: Iterator<Item = char>
 {
     type Item = u16;

     /// Yields the next UTF-16 code unit, or `None` once the underlying
     /// iterator is exhausted and no buffered unit remains.
     #[inline]
     fn next(&mut self) -> Option<u16> {
         // Finish a two-unit character before pulling another `char`.
         if self.extra != 0 {
             let tmp = self.extra;
             self.extra = 0;
             return Some(tmp);
         }

         self.chars.next().map(|ch| {
             let n = CharExt::encode_utf16(ch);
             let n = n.as_slice();
             if n.len() == 2 {
                 // Hold the second unit back for the following call. Using 0
                 // as the "empty" marker relies on that unit never being 0
                 // (in UTF-16 it is a trailing surrogate).
                 self.extra = n[1];
             }
             n[0]
         })
     }

     /// Bounds on the number of code units still to be yielded.
     ///
     /// Every remaining `char` becomes one or two `u16`s, and a unit that is
     /// currently buffered in `extra` is yielded in addition to those, so it
     /// is counted in both bounds.
     #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
         let (low, high) = self.chars.size_hint();
         let pending = if self.extra != 0 { 1 } else { 0 };
         // The lower bound may saturate (it only has to be a lower bound); the
         // upper bound becomes `None` if it does not fit in a `usize`.
         let low = low.saturating_add(pending);
         let high = high.and_then(|n| n.checked_mul(2))
                        .and_then(|n| n.checked_add(pending));
         (low, high)
     }
 }

 impl<'a> Iterator for SplitWhitespace<'a> {
     type Item = &'a str;

     /// Returns the next non-empty piece from the front, as produced by the
     /// wrapped split-and-filter iterator.
     fn next(&mut self) -> Option<&'a str> {
         Iterator::next(&mut self.inner)
     }
 }
 impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
     /// Returns the next non-empty piece from the back, as produced by the
     /// wrapped split-and-filter iterator.
     fn next_back(&mut self) -> Option<&'a str> {
         DoubleEndedIterator::next_back(&mut self.inner)
     }
 }
	// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
	// file at the top-level directory of this distribution and at
	// http://rust-lang.org/COPYRIGHT.
	//
	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	// option. This file may not be copied, modified, or distributed
	// except according to those terms.

	//! Unicode-intensive string manipulations.
	//!
	//! This module provides functionality to `str` that requires the Unicode
	//! methods provided by the unicode parts of the CharExt trait.

	use core::char;
	use core::iter::Filter;
	use core::str::Split;

	/// An iterator over the non-whitespace substrings of a string,
	/// separated by any amount of whitespace.
	///
	/// Values of this type are produced by `UnicodeStr::split_whitespace`.
	#[stable(feature = "split_whitespace", since = "1.1.0")]
	pub struct SplitWhitespace<'a> {
	// The string is split at every whitespace `char`, and the empty pieces
	// that show up between adjacent separators are filtered out. Both
	// callbacks are stored as plain `fn` pointers so that the complete
	// iterator type can be spelled out in this field.
	inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
	}

	/// Methods for Unicode string slices
	///
	/// The short notes on each method describe the `str` implementation in this
	/// file; the user-facing documentation lives in libcollections.
	#[allow(missing_docs)] // docs in libcollections
	pub trait UnicodeStr {
	// Iterates over the pieces of `self` that lie between whitespace
	// characters, skipping empty pieces.
	fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>;
	// `true` when every `char` of `self` is whitespace (so also for "").
	fn is_whitespace(&self) -> bool;
	// `true` when every `char` of `self` is alphanumeric (so also for "").
	fn is_alphanumeric(&self) -> bool;
	// `self` with whitespace removed from both ends.
	fn trim(&self) -> &str;
	// `self` with whitespace removed from the start.
	fn trim_left(&self) -> &str;
	// `self` with whitespace removed from the end.
	fn trim_right(&self) -> &str;
	}

	impl UnicodeStr for str {
	    /// Yields the maximal runs of non-whitespace characters in `self`.
	    #[inline]
	    fn split_whitespace(&self) -> SplitWhitespace {
	        // Named functions rather than closures, so that they coerce to the
	        // plain `fn` pointer types stored inside `SplitWhitespace`.
	        fn is_not_empty(s: &&str) -> bool {
	            !s.is_empty()
	        }
	        let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer

	        fn is_whitespace(c: char) -> bool {
	            c.is_whitespace()
	        }
	        let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer

	        // Adjacent separators produce empty pieces; the filter drops them.
	        SplitWhitespace { inner: self.split(is_whitespace).filter(is_not_empty) }
	    }

	    /// Returns `true` if every character of `self` is whitespace.
	    #[inline]
	    fn is_whitespace(&self) -> bool {
	        self.chars().all(|c| c.is_whitespace())
	    }

	    /// Returns `true` if every character of `self` is alphanumeric.
	    #[inline]
	    fn is_alphanumeric(&self) -> bool {
	        self.chars().all(|c| c.is_alphanumeric())
	    }

	    /// Returns `self` without leading and trailing whitespace.
	    #[inline]
	    fn trim(&self) -> &str {
	        self.trim_matches(|c: char| c.is_whitespace())
	    }

	    /// Returns `self` without leading whitespace.
	    #[inline]
	    fn trim_left(&self) -> &str {
	        self.trim_left_matches(|c: char| c.is_whitespace())
	    }

	    /// Returns `self` without trailing whitespace.
	    #[inline]
	    fn trim_right(&self) -> &str {
	        self.trim_right_matches(|c: char| c.is_whitespace())
	    }
	}

	// Width table for UTF-8 sequences, indexed by the first byte of the sequence.
	// A value of 0 marks a byte that cannot start a character.
	// https://tools.ietf.org/html/rfc3629
	static UTF8_CHAR_WIDTH: [u8; 256] = [
	    // 0x00 - 0x7F: single-byte characters
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    // 0x80 - 0xBF: never a first byte
	    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	    // 0xC0 - 0xDF: two-byte sequences (0xC0 and 0xC1 are excluded)
	    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	    // 0xE0 - 0xEF: three-byte sequences
	    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
	    // 0xF0 - 0xFF: four-byte sequences up to 0xF4, nothing above
	    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,
	];

	/// Given a first byte, determine how many bytes are in this UTF-8 character.
	///
	/// The answer is read from `UTF8_CHAR_WIDTH`; it is `0` when `b` cannot be
	/// the first byte of a character.
	#[inline]
	pub fn utf8_char_width(b: u8) -> usize {
	    let width = UTF8_CHAR_WIDTH[b as usize];
	    width as usize
	}

	/// Determines if a slice of `u16` code units is valid UTF-16.
	///
	/// Returns `true` when every surrogate code unit in `v` belongs to a
	/// well-formed pair: a leading surrogate (`0xD800` to `0xDBFF`) immediately
	/// followed by a trailing surrogate (`0xDC00` to `0xDFFF`). An empty slice is
	/// valid.
	pub fn is_utf16(v: &[u16]) -> bool {
	    let mut units = v.iter();
	    while let Some(&lead) = units.next() {
	        // For a 16-bit value `from_u32` fails only on surrogates, so anything
	        // it accepts is a complete character on its own.
	        if char::from_u32(lead as u32).is_some() {
	            continue;
	        }
	        // `lead` is a surrogate (0xD800 or above). A trailing surrogate may
	        // not open a pair.
	        if lead > 0xDBFF {
	            return false;
	        }
	        // A leading surrogate must be followed by a trailing one; running
	        // out of input here leaves the pair unfinished.
	        match units.next() {
	            Some(&trail) if trail >= 0xDC00 && trail <= 0xDFFF => {}
	            _ => return false,
	        }
	    }
	    true
	}

	/// Iterator adaptor for encoding `char`s to UTF-16.
	///
	/// Wraps an iterator of `char`s and yields, in order, the `u16` code units
	/// of each character.
	#[derive(Clone)]
	pub struct Utf16Encoder<I> {
	// Source of the characters that still have to be encoded.
	chars: I,
	// Second code unit of the most recently encoded character, held back
	// when that character needed two units and only the first has been
	// yielded so far. `0` means that nothing is held back.
	extra: u16,
	}

	impl<I> Utf16Encoder<I> {
	    /// Create a UTF-16 encoder from any `char` iterator.
	    ///
	    /// The encoder starts out with no buffered code unit.
	    pub fn new(chars: I) -> Utf16Encoder<I>
	        where I: Iterator<Item = char>
	    {
	        // `extra == 0` is the "nothing buffered" state.
	        let nothing_buffered = 0;
	        Utf16Encoder { extra: nothing_buffered, chars: chars }
	    }
	}

	impl<I> Iterator for Utf16Encoder<I>
	    where I: Iterator<Item = char>
	{
	    type Item = u16;

	    /// Yields the next UTF-16 code unit, or `None` once the underlying
	    /// iterator is exhausted and no buffered unit remains.
	    #[inline]
	    fn next(&mut self) -> Option<u16> {
	        // Finish a two-unit character before pulling another `char`.
	        if self.extra != 0 {
	            let tmp = self.extra;
	            self.extra = 0;
	            return Some(tmp);
	        }

	        self.chars.next().map(|ch| {
	            let n = CharExt::encode_utf16(ch);
	            let n = n.as_slice();
	            if n.len() == 2 {
	                // Hold the second unit back for the following call. Using 0
	                // as the "empty" marker relies on that unit never being 0
	                // (in UTF-16 it is a trailing surrogate).
	                self.extra = n[1];
	            }
	            n[0]
	        })
	    }

	    /// Bounds on the number of code units still to be yielded.
	    ///
	    /// Every remaining `char` becomes one or two `u16`s, and a unit that is
	    /// currently buffered in `extra` is yielded in addition to those, so it
	    /// is counted in both bounds.
	    #[inline]
	    fn size_hint(&self) -> (usize, Option<usize>) {
	        let (low, high) = self.chars.size_hint();
	        let pending = if self.extra != 0 { 1 } else { 0 };
	        // The lower bound may saturate (it only has to be a lower bound); the
	        // upper bound becomes `None` if it does not fit in a `usize`.
	        let low = low.saturating_add(pending);
	        let high = high.and_then(|n| n.checked_mul(2))
	                       .and_then(|n| n.checked_add(pending));
	        (low, high)
	    }
	}

	impl<'a> Iterator for SplitWhitespace<'a> {
	    type Item = &'a str;

	    /// Returns the next non-empty piece from the front, as produced by the
	    /// wrapped split-and-filter iterator.
	    fn next(&mut self) -> Option<&'a str> {
	        Iterator::next(&mut self.inner)
	    }
	}
	impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
	    /// Returns the next non-empty piece from the back, as produced by the
	    /// wrapped split-and-filter iterator.
	    fn next_back(&mut self) -> Option<&'a str> {
	        DoubleEndedIterator::next_back(&mut self.inner)
	    }
	}