blob: 0bac44b837ff247dc80743994b5fe58daff5d3d9 [file] [log] [blame]
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Unicode-intensive string manipulations.
//!
//! This module provides functionality to `str` that requires the Unicode
//! methods provided by the unicode parts of the CharExt trait.
use core::char;
use core::iter::Filter;
use core::str::Split;
/// An iterator over the non-whitespace substrings of a string,
/// separated by any amount of whitespace.
#[stable(feature = "split_whitespace", since = "1.1.0")]
pub struct SplitWhitespace<'a> {
inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
}
/// Methods for Unicode string slices
#[allow(missing_docs)] // docs in libcollections
pub trait UnicodeStr {
fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>;
fn is_whitespace(&self) -> bool;
fn is_alphanumeric(&self) -> bool;
fn trim(&self) -> &str;
fn trim_left(&self) -> &str;
fn trim_right(&self) -> &str;
}
impl UnicodeStr for str {
#[inline]
fn split_whitespace(&self) -> SplitWhitespace {
fn is_not_empty(s: &&str) -> bool {
!s.is_empty()
}
let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer
fn is_whitespace(c: char) -> bool {
c.is_whitespace()
}
let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer
SplitWhitespace { inner: self.split(is_whitespace).filter(is_not_empty) }
}
#[inline]
fn is_whitespace(&self) -> bool {
self.chars().all(|c| c.is_whitespace())
}
#[inline]
fn is_alphanumeric(&self) -> bool {
self.chars().all(|c| c.is_alphanumeric())
}
#[inline]
fn trim(&self) -> &str {
self.trim_matches(|c: char| c.is_whitespace())
}
#[inline]
fn trim_left(&self) -> &str {
self.trim_left_matches(|c: char| c.is_whitespace())
}
#[inline]
fn trim_right(&self) -> &str {
self.trim_right_matches(|c: char| c.is_whitespace())
}
}
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];
/// Given a first byte, determine how many bytes are in this UTF-8 character
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
return UTF8_CHAR_WIDTH[b as usize] as usize;
}
/// Determines if a vector of `u16` contains valid UTF-16
pub fn is_utf16(v: &[u16]) -> bool {
let mut it = v.iter();
macro_rules! next { ($ret:expr) => {
match it.next() { Some(u) => *u, None => return $ret }
}
}
loop {
let u = next!(true);
match char::from_u32(u as u32) {
Some(_) => {}
None => {
let u2 = next!(false);
if u < 0xD7FF || u > 0xDBFF || u2 < 0xDC00 || u2 > 0xDFFF {
return false;
}
}
}
}
}
/// Iterator adaptor for encoding `char`s to UTF-16.
#[derive(Clone)]
pub struct Utf16Encoder<I> {
chars: I,
extra: u16,
}
impl<I> Utf16Encoder<I> {
/// Create a UTF-16 encoder from any `char` iterator.
pub fn new(chars: I) -> Utf16Encoder<I>
where I: Iterator<Item = char>
{
Utf16Encoder {
chars: chars,
extra: 0,
}
}
}
impl<I> Iterator for Utf16Encoder<I>
where I: Iterator<Item = char>
{
type Item = u16;
#[inline]
fn next(&mut self) -> Option<u16> {
if self.extra != 0 {
let tmp = self.extra;
self.extra = 0;
return Some(tmp);
}
self.chars.next().map(|ch| {
let n = CharExt::encode_utf16(ch);
let n = n.as_slice();
if n.len() == 2 {
self.extra = n[1];
}
n[0]
})
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let (low, high) = self.chars.size_hint();
// every char gets either one u16 or two u16,
// so this iterator is between 1 or 2 times as
// long as the underlying iterator.
(low, high.and_then(|n| n.checked_mul(2)))
}
}
impl<'a> Iterator for SplitWhitespace<'a> {
type Item = &'a str;
fn next(&mut self) -> Option<&'a str> {
self.inner.next()
}
}
impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
fn next_back(&mut self) -> Option<&'a str> {
self.inner.next_back()
}
}