blob: 7245439f6383d6459007e0880b45823ded9ea212 [file] [log] [blame]
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use {
failure::{format_err, Error},
std::cmp::Ordering,
};
type BitmapElement = u64;
const BITMAP_ELEMENT_SIZE: usize = 64;
const MAX_RANGE_GAP: u32 = 2048;
/// Represents an ordered set of code points that begin at [CharSetRange.start]. The largest
/// allowed discontinuity between two consecutive code points in the set is [MAX_RANGE_GAP].
#[derive(Debug, Clone)]
struct CharSetRange {
start: u32,
bitmap: Vec<BitmapElement>,
}
impl CharSetRange {
fn new() -> CharSetRange {
CharSetRange { start: 0, bitmap: vec![] }
}
fn start(&self) -> u32 {
self.start
}
fn end(&self) -> u32 {
self.start + (self.bitmap.len() * BITMAP_ELEMENT_SIZE) as u32
}
fn is_empty(&self) -> bool {
self.bitmap.is_empty()
}
fn add(&mut self, val: u32) {
assert!(val >= self.start);
if self.bitmap.is_empty() {
self.start = val;
}
let pos = (val - self.start) as usize;
let element_pos = pos / BITMAP_ELEMENT_SIZE;
if element_pos >= self.bitmap.len() {
self.bitmap.resize(element_pos + 1, 0);
}
self.bitmap[element_pos] |= 1 << (pos % BITMAP_ELEMENT_SIZE);
}
fn contains(&self, c: u32) -> bool {
if c < self.start || c >= self.end() {
false
} else {
let index = c as usize - self.start as usize;
(self.bitmap[index / 64] & (1 << (index % 64))) > 0
}
}
}
/// Represents an ordered set of code points.
///
/// TODO(kpozin): Evaluate replacing with `MultiCharRange`, which might be more space-efficient for
/// large sets with few discontinuities.
#[derive(Debug, Clone)]
pub struct CharSet {
ranges: Vec<CharSetRange>,
}
impl CharSet {
pub fn new(mut code_points: Vec<u32>) -> CharSet {
code_points.sort_unstable();
let mut ranges = vec![];
let mut range = CharSetRange::new();
for c in code_points {
if c != 0 && !range.is_empty() && c >= range.end() + MAX_RANGE_GAP {
ranges.push(range);
range = CharSetRange::new();
}
range.add(c);
}
if !range.is_empty() {
ranges.push(range)
}
CharSet { ranges }
}
pub fn from_string(s: String) -> Result<CharSet, Error> {
let mut code_points: Vec<u32> = vec![];
let mut prev: u32 = 0;
for range in s.split(',').filter(|x| !x.is_empty()) {
let mut split = range.split('+');
let offset: u32 = match split.next() {
Some(off) => off
.parse()
.or_else(|_| Err(format_err!("Failed to parse {:?} as u32.", off)))?,
None => return Err(format_err!("Failed to parse {:?}: not a valid range.", range)),
};
let length: u32 = match split.next() {
Some(len) => len
.parse()
.or_else(|_| Err(format_err!("Failed to parse {:?} as u32.", len)))?,
None => 0, // We can treat "0,2,..." as "0+0,2+0,..."
};
if split.next().is_some() {
return Err(format_err!("Failed to parse {:?}: not a valid range.", range));
}
let begin = prev + offset;
prev = begin + length;
code_points.extend(begin..=prev);
}
Ok(CharSet::new(code_points))
}
pub fn contains(&self, c: u32) -> bool {
match self.ranges.binary_search_by(|r| {
if r.end() < c {
Ordering::Less
} else if r.start() > c {
Ordering::Greater
} else {
Ordering::Equal
}
}) {
Ok(r) => self.ranges[r].contains(c),
Err(_) => false,
}
}
pub fn is_empty(&self) -> bool {
self.ranges.is_empty()
}
}
impl Default for CharSet {
fn default() -> Self {
CharSet::new(vec![])
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_charset() {
let charset = CharSet::new(vec![1, 2, 3, 10, 500, 5000, 5001, 10000]);
assert!(!charset.contains(0));
assert!(charset.contains(1));
assert!(charset.contains(2));
assert!(charset.contains(3));
assert!(!charset.contains(4));
assert!(!charset.contains(9));
assert!(charset.contains(10));
assert!(charset.contains(500));
assert!(!charset.contains(501));
assert!(charset.contains(5000));
assert!(charset.contains(5001));
assert!(!charset.contains(5002));
assert!(!charset.contains(9999));
assert!(charset.contains(10000));
assert!(!charset.contains(10001));
}
#[test]
fn test_charset_from_string() -> Result<(), Error> {
let s = "0,2,11,19+94".to_string();
let charset = CharSet::from_string(s)?;
assert!([0, 2, 13, 32, 54, 126].into_iter().all(|c| charset.contains(*c)));
assert!([1, 11, 19, 127, 10000].into_iter().all(|c| !charset.contains(*c)));
Ok(())
}
#[test]
fn test_charset_from_string_not_a_number() {
for s in &["0,p,11", "q+1", "3+r"] {
assert!(CharSet::from_string(s.to_string()).is_err())
}
}
}