src/fonts/char_set/src/lib.rs - fuchsia - Git at Google

 // Copyright 2019 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 use {
     char_collection::CharCollection,
     std::{cmp::Ordering, convert::TryFrom, iter::Iterator},
 };

 type BitmapElement = u64;
 const BITMAP_ELEMENT_SIZE: usize = 64;
 /// This value is optimal for memory use, based on some non-scientific experimentation with
 /// real-world font files. (Though compared to `2048`, it does add a few nanoseconds to the average
 /// `contains` call.)
 const MAX_RANGE_GAP: u32 = 256;

 /// Represents an ordered set of code points that begin at [CharSetRange.start]. The largest
 /// allowed discontinuity between two consecutive code points in the set is [MAX_RANGE_GAP].
 #[derive(Debug, Clone, Hash, Eq, PartialEq)]
 struct CharSetRange {
     start: u32,
     bitmap: Vec<BitmapElement>,
 }

 impl CharSetRange {
     fn new() -> CharSetRange {
         CharSetRange { start: 0, bitmap: vec![] }
     }

     fn start(&self) -> u32 {
         self.start
     }

     fn end(&self) -> u32 {
         self.start + (self.bitmap.len() * BITMAP_ELEMENT_SIZE) as u32
     }

     fn is_empty(&self) -> bool {
         self.bitmap.is_empty()
     }

     fn add(&mut self, val: u32) {
         assert!(val >= self.start);
         assert!(char::try_from(val).is_ok());

         if self.bitmap.is_empty() {
             self.start = val;
         }

         let pos = (val - self.start) as usize;
         let element_pos = pos / BITMAP_ELEMENT_SIZE;

         if element_pos >= self.bitmap.len() {
             self.bitmap.resize(element_pos + 1, 0);
         }

         self.bitmap[element_pos] |= 1 << (pos % BITMAP_ELEMENT_SIZE);
     }

     fn contains(&self, c: u32) -> bool {
         if c < self.start || c >= self.end() {
             false
         } else {
             let index = c as usize - self.start as usize;
             (self.bitmap[index / BITMAP_ELEMENT_SIZE] & (1 << (index % BITMAP_ELEMENT_SIZE))) > 0
         }
     }

     fn iter(&self) -> CharSetRangeIterator<'_> {
         CharSetRangeIterator { char_set_range: &self, position: self.start.clone() }
     }
 }

 struct CharSetRangeIterator<'a> {
     char_set_range: &'a CharSetRange,
     position: u32,
 }

 impl Iterator for CharSetRangeIterator<'_> {
     type Item = char;

     fn next(&mut self) -> Option<char> {
         while self.position < self.char_set_range.end() {
             if self.char_set_range.contains(self.position) {
                 self.position += 1;
                 return Some(std::char::from_u32(self.position - 1).unwrap());
             }
             self.position += 1;
         }
         None
     }
 }

 /// Represents an ordered set of code points.
 ///
 /// TODO(kpozin): Add factory method that takes lazy `Iterator<Item = char>` instead of `Vec`.
 /// TODO(kpozin): Enforce `char` values.
 #[derive(Debug, Clone, Hash, Eq, PartialEq)]
 pub struct CharSet {
     ranges: Vec<CharSetRange>,
     len: usize,
 }

 impl CharSet {
     pub fn new(mut code_points: Vec<u32>) -> CharSet {
         let len = code_points.len();
         code_points.sort_unstable();

         let mut ranges = vec![];
         let mut range = CharSetRange::new();
         for c in code_points {
             if c != 0 && !range.is_empty() && c >= range.end() + MAX_RANGE_GAP {
                 ranges.push(range);
                 range = CharSetRange::new();
             }
             range.add(c);
         }
         if !range.is_empty() {
             ranges.push(range)
         }
         CharSet { ranges, len }
     }

     pub fn contains(&self, c: u32) -> bool {
         match self.ranges.binary_search_by(|r| {
             if r.end() < c {
                 Ordering::Less
             } else if r.start() > c {
                 Ordering::Greater
             } else {
                 Ordering::Equal
             }
         }) {
             Ok(r) => self.ranges[r].contains(c),
             Err(_) => false,
         }
     }

     pub fn is_empty(&self) -> bool {
         self.ranges.is_empty()
     }

     /// Returns the number of code points in the set.
     pub fn len(&self) -> usize {
         self.len
     }

     /// Iterate over all the characters in the the `CharSet` in code point order.
     pub fn iter(&self) -> impl Iterator<Item = char> + '_ {
         self.ranges.iter().flat_map(CharSetRange::iter)
     }
 }

 impl Default for CharSet {
     fn default() -> Self {
         CharSet::new(vec![])
     }
 }

 impl Into<CharCollection> for &CharSet {
     fn into(self) -> CharCollection {
         // Unwrapping is safe because we know `CharSet` iterates in order.
         CharCollection::from_sorted_chars(self.iter()).unwrap()
     }
 }

 impl From<CharCollection> for CharSet {
     fn from(value: CharCollection) -> CharSet {
         CharSet::from(&value)
     }
 }

 impl From<&CharCollection> for CharSet {
     fn from(value: &CharCollection) -> CharSet {
         CharSet::new(value.iter().map(|ch| ch as u32).collect::<Vec<u32>>())
     }
 }

 #[cfg(test)]
 mod tests {
     use {super::*, char_collection::char_collect};

     #[test]
     fn test_charset() {
         let charset = CharSet::new(vec![1, 2, 3, 10, 500, 5000, 5001, 10000]);
         assert!(!charset.contains(0));
         assert!(charset.contains(1));
         assert!(charset.contains(2));
         assert!(charset.contains(3));
         assert!(!charset.contains(4));
         assert!(!charset.contains(9));
         assert!(charset.contains(10));
         assert!(charset.contains(500));
         assert!(!charset.contains(501));
         assert!(charset.contains(5000));
         assert!(charset.contains(5001));
         assert!(!charset.contains(5002));
         assert!(!charset.contains(9999));
         assert!(charset.contains(10000));
         assert!(!charset.contains(10001));

         assert_eq!(
             charset.iter().map(|ch| ch as u32).collect::<Vec<u32>>(),
             vec![1, 2, 3, 10, 500, 5000, 5001, 10000]
         );
     }

     #[test]
     fn test_charset_from_char_collection() {
         let collection = char_collect!(0..=0, 2..=2, 13..=13, 32..=126);
         let charset = CharSet::from(&collection);
         assert!([0, 2, 13, 32, 54, 126].iter().all(|c| charset.contains(*c)));
         assert!([1, 11, 19, 127, 10000].iter().all(|c| !charset.contains(*c)));
     }
 }
	// Copyright 2019 The Fuchsia Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	use {
	char_collection::CharCollection,
	std::{cmp::Ordering, convert::TryFrom, iter::Iterator},
	};

	type BitmapElement = u64;
	const BITMAP_ELEMENT_SIZE: usize = 64;
	/// This value is optimal for memory use, based on some non-scientific experimentation with
	/// real-world font files. (Though compared to `2048`, it does add a few nanoseconds to the average
	/// `contains` call.)
	const MAX_RANGE_GAP: u32 = 256;

	/// Represents an ordered set of code points that begin at [CharSetRange.start]. The largest
	/// allowed discontinuity between two consecutive code points in the set is [MAX_RANGE_GAP].
	#[derive(Debug, Clone, Hash, Eq, PartialEq)]
	struct CharSetRange {
	start: u32,
	bitmap: Vec<BitmapElement>,
	}

	impl CharSetRange {
	fn new() -> CharSetRange {
	CharSetRange { start: 0, bitmap: vec![] }
	}

	fn start(&self) -> u32 {
	self.start
	}

	fn end(&self) -> u32 {
	self.start + (self.bitmap.len() * BITMAP_ELEMENT_SIZE) as u32
	}

	fn is_empty(&self) -> bool {
	self.bitmap.is_empty()
	}

	fn add(&mut self, val: u32) {
	assert!(val >= self.start);
	assert!(char::try_from(val).is_ok());

	if self.bitmap.is_empty() {
	self.start = val;
	}

	let pos = (val - self.start) as usize;
	let element_pos = pos / BITMAP_ELEMENT_SIZE;

	if element_pos >= self.bitmap.len() {
	self.bitmap.resize(element_pos + 1, 0);
	}

	self.bitmap[element_pos] \|= 1 << (pos % BITMAP_ELEMENT_SIZE);
	}

	fn contains(&self, c: u32) -> bool {
	if c < self.start \|\| c >= self.end() {
	false
	} else {
	let index = c as usize - self.start as usize;
	(self.bitmap[index / BITMAP_ELEMENT_SIZE] & (1 << (index % BITMAP_ELEMENT_SIZE))) > 0
	}
	}

	fn iter(&self) -> CharSetRangeIterator<'_> {
	CharSetRangeIterator { char_set_range: &self, position: self.start.clone() }
	}
	}

	struct CharSetRangeIterator<'a> {
	char_set_range: &'a CharSetRange,
	position: u32,
	}

	impl Iterator for CharSetRangeIterator<'_> {
	type Item = char;

	fn next(&mut self) -> Option<char> {
	while self.position < self.char_set_range.end() {
	if self.char_set_range.contains(self.position) {
	self.position += 1;
	return Some(std::char::from_u32(self.position - 1).unwrap());
	}
	self.position += 1;
	}
	None
	}
	}

	/// Represents an ordered set of code points.
	///
	/// TODO(kpozin): Add factory method that takes lazy `Iterator<Item = char>` instead of `Vec`.
	/// TODO(kpozin): Enforce `char` values.
	#[derive(Debug, Clone, Hash, Eq, PartialEq)]
	pub struct CharSet {
	ranges: Vec<CharSetRange>,
	len: usize,
	}

	impl CharSet {
	pub fn new(mut code_points: Vec<u32>) -> CharSet {
	let len = code_points.len();
	code_points.sort_unstable();

	let mut ranges = vec![];
	let mut range = CharSetRange::new();
	for c in code_points {
	if c != 0 && !range.is_empty() && c >= range.end() + MAX_RANGE_GAP {
	ranges.push(range);
	range = CharSetRange::new();
	}
	range.add(c);
	}
	if !range.is_empty() {
	ranges.push(range)
	}
	CharSet { ranges, len }
	}

	pub fn contains(&self, c: u32) -> bool {
	match self.ranges.binary_search_by(\|r\| {
	if r.end() < c {
	Ordering::Less
	} else if r.start() > c {
	Ordering::Greater
	} else {
	Ordering::Equal
	}
	}) {
	Ok(r) => self.ranges[r].contains(c),
	Err(_) => false,
	}
	}

	pub fn is_empty(&self) -> bool {
	self.ranges.is_empty()
	}

	/// Returns the number of code points in the set.
	pub fn len(&self) -> usize {
	self.len
	}

	/// Iterate over all the characters in the the `CharSet` in code point order.
	pub fn iter(&self) -> impl Iterator<Item = char> + '_ {
	self.ranges.iter().flat_map(CharSetRange::iter)
	}
	}

	impl Default for CharSet {
	fn default() -> Self {
	CharSet::new(vec![])
	}
	}

	impl Into<CharCollection> for &CharSet {
	fn into(self) -> CharCollection {
	// Unwrapping is safe because we know `CharSet` iterates in order.
	CharCollection::from_sorted_chars(self.iter()).unwrap()
	}
	}

	impl From<CharCollection> for CharSet {
	fn from(value: CharCollection) -> CharSet {
	CharSet::from(&value)
	}
	}

	impl From<&CharCollection> for CharSet {
	fn from(value: &CharCollection) -> CharSet {
	CharSet::new(value.iter().map(\|ch\| ch as u32).collect::<Vec<u32>>())
	}
	}

	#[cfg(test)]
	mod tests {
	use {super::*, char_collection::char_collect};

	#[test]
	fn test_charset() {
	let charset = CharSet::new(vec![1, 2, 3, 10, 500, 5000, 5001, 10000]);
	assert!(!charset.contains(0));
	assert!(charset.contains(1));
	assert!(charset.contains(2));
	assert!(charset.contains(3));
	assert!(!charset.contains(4));
	assert!(!charset.contains(9));
	assert!(charset.contains(10));
	assert!(charset.contains(500));
	assert!(!charset.contains(501));
	assert!(charset.contains(5000));
	assert!(charset.contains(5001));
	assert!(!charset.contains(5002));
	assert!(!charset.contains(9999));
	assert!(charset.contains(10000));
	assert!(!charset.contains(10001));

	assert_eq!(
	charset.iter().map(\|ch\| ch as u32).collect::<Vec<u32>>(),
	vec![1, 2, 3, 10, 500, 5000, 5001, 10000]
	);
	}

	#[test]
	fn test_charset_from_char_collection() {
	let collection = char_collect!(0..=0, 2..=2, 13..=13, 32..=126);
	let charset = CharSet::from(&collection);
	assert!([0, 2, 13, 32, 54, 126].iter().all(\|c\| charset.contains(*c)));
	assert!([1, 11, 19, 127, 10000].iter().all(\|c\| !charset.contains(*c)));
	}
	}