use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::fmt::{self, Write};
use std::ops::Range;

use crate::fmt_list;
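
/// Accumulates generated Rust source in `file`, together with a short
/// human-readable `desc` of the encoding strategy chosen and a running
/// `bytes_used` count of the static data emitted (used to compare competing
/// strategies by size).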
#[derive(Clone)]
pub struct RawEmitter {
    pub file: String,
    pub desc: String,
    pub bytes_used: usize,
}

impl RawEmitter {
    pub fn new() -> RawEmitter {
        RawEmitter { file: String::new(), bytes_used: 0, desc: String::new() }
    }

    fn blank_line(&mut self) {
        if self.file.is_empty() || self.file.ends_with("\n\n") {
            return;
        }
        writeln!(&mut self.file).unwrap();
    }
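
    /// Emits a compressed bitset lookup. The raw 64-bit words are
    /// deduplicated, canonicalized (words expressible as a rotation,
    /// inversion, or shift of another word are dropped), and reached through
    /// a two-level chunk index. Errors if the deduplicated words no longer
    /// fit an 8-bit index.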
    fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> {
        let first_code_point = ranges.first().unwrap().start;
        let last_code_point = ranges.last().unwrap().end;
        // bitset for every bit in the codepoint range
        //
        // + 2 to ensure an all zero word to use for padding
        let mut buckets = vec![0u64; (last_code_point as usize / 64) + 2];
        for range in ranges {
            for codepoint in range.clone() {
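                // e.g., codepoint 197 (U+00C5) sets bit 197 % 64 = 5 of
                // word 197 / 64 = 3.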
                let bucket = codepoint as usize / 64;
                let bit = codepoint as u64 % 64;
                buckets[bucket] |= 1 << bit;
            }
        }

        let mut words = buckets;
        // Ensure that there's a zero word in the dataset, which
        // `emit_chunk_map` relies on for padding.
        words.push(0);
        let unique_words =
            words.iter().cloned().collect::<BTreeSet<_>>().into_iter().collect::<Vec<_>>();
        if unique_words.len() > u8::MAX as usize {
            return Err(format!("cannot pack {} into 8 bits", unique_words.len()));
        }
        // needed for the chunk mapping to work
        assert_eq!(unique_words[0], 0, "has a zero word");
        let canonicalized = Canonicalized::canonicalize(&unique_words);

        let word_indices = canonicalized.unique_mapping.clone();
        let compressed_words = words.iter().map(|w| word_indices[w]).collect::<Vec<u8>>();
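
        // Try every chunk length from 1 to 64 on a scratch copy of the
        // emitter and keep whichever length emits the fewest total bytes.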
        let mut best = None;
        for length in 1..=64 {
            let mut temp = self.clone();
            temp.emit_chunk_map(word_indices[&0], &compressed_words, length);
            if let Some((_, size)) = best {
                if temp.bytes_used < size {
                    best = Some((length, temp.bytes_used));
                }
            } else {
                best = Some((length, temp.bytes_used));
            }
        }
        self.emit_chunk_map(word_indices[&0], &compressed_words, best.unwrap().0);
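
        // Formatting helper so each canonical word is printed as a full
        // 64-digit binary literal, keeping the generated table legible.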
        struct Bits(u64);
        impl fmt::Debug for Bits {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                write!(f, "0b{:064b}", self.0)
            }
        }

        writeln!(
            &mut self.file,
            "static BITSET_CANONICAL: [u64; {}] = [{}];",
            canonicalized.canonical_words.len(),
            fmt_list(canonicalized.canonical_words.iter().map(|v| Bits(*v))),
        )
        .unwrap();
        self.bytes_used += 8 * canonicalized.canonical_words.len();
        writeln!(
            &mut self.file,
            "static BITSET_MAPPING: [(u8, u8); {}] = [{}];",
            canonicalized.canonicalized_words.len(),
            fmt_list(&canonicalized.canonicalized_words),
        )
        .unwrap();
        // Each entry is an 8-bit index into BITSET_CANONICAL plus an 8-bit
        // mapping descriptor (shift/invert flags and a 6-bit amount). Entries
        // are only needed for the words removed by canonicalization.
        self.bytes_used += 2 * canonicalized.canonicalized_words.len();

        self.blank_line();

        writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
        if first_code_point > 0x7f {
            writeln!(&mut self.file, "    (c as u32) >= {first_code_point:#04x} &&").unwrap();
        }
        writeln!(&mut self.file, "    super::bitset_search(").unwrap();
        writeln!(&mut self.file, "        c as u32,").unwrap();
        writeln!(&mut self.file, "        &BITSET_CHUNKS_MAP,").unwrap();
        writeln!(&mut self.file, "        &BITSET_INDEX_CHUNKS,").unwrap();
        writeln!(&mut self.file, "        &BITSET_CANONICAL,").unwrap();
        writeln!(&mut self.file, "        &BITSET_MAPPING,").unwrap();
        writeln!(&mut self.file, "    )").unwrap();
        writeln!(&mut self.file, "}}").unwrap();

        Ok(())
    }
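
    /// Emits the two-level index: `BITSET_CHUNKS_MAP` maps each chunk of the
    /// codepoint space to a deduplicated chunk id, while
    /// `BITSET_INDEX_CHUNKS` stores each distinct chunk of word indices
    /// exactly once.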
    fn emit_chunk_map(&mut self, zero_at: u8, compressed_words: &[u8], chunk_length: usize) {
        let mut compressed_words = compressed_words.to_vec();
        // Pad out the bitset index with the zero word so that its length is
        // an exact multiple of `chunk_length`.
        while compressed_words.len() % chunk_length != 0 {
            compressed_words.push(zero_at);
        }

        let mut chunks = BTreeSet::new();
        for chunk in compressed_words.chunks(chunk_length) {
            chunks.insert(chunk);
        }
        let chunk_map =
            chunks.iter().enumerate().map(|(idx, &chunk)| (chunk, idx)).collect::<HashMap<_, _>>();
        let mut chunk_indices = Vec::new();
        for chunk in compressed_words.chunks(chunk_length) {
            chunk_indices.push(chunk_map[chunk]);
        }
        writeln!(
            &mut self.file,
            "static BITSET_CHUNKS_MAP: [u8; {}] = [{}];",
            chunk_indices.len(),
            fmt_list(&chunk_indices),
        )
        .unwrap();
        self.bytes_used += chunk_indices.len();
        writeln!(
            &mut self.file,
            "static BITSET_INDEX_CHUNKS: [[u8; {}]; {}] = [{}];",
            chunk_length,
            chunks.len(),
            fmt_list(chunks.iter()),
        )
        .unwrap();
        self.bytes_used += chunk_length * chunks.len();
    }
}
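
/// Tries both the bitset and skiplist encodings on scratch copies and keeps
/// whichever emits fewer bytes (falling back to the skiplist when the bitset
/// encoding fails), recording the choice in `desc`.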
pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
    emitter.blank_line();

    let mut bitset = emitter.clone();
    let bitset_ok = bitset.emit_bitset(ranges).is_ok();

    let mut skiplist = emitter.clone();
    skiplist.emit_skiplist(ranges);

    if bitset_ok && bitset.bytes_used <= skiplist.bytes_used {
        *emitter = bitset;
        emitter.desc = String::from("bitset");
    } else {
        *emitter = skiplist;
        emitter.desc = String::from("skiplist");
    }
}
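
/// Whitespace always uses the cascading lookup table; unlike
/// `emit_codepoints`, no alternative encoding is measured.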
pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
    emitter.blank_line();

    let mut cascading = emitter.clone();
    cascading.emit_cascading_map(ranges);
    *emitter = cascading;
    emitter.desc = String::from("cascading");
}
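
/// The result of deduplicating bitset words: `canonical_words` are stored
/// verbatim, while each entry of `canonicalized_words` reconstructs one
/// removed word by applying a rotation, inversion, or shift to a canonical
/// word.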
struct Canonicalized {
    canonical_words: Vec<u64>,
    canonicalized_words: Vec<(u8, u8)>,

    /// Maps an input unique word to its u8 index into the combined table:
    /// canonical_words first, then canonicalized_words.
    unique_mapping: HashMap<u64, u8>,
}

impl Canonicalized {
    fn canonicalize(unique_words: &[u64]) -> Self {
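        // How a removed word is derived from a canonical word. Encoded into
        // a single byte further below: bit 7 = shift, bit 6 = invert, and
        // the low 6 bits hold the rotate/shift amount.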
        #[derive(Copy, Clone, Debug)]
        enum Mapping {
            Rotate(u32),
            Invert,
            RotateAndInvert(u32),
            ShiftRight(u32),
        }

        // key is the word being mapped to
        let mut mappings: BTreeMap<u64, Vec<(u64, Mapping)>> = BTreeMap::new();
        for &a in unique_words {
            'b: for &b in unique_words {
                // skip self
                if a == b {
                    continue;
                }

                // All possible distinct rotations
                for rotation in 1..64 {
                    if a.rotate_right(rotation) == b {
                        mappings.entry(b).or_default().push((a, Mapping::Rotate(rotation)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }

                if (!a) == b {
                    mappings.entry(b).or_default().push((a, Mapping::Invert));
                    // We're not interested in further mappings between a and b
                    continue 'b;
                }

                // All possible distinct rotations, inverted
                for rotation in 1..64 {
                    if (!a.rotate_right(rotation)) == b {
                        mappings
                            .entry(b)
                            .or_default()
                            .push((a, Mapping::RotateAndInvert(rotation)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }

                // All possible shifts
                for shift_by in 1..64 {
                    if a == (b >> shift_by) {
                        mappings
                            .entry(b)
                            .or_default()
                            .push((a, Mapping::ShiftRight(shift_by as u32)));
                        // We're not interested in further mappings between a and b
                        continue 'b;
                    }
                }
            }
        }
        // These are the bitset words which will be represented "raw" (as a u64)
        let mut canonical_words = Vec::new();
        // These are mapped words, each represented by an index into
        // canonical_words plus a Mapping; encoded as a (u8, u8) pair.
        let mut canonicalized_words = Vec::new();
        let mut unique_mapping = HashMap::new();

        #[derive(Debug, PartialEq, Eq)]
        enum UniqueMapping {
            Canonical(usize),
            Canonicalized(usize),
        }

        // Map 0 first, so that it is the first canonical word.
        // This is realistically not inefficient because 0 is not mapped to by
        // anything else (a shift pattern could do it, but would be wasteful).
        //
        // However, 0s are quite common in the overall dataset, and it is quite
        // wasteful to have to go through a mapping function to determine that
        // we have a zero.
        //
        // FIXME: Experiment with choosing most common words in overall data set
        // for canonical when possible.
        while let Some((&to, _)) = mappings
            .iter()
            .find(|(&to, _)| to == 0)
            .or_else(|| mappings.iter().max_by_key(|m| m.1.len()))
        {
            // Take the mapping target with the most entries. Currently, no
            // mapping can exist only transitively (i.e., there is no A, B, C
            // such that A maps to B and B maps to C but A does not map to C),
            // so this greedy choice is guaranteed to be acceptable.
            //
            // In the future, we may need a more sophisticated algorithm to
            // identify which keys to prefer as canonical.
            let mapped_from = mappings.remove(&to).unwrap();
            for (from, how) in &mapped_from {
                // Remove the entries which mapped to this one, and record
                // that each is associated with the Nth canonicalized word.
                //
                // We do not assert that this is present, because there may be
                // no mappings to the `from` word; that's fine.
                mappings.remove(from);
                assert_eq!(
                    unique_mapping
                        .insert(*from, UniqueMapping::Canonicalized(canonicalized_words.len())),
                    None
                );
                canonicalized_words.push((canonical_words.len(), *how));

                // Remove the now-canonicalized word from other mappings,
                // to ensure that we deprioritize them in the next iteration of
                // the while loop.
                for mapped in mappings.values_mut() {
                    let mut i = 0;
                    while i != mapped.len() {
                        if mapped[i].0 == *from {
                            mapped.remove(i);
                        } else {
                            i += 1;
                        }
                    }
                }
            }
            assert!(
                unique_mapping
                    .insert(to, UniqueMapping::Canonical(canonical_words.len()))
                    .is_none()
            );
            canonical_words.push(to);

            // Remove the now-canonical word from other mappings, to ensure that
            // we deprioritize them in the next iteration of the while loop.
            for mapped in mappings.values_mut() {
                let mut i = 0;
                while i != mapped.len() {
                    if mapped[i].0 == to {
                        mapped.remove(i);
                    } else {
                        i += 1;
                    }
                }
            }
        }

        // Any words which we couldn't shrink are simply appended to the
        // canonical words.
        //
        // FIXME: work harder -- there are more possibilities for mapping
        // functions (e.g., multiplication, shifting instead of rotation, etc.)
        // We'll probably always have some slack though so this loop will still
        // be needed.
        for &w in unique_words {
            if !unique_mapping.contains_key(&w) {
                assert!(
                    unique_mapping
                        .insert(w, UniqueMapping::Canonical(canonical_words.len()))
                        .is_none()
                );
                canonical_words.push(w);
            }
        }
        assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
        assert_eq!(unique_mapping.len(), unique_words.len());

        let unique_mapping = unique_mapping
            .into_iter()
            .map(|(key, value)| {
                (key, match value {
                    UniqueMapping::Canonicalized(idx) => {
                        u8::try_from(canonical_words.len() + idx).unwrap()
                    }
                    UniqueMapping::Canonical(idx) => u8::try_from(idx).unwrap(),
                })
            })
            .collect::<HashMap<_, _>>();

        let mut distinct_indices = BTreeSet::new();
        for &w in unique_words {
            let idx = unique_mapping.get(&w).unwrap();
            assert!(distinct_indices.insert(idx));
        }

        const LOWER_6: u32 = (1 << 6) - 1;
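
        // Rotate/shift amounts are always < 64, so they fit in the low six
        // bits of the mapping byte; the asserts below enforce this.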
        let canonicalized_words = canonicalized_words
            .into_iter()
            .map(|v| {
                (u8::try_from(v.0).unwrap(), match v.1 {
                    Mapping::RotateAndInvert(amount) => {
                        assert_eq!(amount, amount & LOWER_6);
                        1 << 6 | (amount as u8)
                    }
                    Mapping::Rotate(amount) => {
                        assert_eq!(amount, amount & LOWER_6);
                        amount as u8
                    }
                    Mapping::Invert => 1 << 6,
                    Mapping::ShiftRight(shift_by) => {
                        assert_eq!(shift_by, shift_by & LOWER_6);
                        1 << 7 | (shift_by as u8)
                    }
                })
            })
            .collect::<Vec<(u8, u8)>>();
        Canonicalized { unique_mapping, canonical_words, canonicalized_words }
    }
}
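
// A minimal sanity check; an illustrative sketch rather than part of the
// original generator. Two words that are rotations of each other should
// collapse into one canonical word plus a single (index, mapping byte) entry.
#[cfg(test)]
mod tests {
    use super::Canonicalized;

    #[test]
    fn rotation_collapses_to_one_canonical_word() {
        let a = 0b1u64;
        let b = a.rotate_right(3); // 1 << 61
        let canon = Canonicalized::canonicalize(&[a, b]);
        // The larger word wins the tie on mapping-entry count (`max_by_key`
        // returns the last maximum) and is stored raw.
        assert_eq!(canon.canonical_words, vec![b]);
        // `a` is rebuilt from canonical word 0 via Rotate(3): both flag bits
        // (6 and 7) clear, rotation amount in the low six bits.
        assert_eq!(canon.canonicalized_words, vec![(0, 3)]);
        // Indices: canonical words first, then canonicalized ones.
        assert_eq!(canon.unique_mapping[&b], 0);
        assert_eq!(canon.unique_mapping[&a], 1);
    }
}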