| //! This module contains functionality for doing lz77 compression of data. |
| #![macro_use] |
| use std::cmp; |
| use std::ops::{Range, RangeFrom}; |
| use std::iter::{self, Iterator}; |
| use std::slice::Iter; |
| use std::fmt; |
| |
| use input_buffer::InputBuffer; |
| use matching::longest_match; |
| #[cfg(test)] |
| use lzvalue::{LZValue, LZType}; |
| use huffman_table; |
| use chained_hash_table::{ChainedHashTable, update_hash}; |
| #[cfg(test)] |
| use compression_options::{HIGH_MAX_HASH_CHECKS, HIGH_LAZY_IF_LESS_THAN}; |
| use output_writer::{BufferStatus, DynamicWriter}; |
| use compress::Flush; |
| use rle::process_chunk_greedy_rle; |
| |
| const MAX_MATCH: usize = huffman_table::MAX_MATCH as usize; |
| const MIN_MATCH: usize = huffman_table::MIN_MATCH as usize; |
| |
| const NO_RLE: u16 = 43212; |
| |
| /// An enum describing whether we use lazy or greedy matching. |
| #[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] |
| pub enum MatchingType { |
| /// Use greedy matching: the matching algorithm simply uses a match right away |
| /// if found. |
| Greedy, |
| /// Use lazy matching: after finding a match, the next input byte is checked, to see |
| /// if there is a better match starting at that byte. |
| /// |
| /// As a special case, if max_hash_checks is set to 0, compression using only run-length |
| /// (i.e maximum match distance of 1) is performed instead. |
| Lazy, |
| } |
| |
| impl fmt::Display for MatchingType { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| match *self { |
| MatchingType::Greedy => write!(f, "Greedy matching"), |
| MatchingType::Lazy => write!(f, "Lazy matching"), |
| } |
| } |
| } |
| |
| /// A struct that contains the hash table, and keeps track of where we are in the input data |
| pub struct LZ77State { |
| /// Struct containing hash chains that will be used to find matches. |
| hash_table: ChainedHashTable, |
| /// True if this is the first window that is being processed. |
| is_first_window: bool, |
| /// Set to true when the last block has been processed. |
| is_last_block: bool, |
| /// How many bytes the last match in the previous window extended into the current one. |
| overlap: usize, |
| /// How many bytes of input the current block contains. |
| current_block_input_bytes: u64, |
| /// The maximum number of hash entries to search. |
| max_hash_checks: u16, |
| /// Only lazy match if we have a match length less than this. |
| lazy_if_less_than: u16, |
| /// Whether to use greedy or lazy parsing |
| matching_type: MatchingType, |
| /// Keep track of the previous match and byte in case the buffer is full when lazy matching. |
| match_state: ChunkState, |
| /// Keep track of how many bytes in the lookahead that was part of a match, but has not been |
| /// added to the hash chain yet. |
| bytes_to_hash: usize, |
| /// Keep track of if sync flush was used. If this is the case, the two first bytes needs to be |
| /// hashed. |
| was_synced: bool, |
| } |
| |
| impl LZ77State { |
| /// Creates a new LZ77 state |
| pub fn new( |
| max_hash_checks: u16, |
| lazy_if_less_than: u16, |
| matching_type: MatchingType, |
| ) -> LZ77State { |
| LZ77State { |
| hash_table: ChainedHashTable::new(), |
| is_first_window: true, |
| is_last_block: false, |
| overlap: 0, |
| current_block_input_bytes: 0, |
| max_hash_checks: max_hash_checks, |
| lazy_if_less_than: lazy_if_less_than, |
| matching_type: matching_type, |
| match_state: ChunkState::new(), |
| bytes_to_hash: 0, |
| was_synced: false, |
| } |
| } |
| |
| /// Resets the state excluding max_hash_checks and lazy_if_less_than |
| pub fn reset(&mut self) { |
| self.hash_table.reset(); |
| self.is_first_window = true; |
| self.is_last_block = false; |
| self.overlap = 0; |
| self.current_block_input_bytes = 0; |
| self.match_state = ChunkState::new(); |
| self.bytes_to_hash = 0 |
| } |
| |
| pub fn set_last(&mut self) { |
| self.is_last_block = true; |
| } |
| |
| /// Is this the last block we are outputting? |
| pub fn is_last_block(&self) -> bool { |
| self.is_last_block |
| } |
| |
| /// How many bytes of input the current block contains. |
| pub fn current_block_input_bytes(&self) -> u64 { |
| self.current_block_input_bytes |
| } |
| |
| /// Sets the number of input bytes for the current block to 0. |
| pub fn reset_input_bytes(&mut self) { |
| self.current_block_input_bytes = 0; |
| } |
| |
| /// Is there a buffered byte that has not been output yet? |
| pub fn pending_byte(&self) -> bool { |
| self.match_state.add |
| } |
| |
| /// Returns 1 if pending_byte is true, 0 otherwise. |
| pub fn pending_byte_as_num(&self) -> usize { |
| // This could be implemented by using `as usize` as the documentation states this would give |
| // the same result, but not sure if that should be relied upon. |
| if self.match_state.add { |
| 1 |
| } else { |
| 0 |
| } |
| } |
| } |
| |
| const DEFAULT_WINDOW_SIZE: usize = 32768; |
| |
| #[derive(Debug)] |
| /// Status after calling `process_chunk`. |
| pub enum ProcessStatus { |
| /// All the input data was processed. |
| Ok, |
| /// The output buffer was full. |
| /// |
| /// The argument is the position of the next byte to be checked by `process_chunk`. |
| BufferFull(usize), |
| } |
| |
| #[derive(Debug)] |
| /// A struct to keep track of status between calls of `process_chunk_lazy` |
| /// |
| /// This is needed as the output buffer might become full before having output all pending data. |
| pub struct ChunkState { |
| /// Length of the last match that was found, if any. |
| current_length: u16, |
| /// Distance of the last match that was found, if any. |
| current_distance: u16, |
| /// The previous byte checked in process_chunk. |
| prev_byte: u8, |
| /// The current byte being checked in process_chunk. |
| cur_byte: u8, |
| /// Whether prev_byte still needs to be output. |
| add: bool, |
| } |
| |
| impl ChunkState { |
| pub fn new() -> ChunkState { |
| ChunkState { |
| current_length: 0, |
| current_distance: 0, |
| prev_byte: 0, |
| cur_byte: 0, |
| add: false, |
| } |
| } |
| } |
| |
| pub fn buffer_full(position: usize) -> ProcessStatus { |
| ProcessStatus::BufferFull(position) |
| } |
| |
| fn process_chunk( |
| data: &[u8], |
| iterated_data: &Range<usize>, |
| mut match_state: &mut ChunkState, |
| hash_table: &mut ChainedHashTable, |
| writer: &mut DynamicWriter, |
| max_hash_checks: u16, |
| lazy_if_less_than: usize, |
| matching_type: MatchingType, |
| ) -> (usize, ProcessStatus) { |
| let avoid_rle = if cfg!(test) { |
| // Avoid RLE if lazy_if_less than is a specific value. |
| // This is used in some tests, ideally we should probably do this in a less clunky way, |
| // but we use a value here that is higher than the maximum sensible one anyhow, and will |
| // be truncated by deflate_state for calls from outside the library. |
| lazy_if_less_than == NO_RLE as usize |
| } else { |
| false |
| }; |
| match matching_type { |
| MatchingType::Greedy => { |
| process_chunk_greedy(data, iterated_data, hash_table, writer, max_hash_checks) |
| } |
| MatchingType::Lazy => { |
| if max_hash_checks > 0 || avoid_rle { |
| process_chunk_lazy( |
| data, |
| iterated_data, |
| &mut match_state, |
| hash_table, |
| writer, |
| max_hash_checks, |
| lazy_if_less_than, |
| ) |
| } else { |
| // Use the RLE method if max_hash_checks is set to 0. |
| process_chunk_greedy_rle(data, iterated_data, writer) |
| } |
| } |
| } |
| } |
| |
| /// Add the specified number of bytes to the hash table from the iterators |
| /// adding `start` to the position supplied to the hash table. |
| fn add_to_hash_table( |
| bytes_to_add: usize, |
| insert_it: &mut iter::Zip<RangeFrom<usize>, Iter<u8>>, |
| hash_it: &mut Iter<u8>, |
| hash_table: &mut ChainedHashTable, |
| ) { |
| let taker = insert_it.by_ref().take(bytes_to_add); |
| let mut hash_taker = hash_it.by_ref().take(bytes_to_add); |
| // Update the hash manually here to keep it in a register. |
| let mut hash = hash_table.current_hash(); |
| // Advance the iterators and add the bytes we jump over to the hash table and |
| // checksum |
| for (ipos, _) in taker { |
| if let Some(&i_hash_byte) = hash_taker.next() { |
| hash = update_hash(hash, i_hash_byte); |
| hash_table.add_with_hash(ipos, hash); |
| } |
| } |
| // Write the hash back once we are done. |
| hash_table.set_hash(hash); |
| } |
| |
| /// Write the specified literal `byte` to the writer `w`, and return |
| /// `ProcessStatus::BufferFull($pos)` if the buffer is full after writing. |
| /// |
| /// `pos` should indicate the byte to start at in the next call to `process_chunk`, |
| /// `is_hashed` should be set to true of the byte at pos has been added to the hash chain. |
| macro_rules! write_literal{ |
| ($w:ident, $byte:expr, $pos:expr) => { |
| let b_status = $w.write_literal($byte); |
| |
| if let BufferStatus::Full = b_status { |
| return (0, buffer_full($pos)); |
| } |
| }; |
| } |
| |
| /// If the match is only 3 bytes long and the distance is more than 8 * 1024, it's likely to take |
| /// up more space than it would save. |
| #[inline] |
| fn match_too_far(match_len: usize, match_dist: usize) -> bool { |
| const TOO_FAR: usize = 8 * 1024; |
| match_len == MIN_MATCH && match_dist > TOO_FAR |
| } |
| |
| ///Create the iterators used when processing through a chunk of data. |
| fn create_iterators<'a>( |
| data: &'a [u8], |
| iterated_data: &Range<usize>, |
| ) -> ( |
| usize, |
| iter::Zip<RangeFrom<usize>, Iter<'a, u8>>, |
| Iter<'a, u8>, |
| ) { |
| let end = cmp::min(data.len(), iterated_data.end); |
| let start = iterated_data.start; |
| let current_chunk = &data[start..end]; |
| |
| let insert_it = (start..).zip(current_chunk.iter()); |
| let hash_it = { |
| let hash_start = if data.len() - start > 2 { |
| start + 2 |
| } else { |
| data.len() |
| }; |
| (&data[hash_start..]).iter() |
| }; |
| (end, insert_it, hash_it) |
| } |
| |
| fn process_chunk_lazy( |
| data: &[u8], |
| iterated_data: &Range<usize>, |
| state: &mut ChunkState, |
| mut hash_table: &mut ChainedHashTable, |
| writer: &mut DynamicWriter, |
| max_hash_checks: u16, |
| lazy_if_less_than: usize, |
| ) -> (usize, ProcessStatus) { |
| |
| let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data); |
| |
| const NO_LENGTH: u16 = 0; |
| |
| // The previous match length, if any. |
| let mut prev_length = state.current_length; |
| // The distance of the previous match if any. |
| let mut prev_distance = state.current_distance; |
| |
| state.current_length = 0; |
| state.current_distance = 0; |
| |
| // The number of bytes past end that was added due to finding a match that extends into |
| // the lookahead window. |
| let mut overlap = 0; |
| |
| |
| // Set to true if we found a match that is equal to or longer than `lazy_if_less_than`, |
| // indicating that we won't lazy match (check for a better match at the next byte). |
| // If we had a good match, carry this over from the previous call. |
| let mut ignore_next = prev_length as usize >= lazy_if_less_than; |
| |
| // This is to output the correct byte in case there is one pending to be output |
| // from the previous call. |
| state.prev_byte = state.cur_byte; |
| |
| // Iterate through the slice, adding literals or length/distance pairs |
| while let Some((position, &b)) = insert_it.next() { |
| state.cur_byte = b; |
| if let Some(&hash_byte) = hash_it.next() { |
| hash_table.add_hash_value(position, hash_byte); |
| |
| // Only lazy match if we have a match shorter than a set value |
| // TODO: This should be cleaned up a bit |
| if !ignore_next { |
| let (mut match_len, match_dist) = { |
| // If there already was a decent match at the previous byte |
| // and we are lazy matching, do less match checks in this step. |
| let max_hash_checks = if prev_length >= 32 { |
| max_hash_checks >> 2 |
| } else { |
| max_hash_checks |
| }; |
| |
| // Check if we can find a better match here than the one we had at |
| // the previous byte. |
| longest_match( |
| data, |
| hash_table, |
| position, |
| prev_length as usize, |
| max_hash_checks, |
| ) |
| }; |
| |
| // If the match is only 3 bytes long and very far back, it's probably not worth |
| // outputting. |
| if match_too_far(match_len, match_dist) { |
| match_len = NO_LENGTH as usize; |
| }; |
| |
| if match_len >= lazy_if_less_than { |
| // We found a decent match, so we won't check for a better one at the next byte. |
| ignore_next = true; |
| } |
| state.current_length = match_len as u16; |
| state.current_distance = match_dist as u16; |
| } else { |
| // We already had a decent match, so we don't bother checking for another one. |
| state.current_length = NO_LENGTH; |
| state.current_distance = 0; |
| // Make sure we check again next time. |
| ignore_next = false; |
| }; |
| |
| if prev_length >= state.current_length && prev_length >= MIN_MATCH as u16 { |
| // The previous match was better so we add it. |
| // Casting note: length and distance is already bounded by the longest match |
| // function. Usize is just used for convenience. |
| let b_status = |
| writer.write_length_distance(prev_length as u16, prev_distance as u16); |
| |
| // We add the bytes to the hash table and checksum. |
| // Since we've already added two of them, we need to add two less than |
| // the length. |
| let bytes_to_add = prev_length - 2; |
| |
| add_to_hash_table( |
| bytes_to_add as usize, |
| &mut insert_it, |
| &mut hash_it, |
| &mut hash_table, |
| ); |
| |
| // If the match is longer than the current window, we have note how many |
| // bytes we overlap, since we don't need to do any matching on these bytes |
| // in the next call of this function. |
| // We don't have to worry about setting overlap to 0 if this is false, as the |
| // function will stop after this condition is true, and overlap is not altered |
| // elsewhere. |
| if position + prev_length as usize > end { |
| // We need to subtract 1 since the byte at pos is also included. |
| overlap = position + prev_length as usize - end - 1; |
| }; |
| |
| state.add = false; |
| |
| // Note that there is no current match. |
| state.current_length = 0; |
| state.current_distance = 0; |
| |
| if let BufferStatus::Full = b_status { |
| // MATCH(lazy) |
| return (overlap, buffer_full(position + prev_length as usize - 1)); |
| } |
| |
| ignore_next = false; |
| |
| } else if state.add { |
| // We found a better match (or there was no previous match) |
| // so output the previous byte. |
| // BETTER OR NO MATCH |
| write_literal!(writer, state.prev_byte, position + 1); |
| } else { |
| state.add = true |
| } |
| |
| prev_length = state.current_length; |
| prev_distance = state.current_distance; |
| state.prev_byte = b; |
| } else { |
| // If there is a match at this point, it will not have been added, so we need to add it. |
| if prev_length >= MIN_MATCH as u16 { |
| let b_status = |
| writer.write_length_distance(prev_length as u16, prev_distance as u16); |
| |
| state.current_length = 0; |
| state.current_distance = 0; |
| state.add = false; |
| |
| // As this will be a 3-length match at the end of the input data, there can't be any |
| // overlap. |
| // TODO: Not sure if we need to signal that the buffer is full here. |
| // It's only needed in the case of syncing. |
| if let BufferStatus::Full = b_status { |
| // TODO: These bytes should be hashed when doing a sync flush. |
| // This can't be done here as the new input data does not exist yet. |
| return (0, buffer_full(end)); |
| } else { |
| return (0, ProcessStatus::Ok); |
| } |
| }; |
| |
| if state.add { |
| // We may still have a leftover byte at this point, so we add it here if needed. |
| state.add = false; |
| |
| // ADD |
| write_literal!(writer, state.prev_byte, position + 1); |
| |
| }; |
| |
| // We are at the last two bytes we want to add, so there is no point |
| // searching for matches here. |
| |
| // AFTER ADD |
| write_literal!(writer, b, position + 1); |
| } |
| } |
| (overlap, ProcessStatus::Ok) |
| } |
| |
| fn process_chunk_greedy( |
| data: &[u8], |
| iterated_data: &Range<usize>, |
| mut hash_table: &mut ChainedHashTable, |
| writer: &mut DynamicWriter, |
| max_hash_checks: u16, |
| ) -> (usize, ProcessStatus) { |
| |
| let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data); |
| |
| const NO_LENGTH: usize = 0; |
| |
| // The number of bytes past end that was added due to finding a match that extends into |
| // the lookahead window. |
| let mut overlap = 0; |
| |
| // Iterate through the slice, adding literals or length/distance pairs. |
| while let Some((position, &b)) = insert_it.next() { |
| if let Some(&hash_byte) = hash_it.next() { |
| hash_table.add_hash_value(position, hash_byte); |
| |
| // TODO: This should be cleaned up a bit. |
| let (match_len, match_dist) = |
| { longest_match(data, hash_table, position, NO_LENGTH, max_hash_checks) }; |
| |
| if match_len >= MIN_MATCH as usize && !match_too_far(match_len, match_dist) { |
| // Casting note: length and distance is already bounded by the longest match |
| // function. Usize is just used for convenience. |
| let b_status = writer.write_length_distance(match_len as u16, match_dist as u16); |
| |
| // We add the bytes to the hash table and checksum. |
| // Since we've already added one of them, we need to add one less than |
| // the length. |
| let bytes_to_add = match_len - 1; |
| add_to_hash_table( |
| bytes_to_add, |
| &mut insert_it, |
| &mut hash_it, |
| &mut hash_table, |
| ); |
| |
| // If the match is longer than the current window, we have note how many |
| // bytes we overlap, since we don't need to do any matching on these bytes |
| // in the next call of this function. |
| if position + match_len > end { |
| // We need to subtract 1 since the byte at pos is also included. |
| overlap = position + match_len - end; |
| }; |
| |
| if let BufferStatus::Full = b_status { |
| // MATCH |
| return (overlap, buffer_full(position + match_len)); |
| } |
| |
| } else { |
| // NO MATCH |
| write_literal!(writer, b, position + 1); |
| } |
| } else { |
| // We are at the last two bytes we want to add, so there is no point |
| // searching for matches here. |
| // END |
| write_literal!(writer, b, position + 1); |
| } |
| } |
| (overlap, ProcessStatus::Ok) |
| } |
| |
| |
| #[derive(Eq, PartialEq, Clone, Copy, Debug)] |
| pub enum LZ77Status { |
| /// Waiting for more input before doing any processing |
| NeedInput, |
| /// The output buffer is full, so the current block needs to be ended so the |
| /// buffer can be flushed. |
| EndBlock, |
| /// All pending data has been processed. |
| Finished, |
| } |
| |
| #[cfg(test)] |
| pub fn lz77_compress_block_finish( |
| data: &[u8], |
| state: &mut LZ77State, |
| buffer: &mut InputBuffer, |
| mut writer: &mut DynamicWriter, |
| ) -> (usize, LZ77Status) { |
| let (consumed, status, _) = |
| lz77_compress_block(data, state, buffer, &mut writer, Flush::Finish); |
| (consumed, status) |
| } |
| |
| /// Compress a slice with lz77 compression. |
| /// |
| /// This function processes one window at a time, and returns when there is no input left, |
| /// or it determines it's time to end a block. |
| /// |
| /// Returns the number of bytes of the input that were consumed, a status describing |
| /// whether there is no input, it's time to finish, or it's time to end the block, and the position |
| /// of the first byte in the input buffer that has not been output (but may have been checked for |
| /// matches). |
| pub fn lz77_compress_block( |
| data: &[u8], |
| state: &mut LZ77State, |
| buffer: &mut InputBuffer, |
| mut writer: &mut DynamicWriter, |
| flush: Flush, |
| ) -> (usize, LZ77Status, usize) { |
| // Currently we only support the maximum window size |
| let window_size = DEFAULT_WINDOW_SIZE; |
| |
| // Indicates whether we should try to process all the data including the lookahead, or if we |
| // should wait until we have at least one window size of data before doing anything. |
| let finish = flush == Flush::Finish || flush == Flush::Sync; |
| let sync = flush == Flush::Sync; |
| |
| let mut current_position = 0; |
| |
| // The current status of the encoding. |
| let mut status = LZ77Status::EndBlock; |
| |
| // Whether warm up the hash chain with the two first values. |
| let mut add_initial = true; |
| |
| // If we have synced, add the two first bytes to the hash as they couldn't be added before. |
| if state.was_synced { |
| if buffer.current_end() > 2 { |
| let pos_add = buffer.current_end() - 2; |
| for (n, &b) in data.iter().take(2).enumerate() { |
| state.hash_table.add_hash_value(n + pos_add, b); |
| } |
| add_initial = false; |
| } |
| state.was_synced = false; |
| } |
| |
| // Add data to the input buffer and keep a reference to the slice of data not added yet. |
| let mut remaining_data = buffer.add_data(data); |
| |
| loop { |
| // Note if there is a pending byte from the previous call to process_chunk, |
| // so we get the block input size right. |
| let pending_previous = state.pending_byte_as_num(); |
| |
| assert!(writer.buffer_length() <= (window_size * 2)); |
| // The process is a bit different for the first 32k bytes. |
| // TODO: There is a lot of duplicate code between the two branches here, we should be able |
| // to simplify this. |
| if state.is_first_window { |
| // Don't do anything until we are either flushing, or we have at least one window of |
| // data. |
| if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish { |
| |
| if buffer.get_buffer().len() >= 2 && add_initial && |
| state.current_block_input_bytes == 0 |
| { |
| let b = buffer.get_buffer(); |
| // Warm up the hash with the two first values, so we can find matches at |
| // index 0. |
| state.hash_table.add_initial_hash_values(b[0], b[1]); |
| add_initial = false; |
| } |
| |
| let first_chunk_end = cmp::min(window_size, buffer.current_end()); |
| |
| let start = state.overlap; |
| |
| let (overlap, p_status) = process_chunk( |
| buffer.get_buffer(), |
| &(start..first_chunk_end), |
| &mut state.match_state, |
| &mut state.hash_table, |
| &mut writer, |
| state.max_hash_checks, |
| state.lazy_if_less_than as usize, |
| state.matching_type, |
| ); |
| |
| state.overlap = overlap; |
| state.bytes_to_hash = overlap; |
| |
| // If the buffer is full, we want to end the block. |
| if let ProcessStatus::BufferFull(written) = p_status { |
| state.overlap = if overlap > 0 { overlap } else { written }; |
| status = LZ77Status::EndBlock; |
| current_position = written - state.pending_byte_as_num(); |
| state.current_block_input_bytes += |
| (written - start + overlap + pending_previous - |
| state.pending_byte_as_num()) as u64; |
| break; |
| } |
| |
| |
| // Update the length of how many input bytes the current block is representing, |
| // taking into account pending bytes. |
| state.current_block_input_bytes += |
| (first_chunk_end - start + overlap + pending_previous - |
| state.pending_byte_as_num()) as u64; |
| |
| // We are at the first window so we don't need to slide the hash table yet. |
| // If finishing or syncing, we stop here. |
| if first_chunk_end >= buffer.current_end() && finish { |
| current_position = first_chunk_end - state.pending_byte_as_num(); |
| if !sync { |
| state.set_last(); |
| state.is_first_window = false; |
| } else { |
| state.overlap = first_chunk_end; |
| state.was_synced = true; |
| } |
| debug_assert!( |
| !state.pending_byte(), |
| "Bug! Ended compression wit a pending byte!" |
| ); |
| status = LZ77Status::Finished; |
| break; |
| } |
| // Otherwise, continue. |
| state.is_first_window = false; |
| } else { |
| status = LZ77Status::NeedInput; |
| break; |
| } |
| } else if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish { |
| if buffer.current_end() >= window_size + 2 { |
| for (n, &h) in buffer.get_buffer()[window_size + 2..] |
| .iter() |
| .enumerate() |
| .take(state.bytes_to_hash) |
| { |
| state.hash_table.add_hash_value(window_size + n, h); |
| } |
| state.bytes_to_hash = 0; |
| } |
| // This isn't the first chunk, so we start reading at one window in in the |
| // buffer plus any additional overlap from earlier. |
| let start = window_size + state.overlap; |
| |
| // Determine where we have to stop iterating to slide the buffer and hash, |
| // or stop because we are at the end of the input data. |
| let end = cmp::min(window_size * 2, buffer.current_end()); |
| |
| let (overlap, p_status) = process_chunk( |
| buffer.get_buffer(), |
| &(start..end), |
| &mut state.match_state, |
| &mut state.hash_table, |
| &mut writer, |
| state.max_hash_checks, |
| state.lazy_if_less_than as usize, |
| state.matching_type, |
| ); |
| |
| state.bytes_to_hash = overlap; |
| |
| |
| if let ProcessStatus::BufferFull(written) = p_status { |
| state.current_block_input_bytes += |
| (written - start + pending_previous - state.pending_byte_as_num()) as u64; |
| |
| // If the buffer is full, return and end the block. |
| // If overlap is non-zero, the buffer was full after outputting the last byte, |
| // otherwise we have to skip to the point in the buffer where we stopped in the |
| // next call. |
| state.overlap = if overlap > 0 { |
| // If we are at the end of the window, make sure we slide the buffer and the |
| // hash table. |
| if state.max_hash_checks > 0 { |
| state.hash_table.slide(window_size); |
| } |
| remaining_data = buffer.slide(remaining_data.unwrap_or(&[])); |
| overlap |
| } else { |
| written - window_size |
| }; |
| |
| current_position = written - state.pending_byte_as_num(); |
| |
| |
| // Status is already EndBlock at this point. |
| // status = LZ77Status::EndBlock; |
| break; |
| } |
| |
| state.current_block_input_bytes += |
| (end - start + overlap + pending_previous - state.pending_byte_as_num()) as u64; |
| |
| // The buffer is not full, but we still need to note if there is any overlap into the |
| // next window. |
| state.overlap = overlap; |
| |
| if remaining_data.is_none() && finish && end == buffer.current_end() { |
| current_position = buffer.current_end(); |
| debug_assert!( |
| !state.pending_byte(), |
| "Bug! Ended compression wit a pending byte!" |
| ); |
| |
| // We stopped before or at the window size, so we are at the end. |
| if !sync { |
| // If we are finishing and not syncing, we simply indicate that we are done. |
| state.set_last(); |
| } else { |
| // For sync flushing we need to slide the buffer and the hash chains so that the |
| // next call to this function starts at the right place. |
| |
| // There won't be any overlap, since when syncing, we process to the end of the. |
| // pending data. |
| state.overlap = buffer.current_end() - window_size; |
| state.was_synced = true; |
| } |
| status = LZ77Status::Finished; |
| break; |
| } else { |
| // We are not at the end, so slide and continue. |
| // We slide the hash table back to make space for new hash values |
| // We only need to remember 2^15 bytes back (the maximum distance allowed by the |
| // deflate spec). |
| if state.max_hash_checks > 0 { |
| state.hash_table.slide(window_size); |
| } |
| |
| // Also slide the buffer, discarding data we no longer need and adding new data. |
| remaining_data = buffer.slide(remaining_data.unwrap_or(&[])); |
| } |
| } else { |
| // The caller has not indicated that they want to finish or flush, and there is less |
| // than a window + lookahead of new data, so we wait for more. |
| status = LZ77Status::NeedInput; |
| break; |
| } |
| |
| } |
| |
| ( |
| data.len() - remaining_data.unwrap_or(&[]).len(), |
| status, |
| current_position, |
| ) |
| } |
| |
| #[cfg(test)] |
| pub fn decompress_lz77(input: &[LZValue]) -> Vec<u8> { |
| decompress_lz77_with_backbuffer(input, &[]) |
| } |
| |
| #[cfg(test)] |
| pub fn decompress_lz77_with_backbuffer(input: &[LZValue], back_buffer: &[u8]) -> Vec<u8> { |
| let mut output = Vec::new(); |
| for p in input { |
| match p.value() { |
| LZType::Literal(l) => output.push(l), |
| LZType::StoredLengthDistance(l, d) => { |
| // We found a match, so we have to get the data that the match refers to. |
| let d = d as usize; |
| let prev_output_len = output.len(); |
| // Get match data from the back buffer if the match extends that far. |
| let consumed = if d > output.len() { |
| let into_back_buffer = d - output.len(); |
| |
| assert!( |
| into_back_buffer <= back_buffer.len(), |
| "ERROR: Attempted to refer to a match in non-existing data!\ |
| into_back_buffer: {}, back_buffer len {}, d {}, l {:?}", |
| into_back_buffer, |
| back_buffer.len(), |
| d, |
| l |
| ); |
| let start = back_buffer.len() - into_back_buffer; |
| let end = cmp::min(back_buffer.len(), start + l.actual_length() as usize); |
| output.extend_from_slice(&back_buffer[start..end]); |
| |
| end - start |
| } else { |
| 0 |
| }; |
| |
| // Get match data from the currently decompressed data. |
| let start = prev_output_len.saturating_sub(d); |
| let mut n = 0; |
| while n < (l.actual_length() as usize).saturating_sub(consumed) { |
| let b = output[start + n]; |
| output.push(b); |
| n += 1; |
| } |
| } |
| } |
| } |
| output |
| } |
| |
| #[cfg(test)] |
| pub struct TestStruct { |
| state: LZ77State, |
| buffer: InputBuffer, |
| writer: DynamicWriter, |
| } |
| |
| #[cfg(test)] |
| impl TestStruct { |
| fn new() -> TestStruct { |
| TestStruct::with_config( |
| HIGH_MAX_HASH_CHECKS, |
| HIGH_LAZY_IF_LESS_THAN, |
| MatchingType::Lazy, |
| ) |
| } |
| |
| fn with_config( |
| max_hash_checks: u16, |
| lazy_if_less_than: u16, |
| matching_type: MatchingType, |
| ) -> TestStruct { |
| TestStruct { |
| state: LZ77State::new(max_hash_checks, lazy_if_less_than, matching_type), |
| buffer: InputBuffer::empty(), |
| writer: DynamicWriter::new(), |
| } |
| } |
| |
| fn compress_block(&mut self, data: &[u8], flush: bool) -> (usize, LZ77Status, usize) { |
| lz77_compress_block( |
| data, |
| &mut self.state, |
| &mut self.buffer, |
| &mut self.writer, |
| if flush { Flush::Finish } else { Flush::None }, |
| ) |
| } |
| } |
| |
| #[cfg(test)] |
| pub fn lz77_compress(data: &[u8]) -> Option<Vec<LZValue>> { |
| lz77_compress_conf( |
| data, |
| HIGH_MAX_HASH_CHECKS, |
| HIGH_LAZY_IF_LESS_THAN, |
| MatchingType::Lazy, |
| ) |
| } |
| |
| /// Compress a slice, not storing frequency information |
| /// |
| /// This is a convenience function for compression with fixed huffman values |
| /// Only used in tests for now |
| #[cfg(test)] |
| pub fn lz77_compress_conf( |
| data: &[u8], |
| max_hash_checks: u16, |
| lazy_if_less_than: u16, |
| matching_type: MatchingType, |
| ) -> Option<Vec<LZValue>> { |
| let mut test_boxed = Box::new(TestStruct::with_config( |
| max_hash_checks, |
| lazy_if_less_than, |
| matching_type, |
| )); |
| let mut out = Vec::<LZValue>::with_capacity(data.len() / 3); |
| { |
| let test = test_boxed.as_mut(); |
| let mut slice = data; |
| |
| while !test.state.is_last_block { |
| let bytes_written = lz77_compress_block_finish( |
| slice, |
| &mut test.state, |
| &mut test.buffer, |
| &mut test.writer, |
| ).0; |
| slice = &slice[bytes_written..]; |
| out.extend(test.writer.get_buffer()); |
| test.writer.clear(); |
| } |
| |
| } |
| |
| Some(out) |
| } |
| |
| #[cfg(test)] |
| mod test { |
| use super::*; |
| |
| use lzvalue::{LZValue, LZType, lit, ld}; |
| use chained_hash_table::WINDOW_SIZE; |
| use compression_options::DEFAULT_LAZY_IF_LESS_THAN; |
| use test_utils::get_test_data; |
| use output_writer::MAX_BUFFER_LENGTH; |
| |
| /// Helper function to print the output from the lz77 compression function |
| fn print_output(input: &[LZValue]) { |
| let mut output = vec![]; |
| for l in input { |
| match l.value() { |
| LZType::Literal(l) => output.push(l), |
| LZType::StoredLengthDistance(l, d) => { |
| output.extend(format!("<L {}>", l.actual_length()).into_bytes()); |
| output.extend(format!("<D {}>", d).into_bytes()) |
| } |
| } |
| } |
| |
| println!("\"{}\"", String::from_utf8(output).unwrap()); |
| } |
| |
| /// Test that a short string from an example on SO compresses correctly |
| #[test] |
| fn compress_short() { |
| let test_bytes = String::from("Deflate late").into_bytes(); |
| let res = lz77_compress(&test_bytes).unwrap(); |
| |
| let decompressed = decompress_lz77(&res); |
| |
| assert_eq!(test_bytes, decompressed); |
| assert_eq!(*res.last().unwrap(), LZValue::length_distance(4, 5)); |
| } |
| |
| /// Test that compression is working for a longer file |
| #[test] |
| fn compress_long() { |
| let input = get_test_data(); |
| let compressed = lz77_compress(&input).unwrap(); |
| assert!(compressed.len() < input.len()); |
| |
| let decompressed = decompress_lz77(&compressed); |
| println!("compress_long length: {}", input.len()); |
| |
| // This is to check where the compression fails, if it were to |
| for (n, (&a, &b)) in input.iter().zip(decompressed.iter()).enumerate() { |
| if a != b { |
| println!("First difference at {}, input: {}, output: {}", n, a, b); |
| break; |
| } |
| } |
| assert_eq!(input.len(), decompressed.len()); |
| assert!(&decompressed == &input); |
| } |
| |
| /// Check that lazy matching is working as intended |
| #[test] |
| fn lazy() { |
| // We want to match on `badger` rather than `nba` as it is longer |
| // let data = b" nba nbadg badger nbadger"; |
| let data = b"nba badger nbadger"; |
| let compressed = lz77_compress(data).unwrap(); |
| let test = compressed[compressed.len() - 1]; |
| if let LZType::StoredLengthDistance(l, _) = test.value() { |
| assert_eq!(l.actual_length(), 6); |
| } else { |
| print_output(&compressed); |
| panic!(); |
| } |
| } |
| |
| fn roundtrip(data: &[u8]) { |
| let compressed = super::lz77_compress(&data).unwrap(); |
| let decompressed = decompress_lz77(&compressed); |
| assert!(decompressed == data); |
| } |
| |
| // Check that data with the exact window size is working properly |
| #[test] |
| #[allow(unused)] |
| fn exact_window_size() { |
| use std::io::Write; |
| let mut data = vec![0; WINDOW_SIZE]; |
| roundtrip(&data); |
| { |
| data.write(&[22; WINDOW_SIZE]); |
| } |
| roundtrip(&data); |
| { |
| data.write(&[55; WINDOW_SIZE]); |
| } |
| roundtrip(&data); |
| } |
| |
| /// Test that matches at the window border are working correctly |
| #[test] |
| fn border() { |
| use chained_hash_table::WINDOW_SIZE; |
| let mut data = vec![35; WINDOW_SIZE]; |
| data.extend(b"Test"); |
| let compressed = super::lz77_compress(&data).unwrap(); |
| assert!(compressed.len() < data.len()); |
| let decompressed = decompress_lz77(&compressed); |
| |
| assert_eq!(decompressed.len(), data.len()); |
| assert!(decompressed == data); |
| } |
| |
| #[test] |
| fn border_multiple_blocks() { |
| use chained_hash_table::WINDOW_SIZE; |
| let mut data = vec![0; (WINDOW_SIZE * 2) + 50]; |
| data.push(1); |
| let compressed = super::lz77_compress(&data).unwrap(); |
| assert!(compressed.len() < data.len()); |
| let decompressed = decompress_lz77(&compressed); |
| assert_eq!(decompressed.len(), data.len()); |
| assert!(decompressed == data); |
| } |
| |
| #[test] |
| fn compress_block_status() { |
| use input_buffer::InputBuffer; |
| |
| let data = b"Test data data"; |
| let mut writer = DynamicWriter::new(); |
| |
| let mut buffer = InputBuffer::empty(); |
| let mut state = LZ77State::new(4096, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy); |
| let status = lz77_compress_block_finish(data, &mut state, &mut buffer, &mut writer); |
| assert_eq!(status.1, LZ77Status::Finished); |
| assert!(&buffer.get_buffer()[..data.len()] == data); |
| assert_eq!(buffer.current_end(), data.len()); |
| } |
| |
| #[test] |
| fn compress_block_multiple_windows() { |
| use input_buffer::InputBuffer; |
| use output_writer::DynamicWriter; |
| |
| let data = get_test_data(); |
| assert!(data.len() > (WINDOW_SIZE * 2) + super::MAX_MATCH); |
| let mut writer = DynamicWriter::new(); |
| |
| let mut buffer = InputBuffer::empty(); |
| let mut state = LZ77State::new(0, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy); |
| let (bytes_consumed, status) = |
| lz77_compress_block_finish(&data, &mut state, &mut buffer, &mut writer); |
| assert_eq!( |
| buffer.get_buffer().len(), |
| (WINDOW_SIZE * 2) + super::MAX_MATCH |
| ); |
| assert_eq!(status, LZ77Status::EndBlock); |
| let buf_len = buffer.get_buffer().len(); |
| assert!(buffer.get_buffer()[..] == data[..buf_len]); |
| |
| writer.clear(); |
| let (_, status) = lz77_compress_block_finish( |
| &data[bytes_consumed..], |
| &mut state, |
| &mut buffer, |
| &mut writer, |
| ); |
| assert_eq!(status, LZ77Status::EndBlock); |
| |
| } |
| |
| #[test] |
| fn multiple_inputs() { |
| let data = b"Badger badger bababa test data 25 asfgestghresjkgh"; |
| let comp1 = lz77_compress(data).unwrap(); |
| let comp2 = { |
| const SPLIT: usize = 25; |
| let first_part = &data[..SPLIT]; |
| let second_part = &data[SPLIT..]; |
| let mut state = TestStruct::new(); |
| let (bytes_written, status, _) = state.compress_block(first_part, false); |
| assert_eq!(bytes_written, first_part.len()); |
| assert_eq!(status, LZ77Status::NeedInput); |
| let (bytes_written, status, _) = state.compress_block(second_part, true); |
| assert_eq!(bytes_written, second_part.len()); |
| assert_eq!(status, LZ77Status::Finished); |
| Vec::from(state.writer.get_buffer()) |
| }; |
| assert!(comp1 == comp2); |
| } |
| |
| |
| #[test] |
| /// Test that the exit from process_chunk when buffer is full is working correctly. |
| fn buffer_fill() { |
| let data = get_test_data(); |
| // The comments above these calls refers the positions with the |
| // corersponding comments in process_chunk_{greedy/lazy}. |
| // POS BETTER OR NO MATCH |
| buffer_test_literals(&data); |
| // POS END |
| // POS NO MATCH |
| buffer_test_last_bytes(MatchingType::Greedy, &data); |
| // POS ADD |
| // POS AFTER ADD |
| buffer_test_last_bytes(MatchingType::Lazy, &data); |
| |
| // POS MATCH |
| buffer_test_match(MatchingType::Greedy); |
| // POS MATCH(lazy) |
| buffer_test_match(MatchingType::Lazy); |
| |
| // POS END |
| buffer_test_add_end(&data); |
| } |
| |
| /// Test buffer fill when a byte is added due to no match being found. |
| fn buffer_test_literals(data: &[u8]) { |
| let mut state = TestStruct::with_config(0, NO_RLE, MatchingType::Lazy); |
| let (bytes_consumed, status, position) = state.compress_block(&data, false); |
| |
| // There should be enough data for the block to have ended. |
| assert_eq!(status, LZ77Status::EndBlock); |
| assert!(bytes_consumed <= (WINDOW_SIZE * 2) + MAX_MATCH); |
| |
| // The buffer should be full. |
| assert_eq!(state.writer.get_buffer().len(), MAX_BUFFER_LENGTH); |
| assert_eq!(position, state.writer.get_buffer().len()); |
| // Since all literals have been input, the block should have the exact number of litlens |
| // as there were input bytes. |
| assert_eq!( |
| state.state.current_block_input_bytes() as usize, |
| MAX_BUFFER_LENGTH |
| ); |
| state.state.reset_input_bytes(); |
| |
| let mut out = decompress_lz77(state.writer.get_buffer()); |
| |
| state.writer.clear(); |
| // The buffer should now be cleared. |
| assert_eq!(state.writer.get_buffer().len(), 0); |
| |
| assert!(data[..out.len()] == out[..]); |
| |
| let _ = state.compress_block(&data[bytes_consumed..], false); |
| // We should have some new data in the buffer at this point. |
| assert!(state.writer.get_buffer().len() > 0); |
| assert_eq!( |
| state.state.current_block_input_bytes() as usize, |
| MAX_BUFFER_LENGTH |
| ); |
| |
| out.extend_from_slice(&decompress_lz77(state.writer.get_buffer())); |
| assert!(data[..out.len()] == out[..]); |
| } |
| |
| /// Test buffer fill at the last two bytes that are not hashed. |
| fn buffer_test_last_bytes(matching_type: MatchingType, data: &[u8]) { |
| const BYTES_USED: usize = MAX_BUFFER_LENGTH; |
| assert!( |
| &data[..BYTES_USED] == |
| &decompress_lz77(&lz77_compress_conf( |
| &data[..BYTES_USED], |
| 0, |
| NO_RLE, |
| matching_type, |
| ).unwrap()) |
| [..] |
| ); |
| assert!( |
| &data[..BYTES_USED + 1] == |
| &decompress_lz77(&lz77_compress_conf( |
| &data[..BYTES_USED + 1], |
| 0, |
| NO_RLE, |
| matching_type, |
| ).unwrap()) |
| [..] |
| ); |
| } |
| |
| /// Test buffer fill when buffer is full at a match. |
| fn buffer_test_match(matching_type: MatchingType) { |
| // TODO: Also test this for the second block to make sure |
| // buffer is slid. |
| let mut state = TestStruct::with_config(1, 0, matching_type); |
| for _ in 0..MAX_BUFFER_LENGTH - 4 { |
| assert!(state.writer.write_literal(0) == BufferStatus::NotFull); |
| } |
| state.compress_block(&[1, 2, 3, 1, 2, 3, 4], true); |
| assert!(*state.writer.get_buffer().last().unwrap() == LZValue::length_distance(3, 3)); |
| |
| } |
| |
| /// Test buffer fill for the lazy match algorithm when adding a pending byte at the end. |
| fn buffer_test_add_end(_data: &[u8]) { |
| // This is disabled while the buffer size has not been stabilized. |
| /* |
| let mut state = TestStruct::with_config(DEFAULT_MAX_HASH_CHECKS, |
| DEFAULT_LAZY_IF_LESS_THAN, |
| MatchingType::Lazy); |
| // For the test file, this is how much data needs to be added to get the buffer |
| // full at the right spot to test that this buffer full exit is workong correctly. |
| for i in 0..31743 { |
| assert!(state.writer.write_literal(0) == BufferStatus::NotFull, "Buffer pos: {}", i); |
| } |
| |
| let dec = decompress_lz77(&state.writer.get_buffer()[pos..]); |
| assert!(dec.len() > 0); |
| assert!(dec[..] == data[..dec.len()]); |
| */ |
| } |
| |
| /// Check that decompressing lz77-data that refers to the back-buffer works. |
| #[test] |
| fn test_decompress_with_backbuffer() { |
| let bb = vec![5; WINDOW_SIZE]; |
| let lz_compressed = [lit(2), lit(4), ld(4, 20), lit(1), lit(1), ld(5, 10)]; |
| let dec = decompress_lz77_with_backbuffer(&lz_compressed, &bb); |
| |
| // ------------l2 l4 <-ld4,20-> l1 l1 <---ld5,10--> |
| assert!(dec == [2, 4, 5, 5, 5, 5, 1, 1, 5, 5, 2, 4, 5]); |
| } |
| |
| } |
| |
| #[cfg(all(test, feature = "benchmarks"))] |
| mod bench { |
| use test_std::Bencher; |
| use test_utils::get_test_data; |
| use super::lz77_compress; |
| #[bench] |
| fn test_file_zlib_lz77_only(b: &mut Bencher) { |
| let test_data = get_test_data(); |
| b.iter(|| lz77_compress(&test_data)); |
| } |
| } |