blob: fb2b0dfd6e5d3020be7ba82752312aecf173ad80 [file] [log] [blame]
//use deflate_state::DebugCounter;
use std::{mem, ptr};
pub const WINDOW_SIZE: usize = 32768;
pub const WINDOW_MASK: usize = WINDOW_SIZE - 1;
#[cfg(test)]
pub const HASH_BYTES: usize = 3;
const HASH_SHIFT: u16 = 5;
const HASH_MASK: u16 = WINDOW_MASK as u16;
/// Helper struct to let us allocate both head and prev in the same block.
struct Tables {
/// Starts of hash chains (in prev)
pub head: [u16; WINDOW_SIZE],
/// Link to previous occurence of this hash value
pub prev: [u16; WINDOW_SIZE],
}
impl Default for Tables {
#[inline]
fn default() -> Tables {
// # Unsafe
// This struct is not public and is only used in this module, and
// the values are immediately filled in after this struct is
// created.
unsafe {
Tables {
head: mem::uninitialized(),
prev: mem::uninitialized(),
}
}
}
}
impl Tables {
fn fill_prev(&mut self) {
assert_eq!(self.head.len(), self.prev.len());
// # Unsafe
//
// The arrays are created with the same length statically, so this should be safe.
// We use this rather than copy_from_slice as prev starts out unitialized.
unsafe {
ptr::copy_nonoverlapping(self.head.as_ptr(), self.prev.as_mut_ptr(), self.prev.len())
}
}
}
/// Create and box the hash chains.
fn create_tables() -> Box<Tables> {
// Using default here is a trick to get around the lack of box syntax on stable rust.
//
// Box::new([0u16,n]) ends up creating an temporary array on the stack which is not optimised
// but using default, which simply calls `box value` internally allows us to get around this.
//
// We could use vec instead, but using a boxed array helps the compiler optimise
// away bounds checks as `n & WINDOW_MASK < WINDOW_SIZE` will always be true.
let mut t: Box<Tables> = Box::default();
for (n, b) in t.head.iter_mut().enumerate() {
// # Unsafe
//
// Using ptr::write here since the values are uninitialised.
// u16 is a primitive and doesn't implement drop, so this would be safe either way.
unsafe {
ptr::write(b, n as u16);
}
}
t.fill_prev();
t
}
/// Returns a new hash value based on the previous value and the next byte
#[inline]
pub fn update_hash(current_hash: u16, to_insert: u8) -> u16 {
update_hash_conf(current_hash, to_insert, HASH_SHIFT, HASH_MASK)
}
#[inline]
fn update_hash_conf(current_hash: u16, to_insert: u8, shift: u16, mask: u16) -> u16 {
((current_hash << shift) ^ (to_insert as u16)) & mask
}
#[inline]
fn reset_array(arr: &mut [u16; WINDOW_SIZE]) {
for (n, b) in arr.iter_mut().enumerate() {
*b = n as u16;
}
}
pub struct ChainedHashTable {
// Current running hash value of the last 3 bytes
current_hash: u16,
// Hash chains.
c: Box<Tables>,
// Used for testing
// count: DebugCounter,
}
impl ChainedHashTable {
pub fn new() -> ChainedHashTable {
ChainedHashTable {
current_hash: 0,
c: create_tables(),
//count: DebugCounter::default(),
}
}
#[cfg(test)]
pub fn from_starting_values(v1: u8, v2: u8) -> ChainedHashTable {
let mut t = ChainedHashTable::new();
t.current_hash = update_hash(t.current_hash, v1);
t.current_hash = update_hash(t.current_hash, v2);
t
}
/// Resets the hash value and hash chains
pub fn reset(&mut self) {
self.current_hash = 0;
reset_array(&mut self.c.head);
{
let h = self.c.head;
let mut c = self.c.prev;
c[..].copy_from_slice(&h[..]);
}
/*if cfg!(debug_assertions) {
self.count.reset();
}*/
}
pub fn add_initial_hash_values(&mut self, v1: u8, v2: u8) {
self.current_hash = update_hash(self.current_hash, v1);
self.current_hash = update_hash(self.current_hash, v2);
}
/// Insert a byte into the hash table
#[inline]
pub fn add_hash_value(&mut self, position: usize, value: u8) {
// Check that all bytes are input in order and at the correct positions.
// Disabled for now as it breaks when sync flushing.
/*debug_assert_eq!(
position & WINDOW_MASK,
self.count.get() as usize & WINDOW_MASK
);*/
debug_assert!(
position < WINDOW_SIZE * 2,
"Position is larger than 2 * window size! {}",
position
);
// Storing the hash in a temporary variable here makes the compiler avoid the
// bounds checks in this function.
let new_hash = update_hash(self.current_hash, value);
self.add_with_hash(position, new_hash);
// Update the stored hash value with the new hash.
self.current_hash = new_hash;
}
/// Directly set the current hash value
#[inline]
pub fn set_hash(&mut self, hash: u16) {
self.current_hash = hash;
}
/// Update the tables directly, providing the hash.
#[inline]
pub fn add_with_hash(&mut self, position: usize, hash: u16) {
/*if cfg!(debug_assertions) {
self.count.add(1);
}*/
self.c.prev[position & WINDOW_MASK] = self.c.head[hash as usize];
// Ignoring any bits over 16 here is deliberate, as we only concern ourselves about
// where in the buffer (which is 64k bytes) we are referring to.
self.c.head[hash as usize] = position as u16;
}
// Get the head of the hash chain for the current hash value
#[cfg(test)]
#[inline]
pub fn current_head(&self) -> u16 {
self.c.head[self.current_hash as usize]
}
#[inline]
pub fn current_hash(&self) -> u16 {
self.current_hash
}
#[inline]
pub fn get_prev(&self, bytes: usize) -> u16 {
self.c.prev[bytes & WINDOW_MASK]
}
#[cfg(test)]
#[inline]
pub fn farthest_next(&self, match_pos: usize, match_len: usize) -> usize {
let to_check = match_len.saturating_sub(2);
let mut n = 0;
let mut smallest_prev =
self.get_prev(match_pos);
let mut smallest_pos = 0;
while n < to_check {
let prev =
self.get_prev(match_pos + n);
if prev < smallest_prev {
smallest_prev = prev;
smallest_pos = n;
}
n += 1;
}
smallest_pos
}
#[inline]
fn slide_value(b: u16, pos: u16, bytes: u16) -> u16 {
if b >= bytes {
b - bytes
} else {
pos
}
}
#[inline]
fn slide_table(table: &mut [u16; WINDOW_SIZE], bytes: u16) {
for (n, b) in table.iter_mut().enumerate() {
*b = ChainedHashTable::slide_value(*b, n as u16, bytes);
}
}
pub fn slide(&mut self, bytes: usize) {
/*if cfg!(debug_assertions) && bytes != WINDOW_SIZE {
// This should only happen in tests in this file.
self.count.reset();
}*/
ChainedHashTable::slide_table(&mut self.c.head, bytes as u16);
ChainedHashTable::slide_table(&mut self.c.prev, bytes as u16);
}
}
#[cfg(test)]
pub fn filled_hash_table(data: &[u8]) -> ChainedHashTable {
assert!(data.len() <= (WINDOW_SIZE * 2) + 2);
let mut hash_table = ChainedHashTable::from_starting_values(data[0], data[1]);
for (n, b) in data[2..].iter().enumerate() {
hash_table.add_hash_value(n, *b);
}
hash_table
}
#[cfg(test)]
mod test {
use super::{filled_hash_table, ChainedHashTable};
#[test]
fn chained_hash() {
use std::str;
let test_string = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
eiusmod tempor. rum. incididunt ut labore et dolore magna aliqua. Ut \
enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi \
ut aliquip ex ea commodo consequat. rum. Duis aute irure dolor in \
reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla \
pariatur. Excepteur sint occaecat cupidatat non proident, sunt in \
culpa qui officia deserunt mollit anim id est laborum.";
let test_data = test_string.as_bytes();
let current_bytes = &test_data[test_data.len() - super::HASH_BYTES..test_data.len()];
let num_iters = test_string
.matches(str::from_utf8(current_bytes).unwrap())
.count();
let hash_table = filled_hash_table(test_data);
// Test that the positions in the chain are valid
let mut prev_value = hash_table.get_prev(hash_table.current_head() as usize) as usize;
let mut count = 0;
let mut current = hash_table.current_head() as usize;
while current != prev_value {
count += 1;
current = prev_value;
prev_value = hash_table.get_prev(prev_value) as usize;
}
// There should be at least as many occurences of the hash of the checked bytes as the
// numbers of occurences of the checked bytes themselves. As the hashes are not large enough
// to store 8 * 3 = 24 bits, there could be more with different input data.
assert!(count >= num_iters);
}
#[test]
fn table_unique() {
let mut test_data = Vec::new();
test_data.extend(0u8..255);
test_data.extend(255u8..0);
let hash_table = filled_hash_table(&test_data);
let prev_pos = hash_table.get_prev(hash_table.current_head() as usize);
// Since all sequences in the input are unique, there shouldn't be any previous values.
assert_eq!(prev_pos, hash_table.current_hash());
}
#[test]
fn table_slide() {
use std::fs::File;
use std::io::Read;
let window_size = super::WINDOW_SIZE;
let window_size16 = super::WINDOW_SIZE as u16;
let mut input = Vec::new();
let mut f = File::open("tests/pg11.txt").unwrap();
f.read_to_end(&mut input).unwrap();
let mut hash_table = filled_hash_table(&input[..window_size + 2]);
for (n, b) in input[2..window_size + 2].iter().enumerate() {
hash_table.add_hash_value(n + window_size, *b);
}
hash_table.slide(window_size);
{
let max_head = hash_table.c.head.iter().max().unwrap();
// After sliding there should be no hashes referring to values
// higher than the window size
assert!(*max_head < window_size16);
assert!(*max_head > 0);
let pos = hash_table.get_prev(hash_table.current_head() as usize);
// There should be a previous occurence since we inserted the data 3 times
assert!(pos < window_size16);
assert!(pos > 0);
}
for (n, b) in input[2..(window_size / 2)].iter().enumerate() {
hash_table.add_hash_value(n + window_size, *b);
}
// There should hashes referring to values in the upper part of the input window
// at this point
let max_prev = hash_table.c.prev.iter().max().unwrap();
assert!(*max_prev > window_size16);
let mut pos = hash_table.current_head();
// There should be a previous occurence since we inserted the data 3 times
assert!(pos > window_size16);
let end_byte = input[(window_size / 2) - 1 - 2];
let mut iterations = 0;
while pos > window_size16 && iterations < 5000 {
assert_eq!(input[pos as usize & window_size - 1], end_byte);
pos = hash_table.get_prev(pos as usize);
iterations += 1;
}
}
#[test]
/// Ensure that the initial hash values are correct.
fn initial_chains() {
let t = ChainedHashTable::new();
for (n, &b) in t.c.head.iter().enumerate() {
assert_eq!(n, b as usize);
}
for (n, &b) in t.c.prev.iter().enumerate() {
assert_eq!(n, b as usize);
}
}
}