Skip to content

Commit

Permalink
switch to use only 3 kinds of hashtable
Browse files Browse the repository at this point in the history
use only hashtables with fixed sizes and bit shifts, which allows the
bounds checks to be removed.
  • Loading branch information
PSeitz committed Feb 6, 2023
1 parent 8032df4 commit 20bb1ab
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 149 deletions.
16 changes: 6 additions & 10 deletions src/block/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
//! high performance. It has fixed memory usage, which, contrary to other approaches, makes it less
//! memory hungry.

use crate::block::hashtable::get_table_size;
use crate::block::hashtable::HashTable;
use crate::block::hashtable::{HashTableU16, HashTableU32, HashTableUsize};
use crate::block::END_OFFSET;
use crate::block::LZ4_MIN_LENGTH;
use crate::block::MAX_DISTANCE;
Expand All @@ -18,6 +16,8 @@ use alloc::vec::Vec;
#[cfg(feature = "safe-encode")]
use core::convert::TryInto;

use super::hashtable::HashTable4KU16;
use super::hashtable::HashTable8K;
use super::{CompressError, WINDOW_SIZE};

pub(crate) fn get_vec_with_size(size: usize) -> Vec<u8> {
Expand Down Expand Up @@ -346,7 +346,7 @@ fn backtrack_match(
/// show significant improvement though.
// NOTE(review): this note described the previous `#[inline(never)]` attribute; it is
// outdated now that the attribute below was changed to `#[inline]` — update or remove it.
#[inline(never)]
#[inline]
pub(crate) fn compress_internal<T: HashTable, const USE_DICT: bool>(
input: &[u8],
input_pos: usize,
Expand Down Expand Up @@ -596,17 +596,13 @@ pub(crate) fn compress_into_sink_with_dict<const USE_DICT: bool>(
output: &mut SliceSink,
mut dict_data: &[u8],
) -> Result<usize, CompressError> {
let (dict_size, dict_bitshift) = get_table_size(input.len());
if dict_data.len() + input.len() < u16::MAX as usize {
let mut dict = HashTableU16::new(dict_size, dict_bitshift);
init_dict(&mut dict, &mut dict_data);
compress_internal::<_, USE_DICT>(input, 0, output, &mut dict, dict_data, dict_data.len())
} else if dict_data.len() + input.len() < u32::MAX as usize {
let mut dict = HashTableU32::new(dict_size, dict_bitshift);
let mut dict = HashTable4KU16::new();
init_dict(&mut dict, &mut dict_data);
compress_internal::<_, USE_DICT>(input, 0, output, &mut dict, dict_data, dict_data.len())
} else {
let mut dict = HashTableUsize::new(dict_size, dict_bitshift);
// For some reason using a 4K hashtable causes a performance regression (memory layout?)
let mut dict = HashTable8K::new();
init_dict(&mut dict, &mut dict_data);
compress_internal::<_, USE_DICT>(input, 0, output, &mut dict, dict_data, dict_data.len())
}
Expand Down
178 changes: 41 additions & 137 deletions src/block/hashtable.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
use alloc::boxed::Box;
use alloc::vec::Vec;
#[cfg(feature = "frame")]
use core::convert::TryInto;

/// The Hashtable trait used by the compression to store hashed bytes to their position.
Expand Down Expand Up @@ -51,64 +49,55 @@ pub trait HashTable {
}
}

const HASHTABLE_SIZE_4K: usize = 4 * 1024;
const HASHTABLE_BIT_SHIFT_4K: usize = 4;

#[derive(Debug)]
pub struct HashTableUsize {
dict: Vec<usize>,
/// Shift the hash value for the dictionary to the right, to match the dictionary size.
dict_bitshift: usize,
#[repr(align(64))]
pub struct HashTable4KU16 {
dict: Box<[u16; HASHTABLE_SIZE_4K]>,
}

impl HashTableUsize {
impl HashTable4KU16 {
#[inline]
pub fn new(dict_size: usize, dict_bitshift: usize) -> Self {
let dict = alloc::vec![0; dict_size];
Self {
dict,
dict_bitshift,
}
pub fn new() -> Self {
// This generates more efficient assembly in contrast to Box::new(slice), because of an
// optimized call alloc_zeroed, vs. alloc + memset
// try_into is optimized away
let dict = alloc::vec![0; HASHTABLE_SIZE_4K]
.into_boxed_slice()
.try_into()
.unwrap();
Self { dict }
}
}

impl HashTable for HashTableUsize {
#[inline]
#[cfg(feature = "safe-encode")]
fn get_at(&self, hash: usize) -> usize {
self.dict[hash >> self.dict_bitshift] as usize
}
impl HashTable for HashTable4KU16 {
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn get_at(&self, hash: usize) -> usize {
unsafe { *self.dict.get_unchecked(hash >> self.dict_bitshift) as usize }
}

#[inline]
#[cfg(feature = "safe-encode")]
fn put_at(&mut self, hash: usize, val: usize) {
self.dict[hash >> self.dict_bitshift] = val;
self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] as usize
}
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn put_at(&mut self, hash: usize, val: usize) {
(*unsafe { self.dict.get_unchecked_mut(hash >> self.dict_bitshift) }) = val;
self.dict[hash >> HASHTABLE_BIT_SHIFT_4K] = val as u16;
}

#[inline]
fn clear(&mut self) {
self.dict.fill(0);
}
#[inline]
fn get_hash_at(input: &[u8], pos: usize) -> usize {
hash(super::get_batch(input, pos)) as usize
}
}

const HASHTABLE_SIZE_4K: usize = 4 * 1024;
const HASHTABLE_BIT_SHIFT_4K: usize = 4;

#[derive(Debug)]
#[repr(align(64))]
#[cfg(feature = "frame")]
pub struct HashTable4K {
dict: Box<[u32; HASHTABLE_SIZE_4K]>,
}
#[cfg(feature = "frame")]
impl HashTable4K {
#[inline]
#[cfg(feature = "frame")]
pub fn new() -> Self {
let dict = alloc::vec![0; HASHTABLE_SIZE_4K]
.into_boxed_slice()
Expand All @@ -125,6 +114,7 @@ impl HashTable4K {
}
}
}
#[cfg(feature = "frame")]
impl HashTable for HashTable4K {
#[inline]
fn get_at(&self, hash: usize) -> usize {
Expand All @@ -140,122 +130,36 @@ impl HashTable for HashTable4K {
}
}

#[derive(Debug)]
#[repr(align(64))]
pub struct HashTableU32 {
dict: Vec<u32>,
/// Shift the hash value for the dictionary to the right, to match the dictionary size.
dict_bitshift: usize,
}
impl HashTableU32 {
#[inline]
pub fn new(dict_size: usize, dict_bitshift: usize) -> Self {
let dict = alloc::vec![0; dict_size];
Self {
dict,
dict_bitshift,
}
}
}
impl HashTable for HashTableU32 {
#[inline]
#[cfg(feature = "safe-encode")]
fn get_at(&self, hash: usize) -> usize {
self.dict[hash >> self.dict_bitshift] as usize
}
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn get_at(&self, hash: usize) -> usize {
unsafe { *self.dict.get_unchecked(hash >> self.dict_bitshift) as usize }
}
#[inline]
#[cfg(feature = "safe-encode")]
fn put_at(&mut self, hash: usize, val: usize) {
self.dict[hash >> self.dict_bitshift] = val as u32;
}
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn put_at(&mut self, hash: usize, val: usize) {
(*unsafe { self.dict.get_unchecked_mut(hash >> self.dict_bitshift) }) = val as u32;
}
#[inline]
fn clear(&mut self) {
self.dict.fill(0);
}
}
const HASHTABLE_SIZE_8K: usize = 8 * 1024;
const HASH_TABLE_BIT_SHIFT_8K: usize = 3;

#[derive(Debug)]
#[repr(align(64))]
pub struct HashTableU16 {
dict: Vec<u16>,
/// Shift the hash value for the dictionary to the right, to match the dictionary size.
dict_bitshift: usize,
pub struct HashTable8K {
dict: Box<[u32; HASHTABLE_SIZE_8K]>,
}
impl HashTableU16 {
impl HashTable8K {
#[inline]
pub fn new(dict_size: usize, dict_bitshift: usize) -> Self {
let dict = alloc::vec![0; dict_size];
Self {
dict,
dict_bitshift,
}
pub fn new() -> Self {
let dict = alloc::vec![0; HASHTABLE_SIZE_8K]
.into_boxed_slice()
.try_into()
.unwrap();

Self { dict }
}
}
impl HashTable for HashTableU16 {
#[inline]
#[cfg(feature = "safe-encode")]
fn get_at(&self, hash: usize) -> usize {
self.dict[hash >> self.dict_bitshift] as usize
}
impl HashTable for HashTable8K {
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn get_at(&self, hash: usize) -> usize {
unsafe { *self.dict.get_unchecked(hash >> self.dict_bitshift) as usize }
self.dict[hash >> HASH_TABLE_BIT_SHIFT_8K] as usize
}
#[inline]
#[cfg(feature = "safe-encode")]
fn put_at(&mut self, hash: usize, val: usize) {
self.dict[hash >> self.dict_bitshift] = val as u16;
}
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn put_at(&mut self, hash: usize, val: usize) {
(*unsafe { self.dict.get_unchecked_mut(hash >> self.dict_bitshift) }) = val as u16;
self.dict[hash >> HASH_TABLE_BIT_SHIFT_8K] = val as u32;
}
#[inline]
fn clear(&mut self) {
self.dict.fill(0);
}
#[inline]
fn get_hash_at(input: &[u8], pos: usize) -> usize {
hash(super::get_batch(input, pos)) as usize
}
}

#[inline]
pub fn get_table_size(input_len: usize) -> (usize, usize) {
let (dict_size, dict_bitshift) = match input_len {
// U16 Positions
0..=65535 => {
// Considering we want a table with up to 16K bytes and each slot takes 2 bytes.
// Calculate size the matching table size according to the input size,
// so the overhead of "zeroing" the table is not too large for small inputs.
let size = input_len.next_power_of_two().clamp(256, 16 * 1024) / 2;
(size, 16 - size.trailing_zeros() as usize)
}
// U32 positions => 16KB table
// Usize (U64) positions => 32KB table
_ => (4096, 4),
};
(dict_size, dict_bitshift)
}

#[test]
fn test_get_table_size() {
const MAX_HASH: usize = u16::MAX as usize;
for i in 0..32 {
let input_len = 2usize.pow(i);
let (size, shift) = get_table_size(input_len);
assert_eq!(size, (MAX_HASH >> shift) + 1);
}
}
19 changes: 17 additions & 2 deletions tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#[macro_use]
extern crate more_asserts;

use std::iter;

use lz4_compress::compress as lz4_rust_compress;
#[cfg(feature = "frame")]
use lz4_flex::frame::BlockMode;
Expand Down Expand Up @@ -158,6 +160,13 @@ fn test_minimum_compression_ratio() {
let ratio = compressed.len() as f64 / COMPRESSION34K.len() as f64;
assert_lt!(ratio, 0.585); // TODO check why compression is not deterministic (fails in ci for
// 0.58)
let compressed = compress(COMPRESSION65);
let ratio = compressed.len() as f64 / COMPRESSION65.len() as f64;
assert_lt!(ratio, 0.574);

let compressed = compress(COMPRESSION66JSON);
let ratio = compressed.len() as f64 / COMPRESSION66JSON.len() as f64;
assert_lt!(ratio, 0.229);
}

use lz_fear::raw::compress2;
Expand Down Expand Up @@ -407,6 +416,12 @@ fn buf_fuzz_5() {
test_roundtrip(data);
}

#[test]
fn test_so_many_zeros() {
let data: Vec<u8> = iter::repeat(0).take(30_000).collect();
test_roundtrip(data);
}

#[test]
fn compression_works() {
let s = r#"An iterator that knows its exact length.
Expand All @@ -432,9 +447,9 @@ fn compression_works() {
#[ignore]
#[test]
fn big_compression() {
let mut s = Vec::with_capacity(80_000000);
let mut s = Vec::with_capacity(80_000_000);

for n in 0..80_000000 {
for n in 0..80_000_000 {
s.push((n as u8).wrapping_mul(0xA).wrapping_add(33) ^ 0xA2);
}

Expand Down

0 comments on commit 20bb1ab

Please sign in to comment.