| // Copyright 2019 The Fuchsia Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| //! RFC 1071 "internet checksum" computation. |
| //! |
| //! This crate implements the "internet checksum" defined in [RFC 1071] and |
| //! updated in [RFC 1141] and [RFC 1624], which is used by many different |
| //! protocols' packet formats. The checksum operates by computing the 1s |
| //! complement of the 1s complement sum of successive 16-bit words of the input. |
| //! |
| //! SIMD acceleration is used on some platforms (currently, x86_64 with the avx2 |
| //! extensions). |
| //! |
| //! # Benchmarks |
| //! |
| //! The following microbenchmarks were performed on a 2018 Google Pixelbook. |
| //! Each benchmark constructs a [`Checksum`] object, calls |
| //! [`Checksum::add_bytes`] with an input of the given number of bytes, and then |
| //! calls [`Checksum::checksum`] to finalize. Benchmarks were performed with |
| //! SIMD both enabled and disabled. Average values were calculated over 3 |
| //! trials. |
| //! |
| //! Bytes | Time w/o SIMD | Rate w/o SIMD | Time w/ SIMD | Rate w/ SIMD | Ratio (time w / time w/o) |
| //! ----- | ------------- | ------------- | ------------ | ------------ | ------------------------- |
| //! 31 | 3,657 ns | 8.48 MB/s | 3,692 ns | 8.40 MB/s | 1.01 |
| //! 32 | 3,735 ns | 8.57 MB/s | 3,767 ns | 8.50 MB/s | 1.01 |
| //! 64 | 7,092 ns | 9.02 MB/s | 6,580 ns | 9.73 MB/s | 0.93 |
| //! 128 | 13,790 ns | 9.28 MB/s | 7,428 ns | 17.2 MB/s | 0.54 |
| //! 256 | 27,169 ns | 9.42 MB/s | 9,224 ns | 27.8 MB/s | 0.34 |
| //! 1024 | 107,609 ns | 9.52 MB/s | 20,071 ns | 51.0 MB/s | 0.19 |
| //! |
| //! [RFC 1071]: https://tools.ietf.org/html/rfc1071 |
| //! [RFC 1141]: https://tools.ietf.org/html/rfc1141 |
| //! [RFC 1624]: https://tools.ietf.org/html/rfc1624 |
| |
| // Optimizations applied: |
| // |
| // 0. Byteorder independence: as described in RFC 1071 section 2.(B) |
| // The sum of 16-bit integers can be computed in either byte order, |
| // so this actually saves us from the unnecessary byte swapping on |
| // an LE machine. As perfed on a gLinux workstation, that swapping |
| // can account for ~20% of the runtime. |
| // |
// 1. Widen the accumulator: doing so enables us to process a bigger
//    chunk of data at a time, achieving a kind of poor man's
//    SIMD. Currently a u128 accumulator is used on x86-64 and a u64 is
//    used conservatively on other architectures.
| // |
| // 2. Process more at a time: the old implementation uses a u32 accumulator |
| // but it only adds one u16 each time to implement deferred carry. In |
| // the current implementation we are processing a u128 once at a time |
| // on x86-64, which is 8 u16's. On other platforms, we are processing |
| // a u64 at a time, which is 4 u16's. |
| // |
| // 3. Induce the compiler to produce `adc` instruction: this is a very |
| // useful instruction to implement 1's complement addition and available |
| // on both x86 and ARM. The functions `adc_uXX` are for this use. |
| // |
| // 4. Eliminate branching as much as possible: the old implementation has |
| // if statements for detecting overflow of the u32 accumulator which |
| // is not needed when we can access the carry flag with `adc`. The old |
| // `normalize` function used to have a while loop to fold the u32, |
//    however, we can unroll that loop because we know ahead of time how
//    many additions we need.
| // |
| // 5. In the loop of `add_bytes`, the `adc_u64` is not used, instead, |
| // the `overflowing_add` is directly used. `adc_u64`'s carry flag |
| // comes from the current number being added while the slightly |
| // convoluted version in `add_bytes`, adding each number depends on |
| // the carry flag of the previous computation. I checked under release |
| // mode this issues 3 instructions instead of 4 for x86 and it should |
| // theoretically be beneficial, however, measurement showed me that it |
| // helps only a little. So this trick is not used for `update`. |
| // |
| // 6. When the input is small, fallback to deferred carry method. Deferred |
| // carry turns out to be very efficient when dealing with small buffers: |
| // If the input is small, the cost to deal with the tail may already |
| // outweigh the benefit of the unrolling itself. Some measurement |
| // confirms this theory. |
| // |
| // Results: |
| // |
// Micro-benchmarks are run on an x86-64 gLinux workstation. In summary,
// compared to baseline 0, which is prior to the byteorder independence
// patch, there is a ~4x speedup, and the current non-simd version is faster
// than the simd version of that baseline.
| // |
| // TODO: run this optimization on other platforms. I would expect |
| // the situation on ARM a bit different because I am not sure |
| // how much penalty there will be for misaligned read on ARM, or |
| // whether it is even supported (On x86 there is generally no |
| // penalty for misaligned read). If there will be penalties, we |
| // should consider alignment as an optimization opportunity on ARM. |
| |
| // TODO(joshlf): Right-justify the columns above |
| |
| #![cfg_attr(feature = "benchmark", feature(test))] |
| |
| #[cfg(all(test, feature = "benchmark"))] |
| extern crate test; |
| |
| #[cfg(target_arch = "x86_64")] |
| use core::arch::x86_64; |
| |
| use byteorder::{ByteOrder, NativeEndian}; |
| |
| // TODO(joshlf): |
| // - Investigate optimizations proposed in RFC 1071 Section 2. The most |
| // promising on modern hardware is probably (C) Parallel Summation, although |
| // that needs to be balanced against (1) Deferred Carries. Benchmarks will |
| // need to be performed to determine which is faster in practice, and under |
| // what scenarios. |
| |
| /// Compute the checksum of "bytes". |
| /// |
| /// `checksum(bytes)` is shorthand for: |
| /// |
| /// ```rust |
| /// # use internet_checksum::Checksum; |
| /// # let bytes = &[]; |
| /// # let _ = { |
| /// let mut c = Checksum::new(); |
| /// c.add_bytes(bytes); |
| /// c.checksum() |
| /// # }; |
| /// ``` |
| #[inline] |
| pub fn checksum(bytes: &[u8]) -> [u8; 2] { |
| let mut c = Checksum::new(); |
| c.add_bytes(bytes); |
| c.checksum() |
| } |
| |
// The wide accumulator type for the running sum: `u128` on x86-64 and a
// more conservative `u64` on other architectures (see optimization
// note 1 at the top of this file).
#[cfg(target_arch = "x86_64")]
type Accumulator = u128;
#[cfg(not(target_arch = "x86_64"))]
type Accumulator = u64;

/// The threshold for small buffers, if the buffer is too small,
/// fall back to the normal deferred carry method where a wide
/// accumulator is used but one `u16` is added once at a time.
// TODO: `64` works fine on x86_64, but this value may be different
// on other platforms.
const SMALL_BUF_THRESHOLD: usize = 64;
| |
/// The following macro unrolls operations on u16's to wider integers.
///
/// # Arguments
///
/// * `$arr` - The byte slice being processed.
/// * `$body` - The operation to operate on the wider integer. It should
///   be a macro because functions are not options here (the body needs
///   to mutate locals of the caller, such as the running sum).
///
///
/// This macro will choose the "wide integer" for you, on x86-64,
/// it will choose u128 as the "wide integer" and u64 anywhere else.
macro_rules! loop_unroll {
    (@inner $arr: ident, 16, $body:ident) => {
        // Consume the input 16 bytes (one u128) at a time; fewer than
        // 16 bytes remain afterwards, handled by `unroll_tail!`.
        while $arr.len() >= 16 {
            $body!(16, read_u128);
        }
        unroll_tail!($arr, 16, $body);
    };

    (@inner $arr: ident, 8, $body:ident) => {
        // Consume the input 8 bytes (one u64) at a time; fewer than
        // 8 bytes remain afterwards, handled by `unroll_tail!`.
        while $arr.len() >= 8 {
            $body!(8, read_u64);
        }
        unroll_tail!($arr, 8, $body);
    };

    // Entry point: select the chunk width for the target architecture.
    ($arr: ident, $body: ident) => {
        #[cfg(target_arch = "x86_64")]
        loop_unroll!(@inner $arr, 16, $body);
        #[cfg(not(target_arch = "x86_64"))]
        loop_unroll!(@inner $arr, 8, $body);
    };
}
| |
/// At the end of loop unrolling, we have to take care of bytes
/// that are left over. For example, `unroll_tail!(bytes, 4, body)`
/// expands to
/// ```
/// if bytes.len() & 2 != 0 {
///     body!(2, read_u16);
/// }
/// ```
macro_rules! unroll_tail {
    // Base case: process one `$n`-byte chunk if the `$n` bit is set in
    // the remaining length. (The remaining length is less than the
    // original unroll width, so each bit corresponds to at most one
    // chunk of that size.)
    ($arr: ident, $n: literal, $read: ident, $body: ident) => {
        if $arr.len() & $n != 0 {
            $body!($n, $read);
        }
    };

    ($arr: ident, 4, $body: ident) => {
        unroll_tail!($arr, 2, read_u16, $body);
    };

    // Recursive cases: peel off the largest possible chunk, then
    // recurse with the next smaller width.
    ($arr: ident, 8, $body: ident) => {
        unroll_tail!($arr, 4, read_u32, $body);
        unroll_tail!($arr, 4, $body);
    };

    ($arr: ident, 16, $body: ident) => {
        unroll_tail!($arr, 8, read_u64, $body);
        unroll_tail!($arr, 8, $body);
    };
}
| |
/// Updates bytes in an existing checksum.
///
/// `update` updates a checksum to reflect that the already-checksummed bytes
/// `old` have been updated to contain the values in `new`. It implements the
/// algorithm described in Equation 3 in [RFC 1624]. The first byte must be at
/// an even number offset in the original input. If an odd number offset byte
/// needs to be updated, the caller should simply include the preceding byte as
/// well. If an odd number of bytes is given, it is assumed that these are the
/// last bytes of the input. If an odd number of bytes in the middle of the
/// input needs to be updated, the preceding or following byte of the input
/// should be added to make an even number of bytes.
///
/// # Panics
///
/// `update` panics if `old.len() != new.len()`.
///
/// [RFC 1624]: https://tools.ietf.org/html/rfc1624
#[inline]
pub fn update(checksum: [u8; 2], old: &[u8], new: &[u8]) -> [u8; 2] {
    assert_eq!(old.len(), new.len());
    // We compute on the sum, not the one's complement of the sum. checksum
    // is the one's complement of the sum, so we need to get back to the
    // sum. Thus, we negate checksum.
    let mut sum = !NativeEndian::read_u16(&checksum[..]) as Accumulator;

    // First, process as much as we can with SIMD.
    let (mut old, mut new) = Checksum::update_simd(&mut sum, old, new);

    // Continue with the normal algorithm to finish up whatever we couldn't
    // process with SIMD.
    //
    // The first arm does the actual work: read one `$read`-width word
    // from both slices and apply RFC 1624 Equation 3 (add the complement
    // of the old word and then the new word). The second arm is the
    // shape expected by `loop_unroll!`/`unroll_tail!`: it forwards to
    // the first arm and then advances both slices by `$n` bytes.
    macro_rules! handle_chunk {
        ($read: ident, $old: expr, $new: expr) => {
            let o = NativeEndian::$read($old);
            let n = NativeEndian::$read($new);
            // RFC 1624 Eqn. 3
            sum = adc_accumulator(sum, !o as Accumulator);
            sum = adc_accumulator(sum, n as Accumulator);
        };
        ($n: literal, $read: ident) => {
            handle_chunk!($read, old, new);
            old = &old[$n..];
            new = &new[$n..];
        };
    }
    loop_unroll!(old, handle_chunk);

    // At most one byte can be left over; per the doc comment above, it
    // is treated as the last byte of the input, padded with a zero.
    if old.len() == 1 {
        handle_chunk!(read_u16, &[old[0], 0], &[new[0], 0]);
    }

    let mut cksum = [0u8; 2];
    NativeEndian::write_u16(&mut cksum[..], !normalize(sum));
    cksum
}
| |
/// RFC 1071 "internet checksum" computation.
///
/// `Checksum` implements the "internet checksum" defined in [RFC 1071] and
/// updated in [RFC 1141] and [RFC 1624], which is used by many different
/// protocols' packet formats. The checksum operates by computing the 1s
/// complement of the 1s complement sum of successive 16-bit words of the input.
///
/// [RFC 1071]: https://tools.ietf.org/html/rfc1071
/// [RFC 1141]: https://tools.ietf.org/html/rfc1141
/// [RFC 1624]: https://tools.ietf.org/html/rfc1624
#[derive(Default)]
pub struct Checksum {
    // The running (not yet complemented or folded) sum of the 16-bit
    // words added so far.
    sum: Accumulator,
    // Since odd-length inputs are treated specially, we store the trailing byte
    // for use in future calls to add_bytes(), and only treat it as a true
    // trailing byte in checksum().
    trailing_byte: Option<u8>,
}
| |
| impl Checksum { |
    /// Minimum number of bytes in a buffer to run the SIMD algorithm.
    ///
    /// Running the algorithm with less than `MIN_BYTES_FOR_SIMD` bytes will
    /// cause the benefits of SIMD to be dwarfed by the overhead (performing
    /// worse than the normal/non-SIMD algorithm). This value was chosen after
    /// several benchmarks which showed that the algorithm performed worse than
    /// the normal/non-SIMD algorithm when the number of bytes was less than 256.
    ///
    /// Consulted by `add_bytes_simd` and `update_simd` before dispatching
    /// to the avx2 implementations.
    // TODO: 256 may not perform the best on other platforms such as ARM.
    #[cfg(target_arch = "x86_64")]
    const MIN_BYTES_FOR_SIMD: usize = 256;
| |
| /// Initialize a new checksum. |
| #[inline] |
| pub const fn new() -> Self { |
| Checksum { sum: 0, trailing_byte: None } |
| } |
| |
    /// Add bytes to the checksum.
    ///
    /// If `bytes` does not contain an even number of bytes, a single zero byte
    /// will be added to the end before updating the checksum.
    ///
    /// Note that `add_bytes` has some fixed overhead regardless of the size of
    /// `bytes`. Additionally, SIMD optimizations are only available for inputs
    /// of a certain size. Where performance is a concern, prefer fewer calls to
    /// `add_bytes` with larger input over more calls with smaller input.
    #[inline]
    pub fn add_bytes(&mut self, mut bytes: &[u8]) {
        // Small inputs can't amortize the unrolling and tail handling
        // below; use the simple deferred-carry fallback instead.
        if bytes.len() < SMALL_BUF_THRESHOLD {
            self.add_bytes_small(bytes);
            return;
        }

        let mut sum = self.sum;
        let mut carry = false;

        // We are not using `adc_uXX` functions here, instead,
        // we manually track the carry flag. This is because
        // in `adc_uXX` functions, the carry flag depends on
        // addition itself. So the assembly for that function
        // reads as follows:
        //
        // mov %rdi, %rcx
        // mov %rsi, %rax
        // add %rcx, %rsi -- waste! only used to generate CF.
        // adc %rdi, $rax -- the real useful instruction.
        //
        // So we had better to make us depend on the CF generated
        // by the addition of the previous 16-bit word. The ideal
        // assembly should look like:
        // add 0(%rdi), %rax
        // adc 8(%rdi), %rax
        // adc 16(%rdi), %rax
        // .... and so on ...
        //
        // Sadly, there are too many instructions that can affect
        // the carry flag, and LLVM is not that optimized to find
        // out the pattern and let all these adc instructions not
        // interleaved. However, doing so results in 3 instructions
        // instead of the original 4 instructions (the two mov's are
        // still there) and it makes a difference on input size like
        // 1023. And measurements showed little improvement on the
        // update operation. Considering `update` is expected to be
        // used on small inputs, and for readability issues, this
        // trick is not employed there.

        // The following macro is used as a `body` when invoking a
        // `loop_unroll` macro. `$step` means how many bytes to handle
        // at once; `$read` is supposed to be `read_u16`, `read_u32`
        // and so on, it is used to get an unsigned integer of `$step`
        // width from a byte slice; `$bytes` is the byte slice mentioned
        // before, if omitted, it defaults to be `bytes`, which is the
        // argument of the surrounding function.
        macro_rules! update_sum_carry {
            ($step: literal, $read: ident, $bytes: expr) => {
                let (s, c) = sum.overflowing_add(NativeEndian::$read($bytes) as Accumulator);
                sum = s + (carry as Accumulator);
                carry = c;
                bytes = &bytes[$step..];
            };
            ($step: literal, $read: ident) => {
                update_sum_carry!($step, $read, bytes);
            };
        }

        // if there's a trailing byte, consume it first
        if let Some(byte) = self.trailing_byte {
            update_sum_carry!(1, read_u16, &[byte, bytes[0]]);
            self.trailing_byte = None;
        }

        // First, process as much as we can with SIMD.
        bytes = Self::add_bytes_simd(&mut sum, bytes);

        loop_unroll!(bytes, update_sum_carry);

        // At most one byte can remain; stash it for a future call to
        // `add_bytes` (or for `checksum` to pad with a zero byte).
        if bytes.len() == 1 {
            self.trailing_byte = Some(bytes[0]);
        }

        // Fold in the final deferred carry.
        self.sum = sum + (carry as Accumulator);
    }
| |
| /// The efficient fallback when the buffer is small. |
| /// |
| /// In this implementation, one `u16` is added once a |
| /// time, so we don't waste time on dealing with the |
| /// tail of the buffer. Besides, given that the accumulator |
| /// is large enough, when inputs are small, there should |
| /// hardly be overflows, so for any modern architecture, |
| /// there is little chance in misprediction. |
| // The inline attribute is needed here, micro benchmarks showed |
| // that it speeds up things. |
| #[inline(always)] |
| fn add_bytes_small(&mut self, mut bytes: &[u8]) { |
| if bytes.is_empty() { |
| return; |
| } |
| |
| let mut sum = self.sum; |
| fn update_sum(acc: Accumulator, rhs: u16) -> Accumulator { |
| if let Some(updated) = acc.checked_add(rhs as Accumulator) { |
| updated |
| } else { |
| (normalize(acc) + rhs) as Accumulator |
| } |
| } |
| |
| if let Some(byte) = self.trailing_byte { |
| sum = update_sum(sum, NativeEndian::read_u16(&[byte, bytes[0]])); |
| bytes = &bytes[1..]; |
| self.trailing_byte = None; |
| } |
| |
| while bytes.len() >= 2 { |
| sum = update_sum(sum, NativeEndian::read_u16(bytes)); |
| bytes = &bytes[2..]; |
| } |
| |
| if bytes.len() == 1 { |
| self.trailing_byte = Some(bytes[0]); |
| } |
| |
| self.sum = sum; |
| } |
| |
| /// Computes the checksum, but in big endian byte order. |
| fn checksum_inner(&self) -> u16 { |
| let mut sum = self.sum; |
| if let Some(byte) = self.trailing_byte { |
| sum = adc_accumulator(sum, NativeEndian::read_u16(&[byte, 0]) as Accumulator); |
| } |
| !normalize(sum) |
| } |
| |
| /// Computes the checksum, and returns the array representation. |
| /// |
| /// `checksum` returns the checksum of all data added using `add_bytes` so |
| /// far. Calling `checksum` does *not* reset the checksum. More bytes may be |
| /// added after calling `checksum`, and they will be added to the checksum |
| /// as expected. |
| /// |
| /// If an odd number of bytes have been added so far, the checksum will be |
| /// computed as though a single 0 byte had been added at the end in order to |
| /// even out the length of the input. |
| #[inline] |
| pub fn checksum(&self) -> [u8; 2] { |
| let mut cksum = [0u8; 2]; |
| NativeEndian::write_u16(&mut cksum[..], self.checksum_inner()); |
| cksum |
| } |
| |
    /// Adds bytes to a running sum using architecture specific SIMD
    /// instructions.
    ///
    /// `add_bytes_simd` updates `sum` with the sum of `bytes` using
    /// architecture-specific SIMD instructions. It may not process all bytes,
    /// and whatever bytes are not processed will be returned. If no
    /// implementation exists for the target architecture and run-time CPU
    /// features, or if `bytes` is shorter than `MIN_BYTES_FOR_SIMD`,
    /// `add_bytes_simd` does nothing and simply returns `bytes` directly.
    #[inline(always)]
    fn add_bytes_simd<'a>(sum: &mut Accumulator, bytes: &'a [u8]) -> &'a [u8] {
        #[cfg(target_arch = "x86_64")]
        {
            // SIMD only pays for itself on large inputs; see the doc on
            // `MIN_BYTES_FOR_SIMD`.
            if is_x86_feature_detected!("avx2") && bytes.len() >= Self::MIN_BYTES_FOR_SIMD {
                return unsafe { Self::add_bytes_x86_64(sum, bytes) };
            }
        }

        // Suppress unused variable warning when we don't compile the preceding
        // block.
        #[cfg(not(target_arch = "x86_64"))]
        let _ = sum;

        bytes
    }
| |
| /// Adds bytes to a running sum using x86_64's avx2 SIMD instructions. |
| /// |
| /// # Safety |
| /// |
| /// `add_bytes_x86_64` should never be called unless the run-time CPU |
| /// features include 'avx2'. If `add_bytes_x86_64` is called and the |
| /// run-time CPU features do not include 'avx2', it is considered undefined |
| /// behaviour. |
| #[cfg(target_arch = "x86_64")] |
| #[target_feature(enable = "avx2")] |
| unsafe fn add_bytes_x86_64<'a>(sum: &mut Accumulator, mut bytes: &'a [u8]) -> &'a [u8] { |
| // TODO(ghanan): Use safer alternatives to achieve SIMD algorithm once |
| // they stabilize. |
| |
| let zeros: x86_64::__m256i = x86_64::_mm256_setzero_si256(); |
| let mut c0: x86_64::__m256i; |
| let mut c1: x86_64::__m256i; |
| let mut acc: x86_64::__m256i; |
| let data: [u8; 32] = [0; 32]; |
| |
| while bytes.len() >= 32 { |
| let mut add_count: u32 = 0; |
| |
| // Reset accumulator. |
| acc = zeros; |
| |
| // We can do (2^16 + 1) additions to the accumulator (`acc`) that |
| // starts off at 0 without worrying about overflow. Since each |
| // iteration of this loop does 2 additions to the accumulator, |
| // `add_count` must be less than or equal to U16_MAX(= 2 ^ 16 - 1 = |
| // 2 ^ 16 - 1 - 2) to guarantee no overflows during this loop |
| // iteration. We know we can do 2^16 + 1 additions to the |
| // accumulator because we are using 32bit integers which can hold a |
| // max value of U32_MAX (2^32 - 1), and we are adding 16bit values |
| // with a max value of U16_MAX (2^16 - 1). U32_MAX = (U16_MAX << 16 |
| // + U16_MAX) = U16_MAX * (2 ^ 16 + 1) |
| while bytes.len() >= 32 && add_count <= u32::from(std::u16::MAX) { |
| // Load 32 bytes from memory (16 16bit values to add to `sum`) |
| // |
| // `_mm256_lddqu_si256` does not require the memory address to |
| // be aligned so remove the linter check for casting from a less |
| // strictly-aligned pointer to a more strictly-aligned pointer. |
| // https://doc.rust-lang.org/core/arch/x86_64/fn._mm256_lddqu_si256.html |
| #[allow(clippy::cast_ptr_alignment)] |
| { |
| c0 = x86_64::_mm256_lddqu_si256(bytes.as_ptr() as *const x86_64::__m256i); |
| } |
| |
| // Create 32bit words with most significant 16 bits = 0, least |
| // significant 16 bits set to a new 16 bit word to add to |
| // checksum from bytes. Setting the most significant 16 bits to |
| // 0 allows us to do 2^16 simd additions (2^20 16bit word |
| // additions) without worrying about overflows. |
| c1 = x86_64::_mm256_unpackhi_epi16(c0, zeros); |
| c0 = x86_64::_mm256_unpacklo_epi16(c0, zeros); |
| |
| // Sum 'em up! |
| // `acc` being treated as a vector of 8x 32bit words. |
| acc = x86_64::_mm256_add_epi32(acc, c1); |
| acc = x86_64::_mm256_add_epi32(acc, c0); |
| |
| // We did 2 additions to the accumulator in this iteration of |
| // the loop. |
| add_count += 2; |
| |
| bytes = &bytes[32..]; |
| } |
| |
| // Store the results of our accumlator of 8x 32bit words to our |
| // temporary buffer `data` so that we can iterate over data 16 bits |
| // at a time and add the values to `sum`. Since `acc` is a 256bit |
| // value, it requires 32 bytes, provided by `data`. |
| // |
| // `_mm256_storeu_si256` does not require the memory address to be |
| // aligned on any particular boundary so remove the linter check for |
| // casting from a less strictly-aligned pointer to a more strictly- |
| // aligned pointer. |
| // https://doc.rust-lang.org/core/arch/x86_64/fn._mm256_storeu_si256.html |
| #[allow(clippy::cast_ptr_alignment)] |
| x86_64::_mm256_storeu_si256(data.as_ptr() as *mut x86_64::__m256i, acc); |
| |
| let mut fold = *sum; |
| // Iterate over the accumulator data accumulator-width bytes at a time, |
| // and add it to `sum`. |
| macro_rules! fold { |
| ($step: literal, $read: ident) => { |
| for x in (0..32).step_by($step) { |
| fold = adc_accumulator( |
| fold, |
| NativeEndian::$read(&data[x..x + $step]) as Accumulator, |
| ); |
| } |
| }; |
| } |
| #[cfg(not(target_arch = "x86_64"))] |
| fold!(8, read_u64); |
| #[cfg(target_arch = "x86_64")] |
| fold!(16, read_u128); |
| *sum = fold; |
| } |
| |
| bytes |
| } |
| |
    /// Updates bytes in an existing checksum using architecture-specific SIMD
    /// instructions.
    ///
    /// `update_simd` updates a checksum to reflect that the already-checksummed
    /// bytes `old_bytes` have been updated to contain the values in `new_bytes`
    /// using architecture-specific SIMD instructions. It may not process all
    /// the bytes, and whatever bytes are not processed will be returned. If no
    /// implementation exists for the target architecture and run-time CPU
    /// features, `update_simd` does nothing and simply returns `old_bytes` and
    /// `new_bytes` directly.
    #[inline(always)]
    fn update_simd<'a, 'b>(
        sum: &mut Accumulator,
        old_bytes: &'a [u8],
        new_bytes: &'b [u8],
    ) -> (&'a [u8], &'b [u8]) {
        #[cfg(target_arch = "x86_64")]
        {
            // SIMD only pays for itself on large inputs; see the doc on
            // `MIN_BYTES_FOR_SIMD`.
            if is_x86_feature_detected!("avx2") && old_bytes.len() >= Self::MIN_BYTES_FOR_SIMD {
                return unsafe { Self::update_x86_64(sum, old_bytes, new_bytes) };
            }
        }

        // Suppress unused variable warning when we don't compile the preceding
        // block.
        #[cfg(not(target_arch = "x86_64"))]
        let _ = sum;

        (old_bytes, new_bytes)
    }
| |
    /// Updates bytes in an existing checksum using x86_64's avx2 instructions.
    ///
    /// # Safety
    ///
    /// `update_x86_64` should never be called unless the run-time CPU features
    /// include 'avx2'. If `update_x86_64` is called and the run-time CPU
    /// features do not include 'avx2', it is considered undefined behaviour.
    ///
    /// # Panics
    ///
    /// `update_x86_64` panics if `old_bytes.len() != new_bytes.len()`.
    #[cfg(target_arch = "x86_64")]
    unsafe fn update_x86_64<'a, 'b>(
        sum: &mut Accumulator,
        old_bytes: &'a [u8],
        new_bytes: &'b [u8],
    ) -> (&'a [u8], &'b [u8]) {
        assert_eq!(new_bytes.len(), old_bytes.len());

        // Instead of getting the 1s complement of each 16bit word before
        // adding it to sum, we can get the sum of just `old_bytes` to a
        // temporary variable `old_sum`. We can then add it as a normal 16bit
        // word to the current sum (`sum`) after normalizing it and getting the
        // 1s complement. This will 'remove' `old_bytes` from `sum`.
        let mut old_sum = 0;
        let old_bytes = Self::add_bytes_x86_64(&mut old_sum, old_bytes);
        *sum = adc_accumulator(*sum, !old_sum as Accumulator);

        // Add `new_bytes` to `sum` using SIMD as normal.
        let new_bytes = Self::add_bytes_x86_64(sum, new_bytes);

        // We should have the exact same number of bytes left over for both
        // `new_bytes` and `old_bytes` because the slices have equal length.
        assert_eq!(new_bytes.len(), old_bytes.len());

        (old_bytes, new_bytes)
    }
| } |
| |
// Generates an `adc`-style helper function `$name` for integer type `$t`.
macro_rules! impl_adc {
    ($name: ident, $t: ty) => {
        /// implements 1's complement addition for $t,
        /// exploiting the carry flag on a 2's complement machine.
        /// In practice, the adc instruction will be generated.
        fn $name(a: $t, b: $t) -> $t {
            let (s, c) = a.overflowing_add(b);
            s + (c as $t)
        }
    };
}

impl_adc!(adc_u16, u16);
impl_adc!(adc_u32, u32);
// `adc_u64` is only called from `normalize`'s x86_64 branch, so it is
// gated to x86_64 — presumably to avoid an unused-function warning on
// other architectures.
#[cfg(target_arch = "x86_64")]
impl_adc!(adc_u64, u64);
impl_adc!(adc_accumulator, Accumulator);
| |
/// Normalizes the accumulator by mopping up the
/// overflow until it fits in a `u16`.
///
/// Folding is done with 1s complement additions of the upper and lower
/// halves at each width: 128 -> 64 (x86_64 only), then 64 -> 32 -> 16
/// via `normalize_64`.
fn normalize(a: Accumulator) -> u16 {
    #[cfg(target_arch = "x86_64")]
    return normalize_64(adc_u64(a as u64, (a >> 64) as u64));
    #[cfg(not(target_arch = "x86_64"))]
    return normalize_64(a);
}
| |
| fn normalize_64(a: u64) -> u16 { |
| let t = adc_u32(a as u32, (a >> 32) as u32); |
| adc_u16(t as u16, (t >> 16) as u16) |
| } |
| |
#[cfg(all(test, feature = "benchmark"))]
mod benchmarks {
    // Benchmark results for comparing checksum calculation with and without
    // SIMD implementation, running on Google's Pixelbook. Average values were
    // calculated over 3 trials.
    //
    // Number of | Average time | Average time | Ratio
    // bytes | (ns) w/o SIMD | (ns) w/ SIMD | (w / w/o)
    // --------------------------------------------------
    // 31 | 3657 | 3692 | 1.01
    // 32 | 3735 | 3767 | 1.01
    // 64 | 7092 | 6580 | 0.93
    // 128 | 13790 | 7428 | 0.54
    // 256 | 27169 | 9224 | 0.34
    // 1024 | 107609 | 20071 | 0.19

    extern crate test;
    use super::*;

    /// Generates a benchmark named `$name` which measures a single call to
    /// `$add` (`add_bytes` or `add_bytes_small`) with a `$size`-byte buffer,
    /// followed by finalizing the checksum.
    macro_rules! bench_sum {
        ($name: ident, $size: literal, $add: ident) => {
            #[bench]
            fn $name(b: &mut test::Bencher) {
                b.iter(|| {
                    let buf = test::black_box([0xFF; $size]);
                    let mut c = Checksum::new();
                    c.$add(&buf);
                    test::black_box(c.checksum());
                });
            }
        };
    }

    bench_sum!(bench_checksum_20, 20, add_bytes);
    bench_sum!(bench_checksum_31, 31, add_bytes);
    bench_sum!(bench_checksum_32, 32, add_bytes);
    bench_sum!(bench_checksum_64, 64, add_bytes);
    bench_sum!(bench_checksum_128, 128, add_bytes);
    bench_sum!(bench_checksum_256, 256, add_bytes);
    bench_sum!(bench_checksum_1023, 1023, add_bytes);
    bench_sum!(bench_checksum_1024, 1024, add_bytes);

    // These bypass `add_bytes`'s size dispatch and exercise the
    // small-buffer fallback directly.
    bench_sum!(bench_checksum_small_20, 20, add_bytes_small);
    bench_sum!(bench_checksum_small_31, 31, add_bytes_small);

    /// Generates a benchmark named `$name` which measures a single call to
    /// `update` rewriting an entire `$size`-byte buffer.
    macro_rules! bench_update {
        ($name: ident, $size: literal) => {
            #[bench]
            fn $name(b: &mut test::Bencher) {
                b.iter(|| {
                    let old = test::black_box([0x42; $size]);
                    let new = test::black_box([0xa0; $size]);
                    test::black_box(update([42; 2], &old[..], &new[..]));
                });
            }
        };
    }

    bench_update!(bench_update_16, 16);
    bench_update!(bench_update_31, 31);
    bench_update!(bench_update_32, 32);
    bench_update!(bench_update_64, 64);
    bench_update!(bench_update_128, 128);
    bench_update!(bench_update_256, 256);
    bench_update!(bench_update_1023, 1023);
    bench_update!(bench_update_1024, 1024);
}
| |
#[cfg(test)]
mod tests {
    use core::iter;

    use byteorder::NativeEndian;
    use rand::{Rng, SeedableRng};

    use rand_xorshift::XorShiftRng;

    use super::*;

    /// Create a new deterministic RNG from a seed.
    ///
    /// The 64-bit seed is expanded into the 128-bit seed `XorShiftRng`
    /// requires by writing it twice in native byte order.
    fn new_rng(mut seed: u64) -> XorShiftRng {
        if seed == 0 {
            // XorShiftRng can't take 0 seeds
            seed = 1;
        }
        let mut bytes = [0; 16];
        NativeEndian::write_u32(&mut bytes[0..4], seed as u32);
        NativeEndian::write_u32(&mut bytes[4..8], (seed >> 32) as u32);
        NativeEndian::write_u32(&mut bytes[8..12], seed as u32);
        NativeEndian::write_u32(&mut bytes[12..16], (seed >> 32) as u32);
        XorShiftRng::from_seed(bytes)
    }

    #[test]
    fn test_checksum() {
        for buf in IPV4_HEADERS {
            // Compute the checksum as normal. Each header in IPV4_HEADERS
            // carries its own valid checksum field, so summing the whole
            // header must produce 0.
            let mut c = Checksum::new();
            c.add_bytes(&buf);
            assert_eq!(c.checksum(), [0u8; 2]);
            // Compute the checksum one byte at a time to make sure our
            // trailing_byte logic works.
            let mut c = Checksum::new();
            for &byte in *buf {
                c.add_bytes(&[byte]);
            }
            assert_eq!(c.checksum(), [0u8; 2]);
        }

        // Make sure that it works even if we overflow u32. Performing this
        // loop 2 * 2^16 times is guaranteed to cause such an overflow
        // because 0xFFFF + 0xFFFF > 2^16, and we're effectively adding
        // (0xFFFF + 0xFFFF) 2^16 times. This check is independent of the
        // contents of IPV4_HEADERS, so it only needs to run once.
        let mut c = Checksum::new();
        for _ in 0..(2 * (1 << 16)) {
            c.add_bytes(&[0xFF, 0xFF]);
        }
        assert_eq!(c.checksum(), [0u8; 2]);

        // Make sure that checksum works with add_bytes taking a buffer big
        // enough to test the implementation with simd instructions and cause
        // overflow within the implementation.
        let mut c = Checksum::new();
        // 2 bytes/word * 8 words/addition * 2^16 additions/overflow
        // * 1 overflow + 79 extra bytes = 2^20 + 79
        let buf = vec![0xFF; (1 << 20) + 79];
        c.add_bytes(&buf);
        assert_eq!(c.checksum(), [0, 0xFF]);
    }

    #[test]
    fn test_checksum_simd_rand() {
        let mut rng = new_rng(70812476915813);

        // Test the simd implementation with random values and a buffer big
        // enough to cause an overflow within the implementation.
        // 2 bytes/word * 8 words/addition * 2^16 additions/overflow
        // * 1 overflow + 79 extra bytes = 2^20 + 79
        const BUF_LEN: usize = (1 << 20) + 79;
        let buf: Vec<u8> = iter::repeat_with(|| rng.gen()).take(BUF_LEN).collect();

        let single_bytes = {
            // Add 1 byte at a time to make sure we do not enter the
            // implementation with simd instructions.
            let mut c = Checksum::new();
            for &byte in &buf {
                c.add_bytes(&[byte]);
            }
            c.checksum()
        };
        let all_bytes = {
            // Calculate the checksum with the same buffer, but this time
            // exercise the implementation with simd instructions.
            let mut c = Checksum::new();
            c.add_bytes(&buf);
            c.checksum()
        };
        assert_eq!(single_bytes, all_bytes);
    }

    #[test]
    fn test_update() {
        for b in IPV4_HEADERS {
            let mut buf = b.to_vec();

            let mut c = Checksum::new();
            c.add_bytes(&buf);
            assert_eq!(c.checksum(), [0u8; 2]);

            // Replace the destination IP with the loopback address and verify
            // that the incrementally-updated checksum matches one computed
            // from scratch over the modified header.
            let old = [buf[16], buf[17], buf[18], buf[19]];
            buf[16..20].copy_from_slice(&[127, 0, 0, 1]);
            let updated = update(c.checksum(), &old, &[127, 0, 0, 1]);
            let from_scratch = {
                let mut c = Checksum::new();
                c.add_bytes(&buf);
                c.checksum()
            };
            assert_eq!(updated, from_scratch);
        }

        // Test update with a changed region big enough to test the simd
        // implementation with overflow.
        const BUF_LEN: usize = (1 << 20) + 79;
        let buf = vec![0xFF; BUF_LEN];
        let mut new_buf = buf.clone();
        let (begin, end) = (4, BUF_LEN);

        for i in begin..end {
            new_buf[i] = i as u8;
        }

        let updated = {
            let mut c = Checksum::new();
            c.add_bytes(&buf);
            update(c.checksum(), &buf[begin..end], &new_buf[begin..end])
        };
        let from_scratch = {
            let mut c = Checksum::new();
            c.add_bytes(&new_buf);
            c.checksum()
        };
        assert_eq!(updated, from_scratch);
    }

    #[test]
    fn test_smoke_update() {
        let mut rng = new_rng(70_812_476_915_813);

        // Repeatedly mutate a random even-aligned sub-range of a buffer and
        // check that `update` agrees with a from-scratch checksum.
        for _ in 0..2048 {
            // use an odd length so we test the odd length logic
            const BUF_LEN: usize = 31;
            let buf: [u8; BUF_LEN] = rng.gen();
            let mut c = Checksum::new();
            c.add_bytes(&buf);

            let (begin, end) = loop {
                let begin = rng.gen::<usize>() % BUF_LEN;
                let end = begin + (rng.gen::<usize>() % (BUF_LEN + 1 - begin));
                // update requires that begin is even and end is either even or
                // the end of the input
                if begin % 2 == 0 && (end % 2 == 0 || end == BUF_LEN) {
                    break (begin, end);
                }
            };

            let mut new_buf = buf;
            for i in begin..end {
                new_buf[i] = rng.gen();
            }
            let updated = update(c.checksum(), &buf[begin..end], &new_buf[begin..end]);
            let from_scratch = {
                let mut c = Checksum::new();
                c.add_bytes(&new_buf);
                c.checksum()
            };
            assert_eq!(updated, from_scratch);
        }
    }

    #[test]
    fn test_update_simd_rand() {
        let mut rng = new_rng(70812476915813);

        // Test updating with random values and an update size big enough to
        // test the simd implementation with overflow.
        const MIN_BYTES: usize = 1 << 20;
        const BUF_LEN: usize = (1 << 21) + 79;
        let buf: Vec<u8> = iter::repeat_with(|| rng.gen()).take(BUF_LEN).collect();
        let orig_checksum = {
            let mut c = Checksum::new();
            c.add_bytes(&buf);
            c.checksum()
        };

        // Choose a region of at least MIN_BYTES to update.
        let (begin, end) = loop {
            let begin = rng.gen::<usize>() % ((BUF_LEN - MIN_BYTES) / 2);
            let end = begin
                + MIN_BYTES
                + (rng.gen::<usize>() % (((BUF_LEN - MIN_BYTES) / 2) + 1 - begin));
            // update requires that begin is even and end is either even or the
            // end of the input
            if begin % 2 == 0 && (end % 2 == 0 || end == BUF_LEN) {
                break (begin, end);
            }
        };

        let mut new_buf: Vec<u8> = buf.clone();

        for i in begin..end {
            new_buf[i] = rng.gen();
        }

        let from_update = update(orig_checksum, &buf[begin..end], &new_buf[begin..end]);
        let from_scratch = {
            let mut c = Checksum::new();
            c.add_bytes(&new_buf);
            c.checksum()
        };
        assert_eq!(from_scratch, from_update);
    }

    #[test]
    fn test_add_bytes_small_prop_test() {
        // Since we have two independent implementations (`add_bytes` and
        // `add_bytes_small`), property-test that they agree on random inputs
        // of every length in [64, 1024].

        let mut rng = new_rng(123478012483);
        // c1 and c2 deliberately accumulate across iterations so that the
        // running (not just fresh) checksums are compared as well.
        let mut c1 = Checksum::new();
        let mut c2 = Checksum::new();
        for len in 64..1_025 {
            for _ in 0..4 {
                let mut buf: Vec<u8> = (0..len).map(|_| rng.gen()).collect();
                c1.add_bytes(&buf[..]);
                c2.add_bytes_small(&buf[..]);
                assert_eq!(c1.checksum(), c2.checksum());
                let n1 = c1.checksum_inner();
                let n2 = c2.checksum_inner();
                assert_eq!(n1, n2);
                // Appending a buffer's own checksum (after padding the buffer
                // to an even length) must yield a zero checksum under both
                // implementations.
                let mut t1 = Checksum::new();
                let mut t2 = Checksum::new();
                let mut t3 = Checksum::new();
                t3.add_bytes(&buf[..]);
                if buf.len() % 2 == 1 {
                    buf.push(0);
                }
                assert_eq!(buf.len() % 2, 0);
                buf.extend_from_slice(&t3.checksum());
                t1.add_bytes(&buf[..]);
                t2.add_bytes_small(&buf[..]);
                assert_eq!(t1.checksum(), [0, 0]);
                assert_eq!(t2.checksum(), [0, 0]);
            }
        }
    }

    /// IPv4 headers.
    ///
    /// This data was obtained by capturing live network traffic.
    const IPV4_HEADERS: &[&[u8]] = &[
        &[
            0x45, 0x00, 0x00, 0x34, 0x00, 0x00, 0x40, 0x00, 0x40, 0x06, 0xae, 0xea, 0xc0, 0xa8,
            0x01, 0x0f, 0xc0, 0xb8, 0x09, 0x6a,
        ],
        &[
            0x45, 0x20, 0x00, 0x74, 0x5b, 0x6e, 0x40, 0x00, 0x37, 0x06, 0x5c, 0x1c, 0xc0, 0xb8,
            0x09, 0x6a, 0xc0, 0xa8, 0x01, 0x0f,
        ],
        &[
            0x45, 0x20, 0x02, 0x8f, 0x00, 0x00, 0x40, 0x00, 0x3b, 0x11, 0xc9, 0x3f, 0xac, 0xd9,
            0x05, 0x6e, 0xc0, 0xa8, 0x01, 0x0f,
        ],
    ];
}