| // Copyright 2021 The Wuffs Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // -------- |
| |
| // See "SIMD Implementations" in README.md for a link to Gopal et al. "Fast CRC |
| // Computation for Generic Polynomials Using PCLMULQDQ Instruction". |
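//
// The core idea of that paper: treat the input as a polynomial over GF(2)
// and "fold" it 64 bytes at a time. A carry-less multiply (PCLMULQDQ) by a
// precomputed constant of the form (x^T mod P), for the CRC polynomial P
// and a suitable shift T, advances a chunk's contribution through the
// message without changing its CRC residue, so four 128-bit accumulators
// can absorb the whole input before a final reduction down to 32 bits.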
| |
// up_x86_avx2 is exactly the same as up_x86_sse42 except for the "choose
// cpu_arch >= x86_avx2" line. With AVX, PCLMULQDQ has a three-operand
// (non-destructive) form, not just a two-operand form, so the compiler can
// avoid extra register-copy instructions:
// https://www.felixcloutier.com/x86/pclmulqdq
| pri func ieee_hasher.up_x86_avx2!(x: slice base.u8), |
| choose cpu_arch >= x86_avx2, |
| { |
| var s : base.u32 |
| var p : slice base.u8 |
| |
| var util : base.x86_sse42_utility |
| var k : base.x86_m128i |
| var x0 : base.x86_m128i |
| var x1 : base.x86_m128i |
| var x2 : base.x86_m128i |
| var x3 : base.x86_m128i |
| var y0 : base.x86_m128i |
| var y1 : base.x86_m128i |
| var y2 : base.x86_m128i |
| var y3 : base.x86_m128i |
| |
| var tail_index : base.u64 |
| |
| s = 0xFFFF_FFFF ^ this.state |
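	// (The 0xFFFF_FFFF XORs here and at the end of this function implement
	// CRC-32/IEEE's standard pre- and post-inversion of the state.)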
| |
| // Align to a 16-byte boundary. |
| while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) { |
| s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8) |
| args.x = args.x[1 ..] |
| } endwhile |
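	// (Aligning first is a performance optimization: the 16-byte SIMD loads
	// below then fall on 16-byte boundaries.)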
| |
| // For short inputs, just do a simple loop. |
| if args.x.length() < 64 { |
| iterate (p = args.x)(length: 1, advance: 1, unroll: 1) { |
| s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8) |
| } |
| this.state = 0xFFFF_FFFF ^ s |
| return nothing |
| } |
| |
| // Load 128×4 = 512 bits from the first 64-byte chunk. |
| x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10]) |
| x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20]) |
| x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30]) |
| x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40]) |
| |
| // Combine with the initial state. |
| x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s)) |
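	// (make_m128i_single_u32 places s in the low 32 bits and zeroes the
	// rest, so only x0's low lane changes.)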
| |
| // Process the remaining 64-byte chunks. |
| k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K1K2[.. 16]) |
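	// Each iteration folds the 512-bit accumulator (x0 .. x3) over the next
	// 64 bytes of input. Per 128-bit lane:
	//   x' = clmul(x.lo64, K1) ^ clmul(x.hi64, K2) ^ next_16_bytes
	// where K1 and K2 (the low and high halves of IEEE_X86_SSE42_K1K2) are
	// precomputed powers of x modulo the CRC polynomial. The imm8 argument
	// selects the 64-bit halves to multiply: 0x00 takes both operands' low
	// halves, 0x11 both high halves.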
| iterate (p = args.x[64 ..])(length: 64, advance: 64, unroll: 1) { |
| y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00) |
| y1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00) |
| y2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x00) |
| y3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x00) |
| |
| x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11) |
| x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x11) |
| x2 = x2._mm_clmulepi64_si128(b: k, imm8: 0x11) |
| x3 = x3._mm_clmulepi64_si128(b: k, imm8: 0x11) |
| |
| x0 = x0._mm_xor_si128(b: y0)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x00 .. 0x10])) |
| x1 = x1._mm_xor_si128(b: y1)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x10 .. 0x20])) |
| x2 = x2._mm_xor_si128(b: y2)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x20 .. 0x30])) |
| x3 = x3._mm_xor_si128(b: y3)._mm_xor_si128(b: util.make_m128i_slice128(a: p[0x30 .. 0x40])) |
| } |
| |
| // Reduce 128×4 = 512 bits to 128 bits. |
| k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K3K4[.. 16]) |
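	// Three fold steps, each combining one more lane into x0:
	//   x0 = clmul(x0.lo64, K3) ^ clmul(x0.hi64, K4) ^ x_next
	// with K3 and K4 (the low and high halves of IEEE_X86_SSE42_K3K4) again
	// precomputed powers of x modulo the CRC polynomial.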
| y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00) |
| x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11) |
| x0 = x0._mm_xor_si128(b: x1) |
| x0 = x0._mm_xor_si128(b: y0) |
| y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00) |
| x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11) |
| x0 = x0._mm_xor_si128(b: x2) |
| x0 = x0._mm_xor_si128(b: y0) |
| y0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00) |
| x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x11) |
| x0 = x0._mm_xor_si128(b: x3) |
| x0 = x0._mm_xor_si128(b: y0) |
| |
| // Reduce 128 bits to 64 bits. |
| x1 = x0._mm_clmulepi64_si128(b: k, imm8: 0x10) |
| x2 = util.make_m128i_multiple_u32( |
| a00: 0xFFFF_FFFF, |
| a01: 0x0000_0000, |
| a02: 0xFFFF_FFFF, |
| a03: 0x0000_0000) |
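	// x2's 32-bit lanes 0 and 2 are all-ones, so ANDing with it keeps the
	// low 32 bits of each 64-bit half. The imm8: 0x10 multiply above
	// computed x0.lo64 times K4 (k's high 64 bits); XOR-ing that into x0
	// shifted right by 8 bytes folds the low half away, and the K5 multiply
	// below finishes the reduction to 64 bits.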
| x0 = x0._mm_srli_si128(imm8: 8) |
| x0 = x0._mm_xor_si128(b: x1) |
| k = util.make_m128i_slice128(a: IEEE_X86_SSE42_K5ZZ[.. 16]) |
| x1 = x0._mm_srli_si128(imm8: 4) |
| x0 = x0._mm_and_si128(b: x2) |
| x0 = x0._mm_clmulepi64_si128(b: k, imm8: 0x00) |
| x0 = x0._mm_xor_si128(b: x1) |
| |
| // Reduce 64 bits to 32 bits (Barrett Reduction) and extract. |
| // |
| // Barrett Reduction is Algorithm 1 (page 14) of Gopal et al., after |
| // adjusting for bit-reflection as per Figure 12 (page 21). |
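	//
	// In Algorithm 1's (unreflected) outline, with R the 64-bit remainder so
	// far and mu = floor(x^64 / P(x)):
	//   T1  = floor(R / x^32) * mu
	//   T2  = floor(T1 / x^32) * P(x)
	//   crc = (R ^ T2) mod x^32
	// The IEEE_X86_SSE42_PXMU constant's name suggests that it packs P(x)
	// and mu into one 128-bit value; the two clmul imm8 arguments below pick
	// out one or the other.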
| k = util.make_m128i_slice128(a: IEEE_X86_SSE42_PXMU[.. 16]) |
| x1 = x0._mm_and_si128(b: x2) |
| x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x10) |
| x1 = x1._mm_and_si128(b: x2) |
| x1 = x1._mm_clmulepi64_si128(b: k, imm8: 0x00) |
| x0 = x0._mm_xor_si128(b: x1) |
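	// After that XOR, the reduced 32-bit CRC sits in x0's bits 32 .. 63,
	// which _mm_extract_epi32(imm8: 1) reads out.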
| s = x0._mm_extract_epi32(imm8: 1) |
| |
| // Handle the tail of args.x that wasn't a complete 64-byte chunk. |
	tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFC0  // And-not 63: round down to a multiple of 64.
| if tail_index < args.x.length() { |
| iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) { |
| s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ p[0]] ^ (s >> 8) |
| } |
| } |
| |
| this.state = 0xFFFF_FFFF ^ s |
| } |