// Copyright 2021 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//
// SPDX-License-Identifier: Apache-2.0 OR MIT
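
// up_x86_sse42 is a CPU-specific specialization of the ieee_hasher's CRC-32/
// IEEE update loop. The `choose cpu_arch >= x86_sse42` clause means that it
// is only selected on x86 hardware with (at least) SSE 4.2 support. It uses
// carry-less multiplication (_mm_clmulepi64_si128) to fold the input 128
// bytes at a time.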
pri func ieee_hasher.up_x86_sse42!(x: roslice base.u8),
choose cpu_arch >= x86_sse42,
{
var s : base.u32
var util : base.x86_sse42_utility
var kk : base.x86_m128i
var x0 : base.x86_m128i
var x1 : base.x86_m128i
var x2 : base.x86_m128i
var x3 : base.x86_m128i
var x4 : base.x86_m128i
var x5 : base.x86_m128i
var x6 : base.x86_m128i
var x7 : base.x86_m128i
var y0 : base.x86_m128i
var y1 : base.x86_m128i
var y2 : base.x86_m128i
var y3 : base.x86_m128i
var y4 : base.x86_m128i
var y5 : base.x86_m128i
var y6 : base.x86_m128i
var y7 : base.x86_m128i
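// CRC-32/IEEE conventionally works on a bit-inverted state: this.state holds
// the inverted (presentable) value between calls, so invert it here and
// re-invert it at the end of this function.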
s = 0xFFFF_FFFF ^ this.state
// Align to a 16-byte boundary, processing leading bytes one at a time via the
// lookup table, so that the 16-byte SIMD loads below start at aligned addresses.
while (args.x.length() > 0) and ((15 & args.x.uintptr_low_12_bits()) <> 0) {
s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
args.x = args.x[1 ..]
} endwhile
if args.x.length() >= 128 {
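// Load the first 128 bytes into eight 128-bit accumulators (x0 ..= x7), XOR
// the current CRC state into the low 32 bits of x0, and then fold the rest of
// the input into those accumulators, 128 bytes per loop iteration.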
// BEGIN script/print-crc32-x86-sse42-code.go generated code.
x0 = util.make_m128i_slice128(a: args.x[0x00 .. 0x10])
x1 = util.make_m128i_slice128(a: args.x[0x10 .. 0x20])
x2 = util.make_m128i_slice128(a: args.x[0x20 .. 0x30])
x3 = util.make_m128i_slice128(a: args.x[0x30 .. 0x40])
x4 = util.make_m128i_slice128(a: args.x[0x40 .. 0x50])
x5 = util.make_m128i_slice128(a: args.x[0x50 .. 0x60])
x6 = util.make_m128i_slice128(a: args.x[0x60 .. 0x70])
x7 = util.make_m128i_slice128(a: args.x[0x70 .. 0x80])
kk = util.make_m128i_multiple_u32(a00: 0x33FF_F533, a01: 0, a02: 0x910E_EEC1, a03: 0)
x0 = x0._mm_xor_si128(b: util.make_m128i_single_u32(a: s))
args.x = args.x[128 ..]
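// Each iteration folds 128 bytes: every accumulator is carry-less multiplied
// by the two 64-bit constants in kk (imm8 0x00 pairs the low halves, 0x11 the
// high halves) and XOR'ed with the next 16 bytes of input.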
while args.x.length() >= 128 {
y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y1 = x1._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x1 = x1._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y3 = x3._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x3 = x3._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y5 = x5._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x5 = x5._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y7 = x7._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x7 = x7._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y0 = y0._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x00 .. 0x10]))
x0 = x0._mm_xor_si128(b: y0)
y1 = y1._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x10 .. 0x20]))
x1 = x1._mm_xor_si128(b: y1)
y2 = y2._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x20 .. 0x30]))
x2 = x2._mm_xor_si128(b: y2)
y3 = y3._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x30 .. 0x40]))
x3 = x3._mm_xor_si128(b: y3)
y4 = y4._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x40 .. 0x50]))
x4 = x4._mm_xor_si128(b: y4)
y5 = y5._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x50 .. 0x60]))
x5 = x5._mm_xor_si128(b: y5)
y6 = y6._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x60 .. 0x70]))
x6 = x6._mm_xor_si128(b: y6)
y7 = y7._mm_xor_si128(b: util.make_m128i_slice128(a: args.x[0x70 .. 0x80]))
x7 = x7._mm_xor_si128(b: y7)
args.x = args.x[128 ..]
} endwhile
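// Fold the eight accumulators down to four: x1, x3, x5 and x7 are folded into
// x0, x2, x4 and x6.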
kk = util.make_m128i_multiple_u32(a00: 0xAE68_9191, a01: 0, a02: 0xCCAA_009E, a03: 0)
y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x2 = x2._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x6 = x6._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y0 = y0._mm_xor_si128(b: x1)
x0 = x0._mm_xor_si128(b: y0)
y2 = y2._mm_xor_si128(b: x3)
x2 = x2._mm_xor_si128(b: y2)
y4 = y4._mm_xor_si128(b: x5)
x4 = x4._mm_xor_si128(b: y4)
y6 = y6._mm_xor_si128(b: x7)
x6 = x6._mm_xor_si128(b: y6)
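// Fold four accumulators down to two: x2 into x0 and x6 into x4.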
kk = util.make_m128i_multiple_u32(a00: 0xF1DA_05AA, a01: 0, a02: 0x8125_6527, a03: 0)
y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x4 = x4._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y0 = y0._mm_xor_si128(b: x2)
x0 = x0._mm_xor_si128(b: y0)
y4 = y4._mm_xor_si128(b: x6)
x4 = x4._mm_xor_si128(b: y4)
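// Fold the last two accumulators into x0.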
kk = util.make_m128i_multiple_u32(a00: 0x8F35_2D95, a01: 0, a02: 0x1D95_13D7, a03: 0)
y0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x00)
x0 = x0._mm_clmulepi64_si128(b: kk, imm8: 0x11)
y0 = y0._mm_xor_si128(b: x4)
x0 = x0._mm_xor_si128(b: y0)
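// Reduce the 128-bit x0 to a 32-bit CRC, one 64-bit half at a time: each half
// is carry-less multiplied by precomputed reduction constants and the 32-bit
// result is extracted from lane 2.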
kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 0)).
_mm_clmulepi64_si128(b: kk, imm8: 0x00).
_mm_clmulepi64_si128(b: kk, imm8: 0x10).
_mm_extract_epi32(imm8: 2)
kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
s = util.make_m128i_single_u64(a: x0._mm_extract_epi64(imm8: 1) ^ (s as base.u64)).
_mm_clmulepi64_si128(b: kk, imm8: 0x00).
_mm_clmulepi64_si128(b: kk, imm8: 0x10).
_mm_extract_epi32(imm8: 2)
// END script/print-crc32-x86-sse42-code.go generated code.
}
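// Process any remaining input 8 bytes at a time: XOR each little-endian u64
// with the running CRC and reduce it back down to 32 bits with the same
// carry-less multiply reduction as above.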
while args.x.length() >= 8 {
kk = util.make_m128i_multiple_u32(a00: 0xF701_1641, a01: 0xB4E5_B025, a02: 0xDB71_0641, a03: 1)
s = util.make_m128i_single_u64(a: args.x.peek_u64le() ^ (s as base.u64)).
_mm_clmulepi64_si128(b: kk, imm8: 0x00).
_mm_clmulepi64_si128(b: kk, imm8: 0x10).
_mm_extract_epi32(imm8: 2)
args.x = args.x[8 ..]
} endwhile
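// Process the final (up to 7) bytes one at a time with the lookup table.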
while args.x.length() > 0 {
s = IEEE_TABLE[0][((s & 0xFF) as base.u8) ^ args.x[0]] ^ (s >> 8)
args.x = args.x[1 ..]
} endwhile
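// Re-invert the state before storing it, matching the inversion at the top of
// this function.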
this.state = 0xFFFF_FFFF ^ s
}