Add ACCUM_ROUND macro to crc32_chorba_sse2

GitOrigin-RevId: e6956e717a81ab281b1a068a33e9b9d2a02320ee
Change-Id: I58c61abe0c8d452026746db403e28526b6f780fc
diff --git a/arch/x86/crc32_chorba_sse2.c b/arch/x86/crc32_chorba_sse2.c
index 271adbd..93ec5d5 100644
--- a/arch/x86/crc32_chorba_sse2.c
+++ b/arch/x86/crc32_chorba_sse2.c
@@ -8,19 +8,36 @@
 #include "arch/x86/x86_intrins.h"
 #include "arch_functions.h"
 
+#define LSHIFT_QWORD(x)     _mm_unpacklo_epi64(_mm_setzero_si128(), (x))
+#define RSHIFT_QWORD(x)     _mm_unpackhi_epi64((x), _mm_setzero_si128())
+#define ALIGNR_QWORD(a, b)  _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), 1))
+
 #define READ_NEXT(in, off, a, b) \
     do { \
         a = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t))); \
         b = _mm_load_si128((__m128i*)(in + off / sizeof(uint64_t) + 2)); \
-    } while (0);
+    } while (0)
 
 #define NEXT_ROUND(invec, a, b, c, d) \
     do { \
         a = _mm_xor_si128(_mm_slli_epi64(invec, 17), _mm_slli_epi64(invec, 55)); \
         b = _mm_xor_si128(_mm_xor_si128(_mm_srli_epi64(invec, 47), _mm_srli_epi64(invec, 9)), _mm_slli_epi64(invec, 19)); \
         c = _mm_xor_si128(_mm_srli_epi64(invec, 45), _mm_slli_epi64(invec, 44)); \
-        d  = _mm_srli_epi64(invec, 20); \
-    } while (0);
+        d = _mm_srli_epi64(invec, 20); \
+    } while (0)
+
+#define ACCUM_ROUND() \
+    do { \
+        __m128i a4_ = _mm_unpacklo_epi64(next56, ab4); \
+        __m128i b2c2 = ALIGNR_QWORD(ab2, cd2); \
+        __m128i b4c4 = ALIGNR_QWORD(ab4, cd4); \
+        __m128i d2_ = RSHIFT_QWORD(cd2); \
+        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1); \
+        next12 = _mm_xor_si128(next12, b2c2); \
+        next34 = _mm_xor_si128(b4c4, cd3); \
+        next34 = _mm_xor_si128(next34, d2_); \
+        next56 = RSHIFT_QWORD(cd4); \
+    } while (0)
 
 Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint8_t *buf, size_t len) {
     /* The calling function ensured that this is aligned correctly */
@@ -28,10 +45,7 @@
     ALIGNED_(16) uint64_t final[9] = {0};
     uint64_t next1 = ~crc;
     crc = 0;
-    uint64_t next2 = 0;
-    uint64_t next3 = 0;
-    uint64_t next4 = 0;
-    uint64_t next5 = 0;
+    uint64_t next2, next3, next4, next5;
 
     __m128i next12 = _mm_cvtsi64_si128(next1);
     __m128i next34 = _mm_setzero_si128();
@@ -55,40 +69,32 @@
         chorba34 = _mm_xor_si128(chorba34, next34);
         chorba56 = _mm_xor_si128(chorba56, next56);
         chorba78 = _mm_xor_si128(chorba78, chorba12);
-        __m128i chorba45 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba34), _mm_castsi128_pd(chorba56), 1));
-        __m128i chorba23 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba12),
-                                                           _mm_castsi128_pd(chorba34), 1));
- 
+        __m128i chorba45 = ALIGNR_QWORD(chorba34, chorba56);
+        __m128i chorba23 = ALIGNR_QWORD(chorba12, chorba34);
+
         i += 8 * 8;
 
         /* 0-3 */
         READ_NEXT(input, i, in1in2, in3in4);
 
-        __m128i chorba34xor = _mm_xor_si128(chorba34, _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12));
+        __m128i chorba34xor = _mm_xor_si128(chorba34, LSHIFT_QWORD(chorba12));
         in1in2 = _mm_xor_si128(in1in2, chorba34xor);
 
         NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
 
         in3in4 = _mm_xor_si128(in3in4, ab1);
         /* _hopefully_ we don't get a huge domain switching penalty for this. This seems to be the best sequence */
-        __m128i chorba56xor = _mm_xor_si128(chorba56, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2));
+        __m128i chorba56xor = _mm_xor_si128(chorba56, LSHIFT_QWORD(ab2));
 
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56xor, chorba23));
         in3in4 = _mm_xor_si128(in3in4, chorba12);
 
         NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
 
-        __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
-        __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
-        a4_ = _mm_xor_si128(b2c2, a4_);
-        next12 = _mm_xor_si128(ab3, a4_);
-        next12 = _mm_xor_si128(next12, cd1);
-
-        __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
-        __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
-
-        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
-        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+        /* chorba56 already consumed next56, clear it so ACCUM_ROUND
+           does not xor the stale value into next12 */
+        next56 = _mm_setzero_si128();
+        ACCUM_ROUND();
 
         i += 32;
 
@@ -105,30 +111,19 @@
         in3in4 = _mm_xor_si128(in3in4, next34);
         in3in4 = _mm_xor_si128(in3in4, ab1);
         in3in4 = _mm_xor_si128(in3in4, chorba56);
-        __m128i chorba67 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(chorba56), _mm_castsi128_pd(chorba78), 1));
-        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));
+        __m128i chorba67 = ALIGNR_QWORD(chorba56, chorba78);
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba67, LSHIFT_QWORD(ab2)));
 
         NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
 
-        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
-        a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
-        a4_ = _mm_xor_si128(b2c2, a4_);
-        next12 = _mm_xor_si128(ab3, cd1);
-
-        next12 = _mm_xor_si128(next12, a4_);
-        next12 = _mm_xor_si128(next12, next56);
-        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
-        next34 = _mm_xor_si128(b4c4, cd3);
-        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
-        next34 = _mm_xor_si128(next34, d2_);
-        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+        ACCUM_ROUND();
 
         i += 32;
 
         /* 8-11 */
         READ_NEXT(input, i, in1in2, in3in4);
 
-        __m128i chorba80 = _mm_unpackhi_epi64(chorba78, _mm_setzero_si128());
+        __m128i chorba80 = RSHIFT_QWORD(chorba78);
         __m128i next12_chorba12 = _mm_xor_si128(next12, chorba12);
         in1in2 = _mm_xor_si128(in1in2, chorba80);
         in1in2 = _mm_xor_si128(in1in2, chorba78);
@@ -138,22 +133,13 @@
 
         in3in4 = _mm_xor_si128(next34, in3in4);
         in3in4 = _mm_xor_si128(in3in4, ab1);
-        __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        __m128i a2_ = LSHIFT_QWORD(ab2);
         in3in4 = _mm_xor_si128(in3in4, chorba34);
         in3in4 = _mm_xor_si128(in3in4, a2_);
 
         NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
 
-        a4_ = _mm_unpacklo_epi64(next56, ab4);
-        next12 = _mm_xor_si128(a4_, ab3);
-        next12 = _mm_xor_si128(next12, cd1);
-        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
-        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
-        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
-        next12 = _mm_xor_si128(next12, b2c2);
-        next34 = _mm_xor_si128(b4c4, cd3);
-        next34 = _mm_xor_si128(next34, d2_);
-        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+        ACCUM_ROUND();
 
         i += 32;
 
@@ -163,7 +149,7 @@
         in1in2 = _mm_xor_si128(in1in2, next12);
         __m128i chorb56xorchorb12 = _mm_xor_si128(chorba56, chorba12);
         in1in2 = _mm_xor_si128(in1in2, chorb56xorchorb12);
-        __m128i chorb1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
+        __m128i chorb1_ = LSHIFT_QWORD(chorba12);
         in1in2 = _mm_xor_si128(in1in2, chorb1_);
 
         NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
@@ -171,33 +157,25 @@
         in3in4 = _mm_xor_si128(next34, in3in4);
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba34, chorba12));
-        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, _mm_unpacklo_epi64(_mm_setzero_si128(), ab2)));
+        in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, LSHIFT_QWORD(ab2)));
 
         NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
 
-        a4_ = _mm_unpacklo_epi64(next56, ab4);
-        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
-        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
-        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
-        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
-        next12 = _mm_xor_si128(next12, b2c2);
-        next34 = _mm_xor_si128(b4c4, cd3);
-        next34 = _mm_xor_si128(next34, d2_);
-        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+        ACCUM_ROUND();
 
         i += 32;
 
         /* 16-19 */
         READ_NEXT(input, i, in1in2, in3in4);
 
-        __m128i chorba1_ = _mm_unpacklo_epi64(_mm_setzero_si128(), chorba12);
+        __m128i chorba1_ = LSHIFT_QWORD(chorba12);
         in1in2 = _mm_xor_si128(_mm_xor_si128(next12, in1in2), _mm_xor_si128(chorba56, chorba45));
         in1in2 = _mm_xor_si128(in1in2, _mm_xor_si128(chorba12, chorba34));
         in1in2 = _mm_xor_si128(chorba1_, in1in2);
 
         NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
 
-        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        a2_ = LSHIFT_QWORD(ab2);
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(ab1, chorba78));
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba56, chorba34));
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba23, chorba67));
@@ -206,15 +184,7 @@
 
         NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
 
-        a4_ = _mm_unpacklo_epi64(next56, ab4);
-        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
-        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
-        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
-        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
-        next12 = _mm_xor_si128(next12, b2c2);
-        next34 = _mm_xor_si128(b4c4, cd3);
-        next34 = _mm_xor_si128(next34, d2_);
-        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+        ACCUM_ROUND();
 
         i += 32;
 
@@ -228,7 +198,7 @@
 
         NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
 
-        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        a2_ = LSHIFT_QWORD(ab2);
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba67));
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
@@ -237,15 +207,7 @@
 
         NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
 
-        a4_ = _mm_unpacklo_epi64(next56, ab4);
-        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
-        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
-        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
-        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
-        next12 = _mm_xor_si128(next12, b2c2);
-        next34 = _mm_xor_si128(b4c4, cd3);
-        next34 = _mm_xor_si128(next34, d2_);
-        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+        ACCUM_ROUND();
 
         i += 32;
 
@@ -259,7 +221,7 @@
 
         NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
 
-        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        a2_ = LSHIFT_QWORD(ab2);
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba56));
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba45, chorba34));
@@ -267,15 +229,7 @@
 
         NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
 
-        a4_ = _mm_unpacklo_epi64(next56, ab4);
-        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
-        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
-        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
-        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
-        next12 = _mm_xor_si128(next12, b2c2);
-        next34 = _mm_xor_si128(b4c4, cd3);
-        next34 = _mm_xor_si128(next34, d2_);
-        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+        ACCUM_ROUND();
 
         i += 32;
 
@@ -287,22 +241,14 @@
 
         NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
 
-        a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        a2_ = LSHIFT_QWORD(ab2);
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(next34, ab1));
         in3in4 = _mm_xor_si128(in3in4, _mm_xor_si128(chorba78, chorba80));
         in3in4 = _mm_xor_si128(a2_, in3in4);
 
         NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
 
-        a4_ = _mm_unpacklo_epi64(next56, ab4);
-        next12 = _mm_xor_si128(_mm_xor_si128(a4_, ab3), cd1);
-        b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
-        b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
-        d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
-        next12 = _mm_xor_si128(next12, b2c2);
-        next34 = _mm_xor_si128(b4c4, cd3);
-        next34 = _mm_xor_si128(next34, d2_);
-        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+        ACCUM_ROUND();
     }
 
     for (; (i + 40 + 32) < len; i += 32) {
@@ -314,24 +260,14 @@
 
         NEXT_ROUND(in1in2, ab1, ab2, ab3, ab4);
 
-        __m128i a2_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab2);
+        __m128i a2_ = LSHIFT_QWORD(ab2);
         __m128i ab1_next34 = _mm_xor_si128(next34, ab1);
         in3in4 = _mm_xor_si128(in3in4, ab1_next34);
         in3in4 = _mm_xor_si128(a2_, in3in4);
 
         NEXT_ROUND(in3in4, cd1, cd2, cd3, cd4);
 
-        __m128i b2c2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab2), _mm_castsi128_pd(cd2), 1));
-        __m128i a4_ = _mm_unpacklo_epi64(_mm_setzero_si128(), ab4);
-        a4_ = _mm_xor_si128(b2c2, a4_);
-        next12 = _mm_xor_si128(ab3, a4_);
-        next12 = _mm_xor_si128(next12, cd1);
-
-        __m128i d2_ = _mm_unpackhi_epi64(cd2, _mm_setzero_si128());
-        __m128i b4c4 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(ab4), _mm_castsi128_pd(cd4), 1));
-        next12 = _mm_xor_si128(next12, next56);
-        next34 = _mm_xor_si128(cd3, _mm_xor_si128(b4c4, d2_));
-        next56 = _mm_unpackhi_epi64(cd4, _mm_setzero_si128());
+        ACCUM_ROUND();
     }
 
     next1 = _mm_cvtsi128_si64(next12);