util/bufferiszero: improve avx2 accelerator By increasing avx2 length_to_accel to 128, we can simplify its logic and reduce a branch. The authorship of this patch actually belongs to Richard Henderson <richard.henderson@linaro.org>, I just fixed a boundary case on his original patch. Suggested-by: Richard Henderson <richard.henderson@linaro.org> Signed-off-by: Robert Hoo <robert.hu@linux.intel.com> Message-Id: <1585119021-46593-2-git-send-email-robert.hu@linux.intel.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

commit: 8f13a39dc02ea8a3e923102a8444185630c635ea [log] [tgz]
author: Robert Hoo <robert.hu@linux.intel.com> Wed Mar 25 14:50:21 2020 +0800
committer: Paolo Bonzini <pbonzini@redhat.com> Wed Apr 01 14:24:03 2020 -0400
tree: 21825595ebc0e81e8e102033b0c12031bfb84983
parent: b87c99d0731fa30f1f455b211cbcf385b0fe427c [diff]
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
index b801253..695bb4c 100644
--- a/util/bufferiszero.c
+++ b/util/bufferiszero.c

@@ -158,27 +158,19 @@
     __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
     __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
 
-    if (likely(p <= e)) {
-        /* Loop over 32-byte aligned blocks of 128.  */
-        do {
-            __builtin_prefetch(p);
-            if (unlikely(!_mm256_testz_si256(t, t))) {
-                return false;
-            }
-            t = p[-4] | p[-3] | p[-2] | p[-1];
-            p += 4;
-        } while (p <= e);
-    } else {
-        t |= _mm256_loadu_si256(buf + 32);
-        if (len <= 128) {
-            goto last2;
+    /* Loop over 32-byte aligned blocks of 128.  */
+    while (p <= e) {
+        __builtin_prefetch(p);
+        if (unlikely(!_mm256_testz_si256(t, t))) {
+            return false;
         }
-    }
+        t = p[-4] | p[-3] | p[-2] | p[-1];
+        p += 4;
+    } ;
 
     /* Finish the last block of 128 unaligned.  */
     t |= _mm256_loadu_si256(buf + len - 4 * 32);
     t |= _mm256_loadu_si256(buf + len - 3 * 32);
- last2:
     t |= _mm256_loadu_si256(buf + len - 2 * 32);
     t |= _mm256_loadu_si256(buf + len - 1 * 32);
 
@@ -263,7 +255,7 @@
     }
     if (cache & CACHE_AVX2) {
         fn = buffer_zero_avx2;
-        length_to_accel = 64;
+        length_to_accel = 128;
     }
 #endif
 #ifdef CONFIG_AVX512F_OPT
commit	8f13a39dc02ea8a3e923102a8444185630c635ea	[log] [tgz]
author	Robert Hoo <robert.hu@linux.intel.com>	Wed Mar 25 14:50:21 2020 +0800
committer	Paolo Bonzini <pbonzini@redhat.com>	Wed Apr 01 14:24:03 2020 -0400
tree	21825595ebc0e81e8e102033b0c12031bfb84983
parent	b87c99d0731fa30f1f455b211cbcf385b0fe427c [diff]