[X86] Bugfix for rL349334 adding a check function for two intrinsics

 _mm_mmask_i64gather_epi32 and _mm_mmask_i64gather_ps operate on the
 lower 64 bits and zero the upper 64 bits of the return value.
 The old test suite steps by 64 bits in the do_intrin_loop, so in the
 last iteration the upper 64 bits of the result overlap the address of
 other data. In particular, when the compiler places dst128_f and
 mask128 at adjacent addresses, this test fails. A new check function
 is added to check these two intrinsics.

 Differential Revision: https://reviews.llvm.org/D79158
diff --git a/SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.c b/SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.c
index b510ab6..49941bf 100644
--- a/SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.c
+++ b/SingleSource/UnitTests/Vector/AVX512VL/i64gather_32.c
@@ -46,22 +46,22 @@
 
 void do_mm_mmask_i64gather_epi32() {
   int i;
-  for (i = 0; i < NUM; i += 2) {
+  for (i = 0; i < NUM; i += 4) {
     __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
     __m128i old_dst = _mm_loadu_si128((__m128i const *)(dst_i + i));
     __m128i gtr =
-        _mm_mmask_i64gather_epi32(old_dst, mask128[i / 2], ind, src_i, SCALE);
+        _mm_mmask_i64gather_epi32(old_dst, mask128[i / 4], ind, src_i, SCALE);
     _mm_storeu_si128((__m128i *)(dst128_i + i), gtr);
   }
 }
 
 void do_mm_mmask_i64gather_ps() {
   int i;
-  for (i = 0; i < NUM; i += 2) {
+  for (i = 0; i < NUM; i += 4) {
     __m128i ind = _mm_loadu_si128((const __m128i *)(g_index + i));
     __m128 old_dst = _mm_loadu_ps(dst_f + i);
     __m128 gtr =
-        _mm_mmask_i64gather_ps(old_dst, mask128[i / 2], ind, src_f, SCALE);
+        _mm_mmask_i64gather_ps(old_dst, mask128[i / 4], ind, src_f, SCALE);
     _mm_storeu_ps(dst128_f + i, gtr);
   }
 }
@@ -110,16 +110,42 @@
   return 0;
 }
 
+int checkh(int id, int *res_dst, int *pass_thru_vals, int *mask, int *src,
+          int elems_in_vector) {
+  int i;
+  for (i = 0; i < NUM; i++) {
+    int kmask = mask[i / elems_in_vector];
+    int kmask_bit = kmask & (1 << (i % elems_in_vector));
+
+    int v;
+    if (i % elems_in_vector >= elems_in_vector / 2)
+      v = 0;
+    else
+      v = kmask_bit ? src[g_index[i]] : pass_thru_vals[i];
+    // printf("v= %d, g_index[i] = %ld, src[g_index[i]]= %d, res_dst[i]=%d\n ",
+    // v, g_index[i], src[g_index[i]], res_dst[i]);
+
+    if (v != res_dst[i]) {
+      printf("The testcase #%d FAILed at %d iteration\n", id, i);
+
+      printf("Expected value %d, actual %d\n", v, res_dst[i]);
+
+      return -1;
+    }
+  }
+  return 0;
+}
+
 int main() {
   int error = 0;
 
   init_data();
 
   do_mm_mmask_i64gather_epi32();
-  error |= check(1, dst128_i, dst_i, mask128, src_i, 2);
+  error |= checkh(1, dst128_i, dst_i, mask128, src_i, 4);
 
   do_mm_mmask_i64gather_ps();
-  error |= check(2, (int *)dst128_f, (int *)dst_f, mask128, (int *)src_f, 2);
+  error |= checkh(2, (int *)dst128_f, (int *)dst_f, mask128, (int *)src_f, 4);
 
   do_mm256_mmask_i64gather_epi32();
   error |= check(3, dst256_i, dst_i, mask256, src_i, 4);