blob: 35cad3a00e4aa34b5942cacb659c1909d4f7fe48 [file] [log] [blame]
/*
* Test 128 and 256-bit load and store intrinsics,
* with masked and zero-masked forms, by comparing
* their output with the corresponding 512-bit intrinsic.
*
* This test was created to check the correctness
* of the following intrinsics support:
* _mm512_load_si512()
* _mm512_store_si512()
* _mm256_storeu_ps()
* _mm256_storeu_si256()
* _mm_storeu_ps()
* _mm_storeu_si128()
*/
#include "m512_test_util.h"
#include <stdio.h>
V512 fsrc1;
V512 fsrc2;
V512 fsrc_non_negative;
V512 fsrc_arr[2];
V512 dsrc1;
V512 dsrc2;
V512 dsrc_non_negative;
V512 dsrc_arr[2];
V512 isrc1;
V512 isrc2;
V512 isrc_arr[2];
static void NOINLINE init() {
volatile int i;
for (i = 0; i < 16; i++) {
fsrc1.f32[i] = (float)(i + 1);
fsrc2.f32[i] = 4.0f * (float)(i + 1);
fsrc_non_negative.f32[i] = 2.0f * (float)(i);
isrc2.s32[i] = (3 * i) - 17;
}
fsrc_arr[0] = fsrc2;
fsrc_arr[1] = fsrc1;
for (i = 0; i < 64; i++) {
isrc1.s8[i] = 2 * i + 1;
if (i % 3) {
isrc1.s8[i] = -isrc1.s8[i];
}
}
isrc_arr[0] = isrc2;
isrc_arr[1] = isrc1;
for (i = 0; i < 8; i++) {
dsrc1.f64[i] = (double)(-i - 1);
dsrc2.f64[i] = 3.0 * (double)(-i - 1);
dsrc_non_negative.f64[i] = 4.0f * (double)(i);
}
dsrc_arr[0] = dsrc2;
dsrc_arr[1] = dsrc1;
}
/*
* Use "soft update" between tests to make compiler think src was updated.
* Prevents PRE'ing a load of src, thus allowing ciscization.
* Also prevents PRE'ing intrinsic operations, ensuring we
* execute the intended instructions.
*/
volatile int vol0 = 0;
#define soft_v512_update(var) (var).xmmi[vol0] = (var).xmmi[vol0]
void NOINLINE do_load_and_loadu_pd() {
V512 xmm_res, ymm_res, zmm_res;
__mmask8 k8 = 0x7e;
/* Non-masked. */
soft_v512_update(dsrc1);
zmm_res.zmmd = _mm512_load_pd(&dsrc1.zmmd);
soft_v512_update(dsrc_arr[0]);
zmm_res.zmmd = _mm512_loadu_pd(&dsrc_arr[0].f64[1]);
/* Masked. */
zmm_res.zmmd = _mm512_setzero_pd();
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(dsrc1);
zmm_res.zmmd = _mm512_mask_load_pd(zmm_res.zmmd, k8, &dsrc1.zmmd);
zmm_res.zmmd = _mm512_setzero_pd();
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(dsrc_arr[0]);
zmm_res.zmmd = _mm512_mask_loadu_pd(zmm_res.zmmd, k8, &dsrc_arr[0].f64[3]);
/* Zero-masked. */
zmm_res.zmmd = _mm512_set1_pd(1.0);
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(dsrc1);
zmm_res.zmmd = _mm512_maskz_load_pd(k8, &dsrc1.zmmd);
soft_v512_update(dsrc1);
zmm_res.zmmd = _mm512_set1_pd(1.0);
ymm_res = zmm_res;
xmm_res = zmm_res;
}
void NOINLINE do_store_and_storeu_pd() {
V512 xmm_res[2], ymm_res[2], zmm_res[2];
__mmask8 k8 = 0xef;
/* Non-masked. */
zmm_res[0].zmmd = _mm512_set1_pd(1.0);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(dsrc1);
_mm512_store_pd(&zmm_res[0].zmmd, dsrc1.zmmd);
soft_v512_update(dsrc1);
_mm512_storeu_pd(&zmm_res[0].f64[1], dsrc1.zmmd);
/* Masked. */
zmm_res[0].zmmd = _mm512_set1_pd(1.0);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(dsrc1);
_mm512_mask_store_pd(&zmm_res[0].zmmd, k8, dsrc1.zmmd);
zmm_res[0].zmmd = _mm512_set1_pd(1.0);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(dsrc1);
_mm512_mask_storeu_pd(&zmm_res[0].f64[1], k8, dsrc1.zmmd);
}
void NOINLINE do_load_and_loadu_ps() {
V512 xmm_res, ymm_res, zmm_res;
__mmask16 k16 = 0xff7e;
__mmask8 k8 = (__mmask8)k16;
/* Non-masked. */
soft_v512_update(fsrc1);
zmm_res.zmm = _mm512_load_ps(&fsrc1.zmm);
soft_v512_update(fsrc_arr[0]);
zmm_res.zmm = _mm512_loadu_ps(&fsrc_arr[0].f32[3]);
/* Masked. */
zmm_res.zmm = _mm512_setzero_ps();
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(fsrc1);
zmm_res.zmm = _mm512_mask_load_ps(zmm_res.zmm, k16, &fsrc1.zmm);
zmm_res.zmm = _mm512_setzero_ps();
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(fsrc_arr[0]);
zmm_res.zmm = _mm512_mask_loadu_ps(zmm_res.zmm, k16, &fsrc_arr[0].f32[5]);
/* Zero-masked. */
zmm_res.zmm = _mm512_set1_ps(1.0f);
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(fsrc1);
zmm_res.zmm = _mm512_maskz_load_ps(k16, &fsrc1.zmm);
zmm_res.zmm = _mm512_set1_ps(1.0f);
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(fsrc_arr[0]);
zmm_res.zmm = _mm512_maskz_loadu_ps(k16, &fsrc_arr[0].f32[5]);
}
void NOINLINE do_store_and_storeu_ps() {
V512 xmm_res[2], ymm_res[2], zmm_res[2];
__mmask16 k16 = 0xffef;
__mmask8 k8 = (__mmask8)k16;
/* Non-masked. */
zmm_res[0].zmm = _mm512_set1_ps(1.0f);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(fsrc1);
_mm512_store_ps(&zmm_res[0].zmm, fsrc1.zmm);
soft_v512_update(fsrc1);
_mm512_storeu_ps(&zmm_res[0].f32[1], fsrc1.zmm);
soft_v512_update(fsrc1);
_mm256_storeu_ps(&ymm_res[0].f32[1], fsrc1.ymm[0]);
soft_v512_update(fsrc1);
_mm_storeu_ps(&xmm_res[0].f32[1], fsrc1.xmm[0]);
check_equal_nsf(&ymm_res[0].f32[1], &zmm_res[0].f32[1], 8, "_mm256_storeu_ps",
__LINE__);
check_equal_nsf(&xmm_res[0].f32[1], &zmm_res[0].f32[1], 4, "_mm_storeu_ps",
__LINE__);
/* Masked. */
zmm_res[0].zmm = _mm512_set1_ps(1.0f);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(fsrc1);
_mm512_mask_store_ps(&zmm_res[0].zmm, k8, fsrc1.zmm);
zmm_res[0].zmm = _mm512_set1_ps(1.0f);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(fsrc1);
_mm512_mask_storeu_ps(&zmm_res[0].f32[1], k8, fsrc1.zmm);
}
void NOINLINE do_load_and_loadu_epi32() {
V512 xmm_res, ymm_res, zmm_res;
__mmask16 k16 = 0xffef;
__mmask8 k8 = (__mmask8)k16;
/* Non-masked. */
soft_v512_update(isrc1);
zmm_res.zmmi = _mm512_load_epi32(&isrc1.zmmi);
soft_v512_update(isrc1);
ymm_res.zmmi = _mm512_load_si512(&isrc1.zmmi);
check_equal_nd(&ymm_res, &zmm_res, 16, "_mm512_load_si512", __LINE__);
soft_v512_update(isrc_arr[0]);
zmm_res.zmmi = _mm512_loadu_si512(&isrc_arr[0].s32[1]);
/* Masked. */
zmm_res.zmmi = _mm512_setzero_epi32();
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(isrc1);
zmm_res.zmmi = _mm512_mask_load_epi32(zmm_res.zmmi, k16, &isrc1.zmmi);
zmm_res.zmmi = _mm512_setzero_epi32();
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(isrc_arr[0]);
zmm_res.zmmi =
_mm512_mask_loadu_epi32(zmm_res.zmmi, k16, &isrc_arr[0].s32[3]);
/* Zero-masked. */
zmm_res.zmmi = _mm512_set1_epi32(-7);
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(isrc1);
zmm_res.zmmi = _mm512_maskz_load_epi32(k16, &isrc1.zmmi);
zmm_res.zmmi = _mm512_set1_epi32(11);
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(isrc_arr[0]);
zmm_res.zmmi = _mm512_maskz_loadu_epi32(k16, &isrc_arr[0].s32[1]);
}
void NOINLINE do_store_and_storeu_epi32() {
V512 xmm_res[2], ymm_res[2], zmm_res[2];
__mmask16 k16 = 0xfeff;
__mmask8 k8 = (__mmask8)k16;
/* Non-masked. */
zmm_res[0].zmmi = _mm512_set1_epi32(-101);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(isrc1);
_mm512_store_epi32(&zmm_res[0].zmmi, isrc1.zmmi);
soft_v512_update(isrc1);
_mm256_store_si256((__m256i *)&ymm_res[0].s32[0], isrc1.ymmi[0]);
soft_v512_update(isrc1);
_mm_store_si128((__m128i *)&xmm_res[0].s32[0], isrc1.xmmi[0]);
check_equal_nd(&ymm_res, &zmm_res, 8, "_mm256_store_epi32", __LINE__);
check_equal_nd(&xmm_res, &zmm_res, 4, "_mm_store_epi32", __LINE__);
soft_v512_update(isrc1);
ymm_res[0].zmmi = _mm512_setzero_si512();
_mm512_store_si512(&ymm_res[0].zmmi, isrc1.zmmi);
check_equal_nd(&ymm_res, &zmm_res, 16, "_mm512_store_si512", __LINE__);
soft_v512_update(isrc1);
_mm512_storeu_si512(&zmm_res[0].s32[1], isrc1.zmmi);
soft_v512_update(isrc1);
_mm256_storeu_si256((__m256i *)&ymm_res[0].s32[1], isrc1.ymmi[0]);
soft_v512_update(isrc1);
_mm_storeu_si128((__m128i *)&xmm_res[0].s32[1], isrc1.xmmi[0]);
check_equal_nd(&ymm_res[0].s32[1], &zmm_res[0].s32[1], 8,
"_mm256_storeu_si256", __LINE__);
check_equal_nd(&xmm_res[0].s32[1], &zmm_res[0].s32[1], 4, "_mm_storeu_si128",
__LINE__);
/* Masked. */
zmm_res[0].zmmi = _mm512_set1_epi32(999);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(isrc1);
_mm512_mask_store_epi32(&zmm_res[0].zmmi, k16, isrc1.zmmi);
zmm_res[0].zmmi = _mm512_set1_epi32(-3);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(isrc1);
_mm512_mask_storeu_epi32(&zmm_res[0].s32[1], k16, isrc1.zmmi);
}
void NOINLINE do_load_and_loadu_epi64() {
V512 xmm_res, ymm_res, zmm_res;
__mmask16 k8 = 0xef;
/* Non-masked. */
soft_v512_update(isrc1);
zmm_res.zmmi = _mm512_load_epi64(&isrc1.zmmi);
soft_v512_update(isrc_arr[0]);
zmm_res.zmmi = _mm512_loadu_si512(&isrc_arr[0].s64[1]);
/* Masked. */
zmm_res.zmmi = _mm512_setzero_epi32();
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(isrc1);
zmm_res.zmmi = _mm512_mask_load_epi64(zmm_res.zmmi, k8, &isrc1.zmmi);
zmm_res.zmmi = _mm512_setzero_epi32();
ymm_res = zmm_res;
xmm_res = zmm_res;
soft_v512_update(isrc_arr[0]);
zmm_res.zmmi = _mm512_mask_loadu_epi64(zmm_res.zmmi, k8, &isrc_arr[0].s64[3]);
/* Zero-masked. */
zmm_res.zmmi = _mm512_set1_epi64(-7);
ymm_res = zmm_res;
xmm_res = zmm_res;
}
void NOINLINE do_store_and_storeu_epi64() {
V512 xmm_res[2], ymm_res[2], zmm_res[2];
__mmask8 k8 = 0xfe;
/* Non-masked. */
zmm_res[0].zmmi = _mm512_set1_epi32(-101);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(isrc1);
_mm512_store_epi64(&zmm_res[0].zmmi, isrc1.zmmi);
soft_v512_update(isrc1);
_mm256_store_si256((__m256i *)&ymm_res[0].s64[0], isrc1.ymmi[0]);
soft_v512_update(isrc1);
_mm_store_si128((__m128i *)&xmm_res[0].s64[0], isrc1.xmmi[0]);
check_equal_nq(&ymm_res, &zmm_res, 4, "_mm256_store_epi64", __LINE__);
check_equal_nq(&xmm_res, &zmm_res, 2, "_mm_store_si256", __LINE__);
soft_v512_update(isrc1);
_mm512_storeu_si512(&zmm_res[0].s64[1], isrc1.zmmi);
/* Masked. */
zmm_res[0].zmmi = _mm512_set1_epi32(999);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(isrc1);
_mm512_mask_store_epi64(&zmm_res[0].zmmi, k8, isrc1.zmmi);
zmm_res[0].zmmi = _mm512_set1_epi32(-3);
zmm_res[1] = zmm_res[0];
ymm_res[0] = zmm_res[0];
ymm_res[1] = zmm_res[0];
xmm_res[0] = zmm_res[0];
xmm_res[1] = zmm_res[0];
soft_v512_update(isrc1);
_mm512_mask_storeu_epi64(&zmm_res[0].s64[1], k8, isrc1.zmmi);
}
int main() {
init();
do_load_and_loadu_pd();
do_load_and_loadu_ps();
do_load_and_loadu_epi32();
do_store_and_storeu_epi32();
do_load_and_loadu_epi64();
do_store_and_storeu_pd();
do_store_and_storeu_ps();
do_store_and_storeu_epi64();
if (n_errs != 0) {
printf("FAILED\n");
return 1;
}
printf("PASSED\n");
return 0;
}