/*
 * Test the unpack{hi,lo}, shuffle, and blend intrinsics.
 * This test checks the correctness of the following intrinsics:
 *      _mm512_mask_blend_*()
 *      _mm512_shuffle_*()
 *      _mm512_mask_shuffle_*()
 *      _mm_unpack*()
 *      _mm256_unpack*()
 *      _mm512_unpack*()
 *      _mm512_mask_unpack*()
 *      _mm512_maskz_unpack*()
 */
#include "m512_test_util.h"
#include <stdio.h>
#include <string.h>

volatile int vol0 = 0;

/*
 * Use this between tests to make the compiler think src was updated.
 * Prevents PRE (partial redundancy elimination) of a load of src.
 */
#define soft_update(src) (src).xmmi[vol0] = (src).xmmi[vol0]
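
/*
 * Test inputs: the plain vectors hold ascending element values, the
 * *_neg vectors their negations, and the *_mix vectors a mixed-sign
 * pattern used as the merge-masking destination.
 */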
V512 in8;
V512 in8_neg;
V512 in32;
V512 in32_neg;
V512 in32_mix;
V512 in64;
V512 in64_neg;
V512 in64_mix;

void NOINLINE init() {
  volatile int i;

  for (i = 0; i < 64; i++) {
    in8.s8[i] = i;
    in8_neg.s8[i] = -i;
  }

  for (i = 0; i < 16; i++) {
    in32.s32[i] = i;
    in32_neg.s32[i] = -i;
    in32_mix.s32[i] = ((i % 3) == 0) ? -i : i;
  }

  for (i = 0; i < 8; i++) {
    in64.s64[i] = i;
    in64_neg.s64[i] = -i;
    in64_mix.s64[i] = ((i % 3) == 0) ? -i : i;
  }
}
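
/*
 * Reference model for unpackhi on byte elements: within each 128-bit
 * lane, interleave the high 8 bytes of in8 and in8_neg, apply merge-
 * (zeroing == 0) or zero- (zeroing == 1) masking, and compare with res.
 * The trailing store forces the compiler to reload in8_neg next time.
 */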
#define CHECK_UNPCKHBW(n_lanes, dest, mask, zeroing, name) \
  { \
    volatile int i, j, lane; \
    for (lane = 0; lane < n_lanes; lane++) { \
      for (i = 0, j = 0; i < 16; i += 2, j++) { \
        expected.s8[16 * lane + i] = in8.s8[16 * lane + 8 + j]; \
        expected.s8[16 * lane + i + 1] = in8_neg.s8[16 * lane + 8 + j]; \
      } \
    } \
    for (i = 0; i < n_lanes * 16; i++) { \
      if ((mask & (1LL << i)) == 0) { \
        if (zeroing) { \
          expected.s8[i] = 0; \
        } else { \
          expected.s8[i] = dest.s8[i]; \
        } \
      } \
    } \
    check_equal_nd(&res, &expected, n_lanes * 4, name, __LINE__); \
    in8_neg.ymmi[vol0] = in8_neg.ymmi[vol0]; \
  }
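
/*
 * Same model for unpackhi on dword elements: interleave the high two
 * dwords of in32 and in32_neg within each 128-bit lane.
 */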
#define CHECK_UNPCKH32(n_lanes, dest, mask, zeroing, name) \
  { \
    volatile int i, j, lane; \
    for (lane = 0; lane < n_lanes; lane++) { \
      for (i = 0, j = 0; i < 4; i += 2, j++) { \
        expected.s32[4 * lane + i] = in32.s32[4 * lane + 2 + j]; \
        expected.s32[4 * lane + i + 1] = in32_neg.s32[4 * lane + 2 + j]; \
      } \
    } \
    for (i = 0; i < n_lanes * 4; i++) { \
      if ((mask & (1LL << i)) == 0) { \
        if (zeroing) { \
          expected.s32[i] = 0; \
        } else { \
          expected.s32[i] = dest.s32[i]; \
        } \
      } \
    } \
    check_equal_nd(&res, &expected, n_lanes * 4, name, __LINE__); \
    in32_neg.ymmi[vol0] = in32_neg.ymmi[vol0]; \
  }
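
/*
 * Unpackhi on qword elements: interleave the high qword of in64 and
 * in64_neg within each 128-bit lane.
 */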
#define CHECK_UNPCKH64(n_lanes, dest, mask, zeroing, name) \
  { \
    volatile int i, j, lane; \
    for (lane = 0; lane < n_lanes; lane++) { \
      for (i = 0, j = 0; i < 2; i += 2, j++) { \
        expected.s64[2 * lane + i] = in64.s64[2 * lane + 1 + j]; \
        expected.s64[2 * lane + i + 1] = in64_neg.s64[2 * lane + 1 + j]; \
      } \
    } \
    for (i = 0; i < n_lanes * 2; i++) { \
      if ((mask & (1LL << i)) == 0) { \
        if (zeroing) { \
          expected.s64[i] = 0; \
        } else { \
          expected.s64[i] = dest.s64[i]; \
        } \
      } \
    } \
    check_equal_nd(&res, &expected, n_lanes * 4, name, __LINE__); \
    in64_neg.ymmi[vol0] = in64_neg.ymmi[vol0]; \
  }
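
/*
 * Unpacklo on dword elements: interleave the low two dwords of in32 and
 * in32_neg within each 128-bit lane.
 */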
#define CHECK_UNPCKL32(n_lanes, dest, mask, zeroing, name) \
  { \
    volatile int i, j, lane; \
    for (lane = 0; lane < n_lanes; lane++) { \
      for (i = 0, j = 0; i < 4; i += 2, j++) { \
        expected.s32[4 * lane + i] = in32.s32[4 * lane + j]; \
        expected.s32[4 * lane + i + 1] = in32_neg.s32[4 * lane + j]; \
      } \
    } \
    for (i = 0; i < n_lanes * 4; i++) { \
      if ((mask & (1LL << i)) == 0) { \
        if (zeroing) { \
          expected.s32[i] = 0; \
        } else { \
          expected.s32[i] = dest.s32[i]; \
        } \
      } \
    } \
    check_equal_nd(&res, &expected, n_lanes * 4, name, __LINE__); \
    in32_neg.ymmi[vol0] = in32_neg.ymmi[vol0]; \
  }
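
/*
 * Unpacklo on qword elements: interleave the low qword of in64 and
 * in64_neg within each 128-bit lane.
 */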
#define CHECK_UNPCKL64(n_lanes, dest, mask, zeroing, name) \
  { \
    volatile int i, j, lane; \
    for (lane = 0; lane < n_lanes; lane++) { \
      for (i = 0, j = 0; i < 2; i += 2, j++) { \
        expected.s64[2 * lane + i] = in64.s64[2 * lane + j]; \
        expected.s64[2 * lane + i + 1] = in64_neg.s64[2 * lane + j]; \
      } \
    } \
    for (i = 0; i < n_lanes * 2; i++) { \
      if ((mask & (1LL << i)) == 0) { \
        if (zeroing) { \
          expected.s64[i] = 0; \
        } else { \
          expected.s64[i] = dest.s64[i]; \
        } \
      } \
    } \
    check_equal_nd(&res, &expected, n_lanes * 4, name, __LINE__); \
    in64_neg.ymmi[vol0] = in64_neg.ymmi[vol0]; \
  }

void NOINLINE do_unpckps_lo() {
  V512 res;
  V512 expected;
  __mmask16 k = 0xFFFF;

  res.zmm = _mm512_unpacklo_ps(in32.zmm, in32_neg.zmm);
  CHECK_UNPCKL32(4, in32_mix, k, 0, "_mm512_unpacklo_ps");

  k = 0xA4A4;
  res.zmm = _mm512_mask_unpacklo_ps(in32_mix.zmm, k, in32.zmm, in32_neg.zmm);
  CHECK_UNPCKL32(4, in32_mix, k, 0, "_mm512_mask_unpacklo_ps");

  res.zmm = _mm512_maskz_unpacklo_ps(k, in32.zmm, in32_neg.zmm);
  CHECK_UNPCKL32(4, in32_mix, k, 1, "_mm512_maskz_unpacklo_ps");
}

void NOINLINE do_unpckps_hi() {
  V512 res;
  V512 expected;
  __mmask16 k = 0xFFFF;

  res.xmm[vol0] = _mm_unpackhi_ps(in32.xmm[vol0], in32_neg.xmm[vol0]);
  CHECK_UNPCKH32(1, in32_mix, k, 0, "_mm_unpackhi_ps");

  res.ymm[vol0] = _mm256_unpackhi_ps(in32.ymm[vol0], in32_neg.ymm[vol0]);
  CHECK_UNPCKH32(2, in32_mix, k, 0, "_mm256_unpackhi_ps");

  res.zmm = _mm512_unpackhi_ps(in32.zmm, in32_neg.zmm);
  CHECK_UNPCKH32(4, in32_mix, k, 0, "_mm512_unpackhi_ps");

  k = 0xA4A4;
  res.zmm = _mm512_mask_unpackhi_ps(in32_mix.zmm, k, in32.zmm, in32_neg.zmm);
  CHECK_UNPCKH32(4, in32_mix, k, 0, "_mm512_mask_unpackhi_ps");

  res.zmm = _mm512_maskz_unpackhi_ps(k, in32.zmm, in32_neg.zmm);
  CHECK_UNPCKH32(4, in32_mix, k, 1, "_mm512_maskz_unpackhi_ps");
}

void NOINLINE do_unpckdq_lo() {
  V512 res;
  V512 expected;
  __mmask16 k = 0xFFFF;

  res.xmmi[vol0] = _mm_unpacklo_epi32(in32.xmmi[vol0], in32_neg.xmmi[vol0]);
  CHECK_UNPCKL32(1, in32_mix, k, 0, "_mm_unpacklo_epi32");

  res.ymmi[vol0] = _mm256_unpacklo_epi32(in32.ymmi[vol0], in32_neg.ymmi[vol0]);
  CHECK_UNPCKL32(2, in32_mix, k, 0, "_mm256_unpacklo_epi32");

  res.zmmi = _mm512_unpacklo_epi32(in32.zmmi, in32_neg.zmmi);
  CHECK_UNPCKL32(4, in32_mix, k, 0, "_mm512_unpacklo_epi32");

  k = 0xA4A4;
  res.zmmi =
      _mm512_mask_unpacklo_epi32(in32_mix.zmmi, k, in32.zmmi, in32_neg.zmmi);
  CHECK_UNPCKL32(4, in32_mix, k, 0, "_mm512_mask_unpacklo_epi32");

  res.zmmi = _mm512_maskz_unpacklo_epi32(k, in32.zmmi, in32_neg.zmmi);
  CHECK_UNPCKL32(4, in32_mix, k, 1, "_mm512_maskz_unpacklo_epi32");
}

void NOINLINE do_unpckqdq_lo() {
  V512 res;
  V512 expected;
  __mmask8 k8 = 0xFF;

  res.zmmi = _mm512_unpacklo_epi64(in64.zmmi, in64_neg.zmmi);
  CHECK_UNPCKL64(4, in64_mix, k8, 0, "_mm512_unpacklo_epi64");

  k8 = 0x4A;
  res.zmmi =
      _mm512_mask_unpacklo_epi64(in64_mix.zmmi, k8, in64.zmmi, in64_neg.zmmi);
  CHECK_UNPCKL64(4, in64_mix, k8, 0, "_mm512_mask_unpacklo_epi64");

  res.zmmi = _mm512_maskz_unpacklo_epi64(k8, in64.zmmi, in64_neg.zmmi);
  CHECK_UNPCKL64(4, in64_mix, k8, 1, "_mm512_maskz_unpacklo_epi64");
}

void NOINLINE do_unpckpd_lo() {
  V512 res;
  V512 expected;
  __mmask8 k8 = 0xFF;

  res.zmmd = _mm512_unpacklo_pd(in64.zmmd, in64_neg.zmmd);
  CHECK_UNPCKL64(4, in64_mix, k8, 0, "_mm512_unpacklo_pd");

  k8 = 0x4A;
  res.zmmd =
      _mm512_mask_unpacklo_pd(in64_mix.zmmd, k8, in64.zmmd, in64_neg.zmmd);
  CHECK_UNPCKL64(4, in64_mix, k8, 0, "_mm512_mask_unpacklo_pd");

  res.zmmd = _mm512_maskz_unpacklo_pd(k8, in64.zmmd, in64_neg.zmmd);
  CHECK_UNPCKL64(4, in64_mix, k8, 1, "_mm512_maskz_unpacklo_pd");
}
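
/*
 * Reference model for blend/mov: mask bit i selects src2 (bit set) or
 * src1 (bit clear) for element i.
 */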
#define CHECK_BLENDM(n_elems, width, type, mask, src1, src2, name) \
  { \
    volatile int i; \
    for (i = 0; i < n_elems; ++i) { \
      if ((mask & (1LL << i)) == 0) { \
        expected.type[i] = src1.type[i]; \
      } else { \
        expected.type[i] = src2.type[i]; \
      } \
    } \
    check_equal_nd(&res, &expected, (n_elems * width) / 4, name, __LINE__); \
    src2.ymmi[vol0] = src2.ymmi[vol0]; \
  }

void NOINLINE do_blendmd() {
  V512 res;
  V512 expected;
  __mmask16 k = 0xA44A;

  res.zmmi = _mm512_mask_blend_epi32(k, in32.zmmi, in32_neg.zmmi);
  CHECK_BLENDM(16, 4, s32, k, in32, in32_neg, "_mm512_mask_blend_epi32");

  res.zmmi = _mm512_mask_mov_epi32(in32.zmmi, k, in32_neg.zmmi);
  CHECK_BLENDM(16, 4, s32, k, in32, in32_neg, "_mm512_mask_mov_epi32");
}

void NOINLINE do_blendmq() {
  V512 res;
  V512 expected;
  __mmask8 k = 0xA4;

  res.zmmi = _mm512_mask_blend_epi64(k, in64.zmmi, in64_neg.zmmi);
  CHECK_BLENDM(8, 8, s64, k, in64, in64_neg, "_mm512_mask_blend_epi64");

  res.zmmi = _mm512_mask_mov_epi64(in64.zmmi, k, in64_neg.zmmi);
  CHECK_BLENDM(8, 8, s64, k, in64, in64_neg, "_mm512_mask_mov_epi64");
}

void NOINLINE do_unpckqdq_hi() {
  V512 res;
  V512 expected;
  __mmask8 k8 = 0xFF;

  res.zmmi = _mm512_unpackhi_epi64(in64.zmmi, in64_neg.zmmi);
  CHECK_UNPCKH64(4, in64_mix, k8, 0, "_mm512_unpackhi_epi64");

  k8 = 0x4A;
  res.zmmi =
      _mm512_mask_unpackhi_epi64(in64_mix.zmmi, k8, in64.zmmi, in64_neg.zmmi);
  CHECK_UNPCKH64(4, in64_mix, k8, 0, "_mm512_mask_unpackhi_epi64");

  res.zmmi = _mm512_maskz_unpackhi_epi64(k8, in64.zmmi, in64_neg.zmmi);
  CHECK_UNPCKH64(4, in64_mix, k8, 1, "_mm512_maskz_unpackhi_epi64");
}

void NOINLINE do_unpckpd_hi() {
  V512 res;
  V512 expected;
  __mmask8 k8 = 0xFF;

  res.xmmd[vol0] = _mm_unpackhi_pd(in64.xmmd[vol0], in64_neg.xmmd[vol0]);
  CHECK_UNPCKH64(1, in64_mix, k8, 0, "_mm_unpackhi_pd");

  res.ymmd[vol0] = _mm256_unpackhi_pd(in64.ymmd[vol0], in64_neg.ymmd[vol0]);
  CHECK_UNPCKH64(2, in64_mix, k8, 0, "_mm256_unpackhi_pd");

  res.zmmd = _mm512_unpackhi_pd(in64.zmmd, in64_neg.zmmd);
  CHECK_UNPCKH64(4, in64_mix, k8, 0, "_mm512_unpackhi_pd");

  k8 = 0x4A;
  res.zmmd =
      _mm512_mask_unpackhi_pd(in64_mix.zmmd, k8, in64.zmmd, in64_neg.zmmd);
  CHECK_UNPCKH64(4, in64_mix, k8, 0, "_mm512_mask_unpackhi_pd");

  res.zmmd = _mm512_maskz_unpackhi_pd(k8, in64.zmmd, in64_neg.zmmd);
  CHECK_UNPCKH64(4, in64_mix, k8, 1, "_mm512_maskz_unpackhi_pd");
}

void NOINLINE do_shuf_ps() {
  V512 res;
  V512 expected;
  volatile int i, lane;
  __mmask16 k = 0x7e95;

#define PS_IMM 0x5c
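
  /*
   * _mm512_shuffle_ps: within each 128-bit lane,
   *   dst[0] = a[imm[1:0]],  dst[1] = a[imm[3:2]],
   *   dst[2] = b[imm[5:4]],  dst[3] = b[imm[7:6]].
   */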
  res.zmm = _mm512_shuffle_ps(in32.zmm, in32_neg.zmm, PS_IMM);
  for (lane = 0; lane < 4; lane++) {
    for (i = 0; i < 1; i++) {
      expected.s32[4 * lane + i] = in32.s32[4 * lane + (PS_IMM & 3)];
      expected.s32[4 * lane + i + 1] = in32.s32[4 * lane + ((PS_IMM >> 2) & 3)];
      expected.s32[4 * lane + 2 + i] =
          in32_neg.s32[4 * lane + ((PS_IMM >> 4) & 3)];
      expected.s32[4 * lane + 2 + i + 1] =
          in32_neg.s32[4 * lane + ((PS_IMM >> 6) & 3)];
    }
  }
  check_equal_nd(&res, &expected, 16, "_mm512_shuffle_ps", __LINE__);

  res.zmmi = _mm512_setzero_epi32();
  res.zmm = _mm512_mask_shuffle_ps(res.zmm, k, in32.zmm, in32_neg.zmm, PS_IMM);
  expected.zmmi = _mm512_setzero_epi32();
  for (lane = 0; lane < 4; lane++) {
    for (i = 0; i < 1; i++) {
      int m = 4 * lane;
      if ((1 << (m + i)) & k) {
        expected.s32[m + i] = in32.s32[m + (PS_IMM & 3)];
      }
      if ((1 << (m + i + 1)) & k) {
        expected.s32[m + i + 1] = in32.s32[m + ((PS_IMM >> 2) & 3)];
      }
      if ((1 << (m + 2 + i)) & k) {
        expected.s32[m + 2 + i] = in32_neg.s32[m + ((PS_IMM >> 4) & 3)];
      }
      if ((1 << (m + 2 + i + 1)) & k) {
        expected.s32[m + 2 + i + 1] = in32_neg.s32[m + ((PS_IMM >> 6) & 3)];
      }
    }
  }
  check_equal_nd(&res, &expected, 16, "_mm512_mask_shuffle_ps", __LINE__);
}

void NOINLINE do_unpckdq_hi() {
  V512 res;
  V512 expected;
  __mmask16 k = 0xFFFF;

  res.zmmi = _mm512_unpackhi_epi32(in32.zmmi, in32_neg.zmmi);
  CHECK_UNPCKH32(4, in32_mix, k, 0, "_mm512_unpackhi_epi32");

  k = 0xA4A4;
  res.zmmi =
      _mm512_mask_unpackhi_epi32(in32_mix.zmmi, k, in32.zmmi, in32_neg.zmmi);
  CHECK_UNPCKH32(4, in32_mix, k, 0, "_mm512_mask_unpackhi_epi32");

  res.zmmi = _mm512_maskz_unpackhi_epi32(k, in32.zmmi, in32_neg.zmmi);
  CHECK_UNPCKH32(4, in32_mix, k, 1, "_mm512_maskz_unpackhi_epi32");
}

void NOINLINE do_shuf_pd() {
  V512 res;
  V512 expected;
  volatile int i, lane;
  __mmask8 k = 0xba;

#define PD_IMM 0x7b
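
  /*
   * _mm512_shuffle_pd: one immediate bit per element; for each 128-bit
   * lane (m = 2 * lane),
   *   dst[m] = a[m + ((imm >> m) & 1)],
   *   dst[m + 1] = b[m + ((imm >> (m + 1)) & 1)].
   */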
  res.zmmd = _mm512_shuffle_pd(in64.zmmd, in64_neg.zmmd, PD_IMM);
  for (lane = 0; lane < 4; lane++) {
    int m = 2 * lane;
    for (i = 0; i < 1; i++) {
      expected.s64[m + i] = in64.s64[m + ((PD_IMM >> m) & 1)];
      expected.s64[m + i + 1] = in64_neg.s64[m + ((PD_IMM >> (m + 1)) & 1)];
    }
  }
  check_equal_nd(&res, &expected, 16, "_mm512_shuffle_pd", __LINE__);

  res.zmmi = _mm512_setzero_epi32();
  res.zmmd =
      _mm512_mask_shuffle_pd(res.zmmd, k, in64.zmmd, in64_neg.zmmd, PD_IMM);
  expected.zmmi = _mm512_setzero_epi32();
  for (lane = 0; lane < 4; lane++) {
    int m = 2 * lane;
    for (i = 0; i < 1; i++) {
      if ((1 << (m + i)) & k) {
        expected.s64[m + i] = in64.s64[m + ((PD_IMM >> m) & 1)];
      }
      if ((1 << (m + i + 1)) & k) {
        expected.s64[m + i + 1] = in64_neg.s64[m + ((PD_IMM >> (m + 1)) & 1)];
      }
    }
  }
  check_equal_nd(&res, &expected, 16, "_mm512_mask_shuffle_pd", __LINE__);
}

void NOINLINE do_shuf_f32x4() {
  V512 res;
  V512 expected;
  V512 tmp;
  volatile int i, j, lane;
  __mmask16 k = 0x7e95;

#define F32X4_IMM 0x5c
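
  /*
   * _mm512_shuffle_f32x4: two immediate bits per destination 128-bit
   * lane; dst lanes 0-1 select a lane of a via imm[1:0] and imm[3:2],
   * dst lanes 2-3 select a lane of b via imm[5:4] and imm[7:6].
   */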
  res.zmm = _mm512_shuffle_f32x4(in32.zmm, in32_neg.zmm, F32X4_IMM);
  for (lane = 0; lane < 4; lane++) {
    j = ((F32X4_IMM >> 2 * lane) & 0x3);
    if (lane < 2) {
      expected.xmmi[lane] = in32.xmmi[j];
    } else {
      expected.xmmi[lane] = in32_neg.xmmi[j];
    }
  }
  check_equal_nd(&res, &expected, 16, "_mm512_shuffle_f32x4", __LINE__);

  res.zmmi = _mm512_setzero_epi32();
  res.zmm =
      _mm512_mask_shuffle_f32x4(res.zmm, k, in32.zmm, in32_neg.zmm, F32X4_IMM);
  expected.zmmi = _mm512_setzero_epi32();
  for (lane = 0; lane < 4; lane++) {
    int m = 4 * lane;
    j = ((F32X4_IMM >> 2 * lane) & 0x3);
    if (lane < 2) {
      tmp.xmmi[lane] = in32.xmmi[j];
    } else {
      tmp.xmmi[lane] = in32_neg.xmmi[j];
    }
    for (i = 0; i < 4; i++) {
      if ((1 << (m + i)) & k) {
        expected.s32[m + i] = tmp.s32[m + i];
      }
    }
  }
  check_equal_nd(&res, &expected, 16, "_mm512_mask_shuffle_f32x4", __LINE__);
}

void NOINLINE do_blendmpd() {
  V512 res;
  V512 expected;
  __mmask8 k = 0x4A;

  soft_update(in64_neg);
  res.zmmd = _mm512_mask_blend_pd(k, in64.zmmd, in64_neg.zmmd);
  CHECK_BLENDM(8, 8, s64, k, in64, in64_neg, "_mm512_mask_blend_pd");

  res.zmmd = _mm512_mask_mov_pd(in64.zmmd, k, in64_neg.zmmd);
  CHECK_BLENDM(8, 8, s64, k, in64, in64_neg, "_mm512_mask_mov_pd");
}

void NOINLINE do_blendmps() {
  V512 res;
  V512 expected;
  __mmask16 k = 0xA44A;

  res.zmm = _mm512_mask_blend_ps(k, in32.zmm, in32_neg.zmm);
  CHECK_BLENDM(16, 4, s32, k, in32, in32_neg, "_mm512_mask_blend_ps");

  res.zmm = _mm512_mask_mov_ps(in32.zmm, k, in32_neg.zmm);
  CHECK_BLENDM(16, 4, s32, k, in32, in32_neg, "_mm512_mask_mov_ps");
}

int main(int argc, char *argv[]) {
  init();

  do_shuf_f32x4();
  do_shuf_pd();
  do_shuf_ps();
  do_unpckdq_hi();
  do_unpckps_lo();
  do_unpckps_hi();
  do_unpckdq_lo();
  do_unpckqdq_lo();
  do_unpckpd_lo();
  do_unpckpd_hi();
  do_unpckqdq_hi();
  do_blendmd();
  do_blendmq();
  do_blendmpd();
  do_blendmps();

  if (n_errs != 0) {
    printf("FAILED\n");
    return 1;
  }

  printf("PASSED\n");
  return 0;
}