commit | 67ea94281d8726619e3bc95e94ec57cc8d61efb7 | [log] [tgz] |
---|---|---|
author | Victoria <niva213@gmail.com> | Fri Sep 18 18:18:34 2020 +0300 |
committer | Zhislina <Victoria.Zhislina@intel.com> | Fri Sep 18 19:17:23 2020 +0300 |
tree | d201cb67d407d38d916d238fdccbdb6c8e4c871e | |
parent | 42b2bebacee25452e150095ef4480b3fa26e30f5 [diff] |
fix: round position for vr{add,sub}hn_{s,u}{16,32,64} (#47) ut: int64_t r[12]; int8x8_t a0 = vraddhn_s16(vdupq_n_s16(1UL<< 7), vdupq_n_s16(0)); r[ 0] = vget_lane_s8 (a0, 0); int16x4_t a1 = vraddhn_s32(vdupq_n_s32(1UL<<15), vdupq_n_s32(0)); r[ 1] = vget_lane_s16(a1, 0); int32x2_t a2 = vraddhn_s64(vdupq_n_s64(1UL<<31), vdupq_n_s64(0)); r[ 2] = vget_lane_s32(a2, 0); uint8x8_t a3 = vraddhn_u16(vdupq_n_u16(1UL<< 7), vdupq_n_u16(0)); r[ 3] = vget_lane_u8 (a3, 0); uint16x4_t a4 = vraddhn_u32(vdupq_n_u32(1UL<<15), vdupq_n_u32(0)); r[ 4] = vget_lane_u16(a4, 0); uint32x2_t a5 = vraddhn_u64(vdupq_n_u64(1UL<<31), vdupq_n_u64(0)); r[ 5] = vget_lane_u32(a5, 0); int8x8_t s0 = vrsubhn_s16(vdupq_n_s16(1UL<< 7), vdupq_n_s16(0)); r[ 6] = vget_lane_s8 (s0, 0); int16x4_t s1 = vrsubhn_s32(vdupq_n_s32(1UL<<15), vdupq_n_s32(0)); r[ 7] = vget_lane_s16(s1, 0); int32x2_t s2 = vrsubhn_s64(vdupq_n_s64(1UL<<31), vdupq_n_s64(0)); r[ 8] = vget_lane_s32(s2, 0); uint8x8_t s3 = vrsubhn_u16(vdupq_n_u16(1UL<< 7), vdupq_n_u16(0)); r[ 9] = vget_lane_u8 (s3, 0); uint16x4_t s4 = vrsubhn_u32(vdupq_n_u32(1UL<<15), vdupq_n_u32(0)); r[10] = vget_lane_u16(s4, 0); uint32x2_t s5 = vrsubhn_u64(vdupq_n_u64(1UL<<31), vdupq_n_u64(0)); r[11] = vget_lane_u32(s5, 0); for(unsigned i = 0; i < 12; i++) { assert(r[i] == 1); } Co-authored-by: Dima <yudind@gmail.com>
The NEON_2_SSE.h file is intended to simplify ARM->IA32 porting. It makes the correspondence (or a real porting) of ARM NEON intrinsics as defined in “arm_neon.h” header and x86 SSE (up to SSE4.2) intrinsic functions as defined in corresponding x86 compilers headers files.
To take advantage of this file just include it in your project that uses ARM NEON intinsics instead of “arm_neon.h”, compile it as usual and enjoy the result.
For significant performance improvement in some cases you might need to define USE_SSE4 in your project settings. Otherwise SIMD up to SSSE3 to be used.
If NEON2SSE_DISABLE_PERFORMANCE_WARNING macro is defined, then the performance warnings are disabled.
For more information and license please read the NEON_2_SSE.h content.
The unit tests set used for ARM NEON - x86 SSE conformance verification is https://github.com/christophe-lyon/arm-neon-tests