blob: 7961d85107a26571040670f7a6481aeac170fea6 [file] [log] [blame]
#ifndef SSE2NEON_H
#define SSE2NEON_H
// This header file provides a simple API translation layer
// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
//
// This header file does not yet translate all of the SSE intrinsics.
//
// Contributors to this work are:
// John W. Ratcliff <jratcliffscarab@gmail.com>
// Brandon Rowlett <browlett@nvidia.com>
// Ken Fast <kfast@gdeb.com>
// Eric van Beurden <evanbeurden@nvidia.com>
// Alexander Potylitsin <apotylitsin@nvidia.com>
// Hasindu Gamaarachchi <hasindu2008@gmail.com>
// Jim Huang <jserv@biilabs.io>
// Mark Cheng <marktwtn@biilabs.io>
// Malcolm James MacLeod <malcolm@gulden.com>
// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
// Sebastian Pop <spop@amazon.com>
// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
// Danila Kutenin <danilak@google.com>
// Fran├žois Turban (JishinMaster) <francois.turban@gmail.com>
// Pei-Hsuan Hung <afcidk@gmail.com>
// Yang-Hao Yuan <yanghau@biilabs.io>
// Syoyo Fujita <syoyo@lighttransport.com>
// Brecht Van Lommel <brecht@blender.org>
/*
* sse2neon is freely redistributable under the MIT License.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/* Tunable configurations */
/* Enable precise implementation of math operations
* This would slow down the computation a bit, but gives consistent result with
* x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result)
*/
/* _mm_min_ps and _mm_max_ps */
#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
/* _mm_rcp_ps and _mm_div_ps */
#ifndef SSE2NEON_PRECISE_DIV
#define SSE2NEON_PRECISE_DIV (0)
#endif
/* _mm_sqrt_ps and _mm_rsqrt_ps */
#ifndef SSE2NEON_PRECISE_SQRT
#define SSE2NEON_PRECISE_SQRT (0)
#endif
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
#else
#error "Macro name collisions may happen with unsupported compiler."
#ifdef FORCE_INLINE
#undef FORCE_INLINE
#endif
#define FORCE_INLINE static inline
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#endif
#ifndef likely
#define likely(x) (x)
#endif
#ifndef unlikely
#define unlikely(x) (x)
#endif
#include <stdint.h>
#include <stdlib.h>
/* Architecture-specific build options */
/* FIXME: #pragma GCC push_options is only available on GCC */
#if defined(__GNUC__)
#if defined(__arm__) && __ARM_ARCH == 7
/* According to ARM C Language Extensions Architecture specification,
* __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
* architecture supported.
*/
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("fpu=neon")
#endif
#elif defined(__aarch64__)
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("+simd")
#endif
#else
#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
#endif
#endif
#include <arm_neon.h>
/* Rounding functions require either Aarch64 instructions or libm failback */
#if !defined(__aarch64__)
#include <math.h>
#endif
/* "__has_builtin" can be used to query support for built-in functions
* provided by gcc/clang and other compilers that support it.
*/
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9 */
#if __GNUC__ <= 9
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1
#else
#define __has_builtin(x) 0
#endif
#endif
/**
* MACRO for shuffle parameter for _mm_shuffle_ps().
* Argument fp3 is a digit[0123] that represents the fp from argument "b"
* of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
* for fp2 in result. fp1 is a digit[0123] that represents the fp from
* argument "a" of mm_shuffle_ps that will be places in fp1 of result.
* fp0 is the same for fp0 of result.
*/
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
/* indicate immediate constant argument in a given range */
#define __constrange(a, b) const
/* A few intrinsics accept traditional data types like ints or floats, but
* most operate on data types that are specific to SSE.
* If a vector type ends in d, it contains doubles, and if it does not have
* a suffix, it contains floats. An integer vector type can contain any type
* of integer, from chars to shorts to unsigned long longs.
*/
typedef int64x1_t __m64;
typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
// On ARM 32-bit architecture, the float64x2_t is not supported.
// The data type __m128d should be represented in a different way for related
// intrinsic conversion.
#if defined(__aarch64__)
typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
#else
typedef float32x4_t __m128d;
#endif
typedef int64x2_t __m128i; /* 128-bit vector containing integers */
/* type-safe casting between types */
#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)
#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
#define vreinterpretq_s64_m128i(x) (x)
#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
#define vreinterpret_m64_s64(x) (x)
#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
#define vreinterpret_s64_m64(x) (x)
#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
#if defined(__aarch64__)
#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
#define vreinterpretq_m128d_f64(x) (x)
#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
#define vreinterpretq_f64_m128d(x) (x)
#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
#else
#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
#define vreinterpretq_m128d_f32(x) (x)
#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
#define vreinterpretq_f32_m128d(x) (x)
#endif
// A struct is defined in this header file called 'SIMDVec' which can be used
// by applications which attempt to access the contents of an _m128 struct
// directly. It is important to note that accessing the __m128 struct directly
// is bad coding practice by Microsoft: @see:
// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
//
// However, some legacy source code may try to access the contents of an __m128
// struct directly so the developer can use the SIMDVec as an alias for it. Any
// casting must be done manually by the developer, as you cannot cast or
// otherwise alias the base NEON data type for intrinsic operations.
//
// union intended to allow direct access to an __m128 variable using the names
// that the MSVC compiler provides. This union should really only be used when
// trying to access the members of the vector as integer values. GCC/clang
// allow native access to the float members through a simple array access
// operator (in C since 4.6, in C++ since 4.8).
//
// Ideally direct accesses to SIMD vectors should not be used since it can cause
// a performance hit. If it really is needed however, the original __m128
// variable can be aliased with a pointer to this union and used to access
// individual components. The use of this union should be hidden behind a macro
// that is used throughout the codebase to access the members instead of always
// declaring this type of variable.
typedef union ALIGN_STRUCT(16) SIMDVec {
float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
int8_t m128_i8[16]; // as signed 8-bit integers.
int16_t m128_i16[8]; // as signed 16-bit integers.
int32_t m128_i32[4]; // as signed 32-bit integers.
int64_t m128_i64[2]; // as signed 64-bit integers.
uint8_t m128_u8[16]; // as unsigned 8-bit integers.
uint16_t m128_u16[8]; // as unsigned 16-bit integers.
uint32_t m128_u32[4]; // as unsigned 32-bit integers.
uint64_t m128_u64[2]; // as unsigned 64-bit integers.
} SIMDVec;
// casting using SIMDVec
#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
// Function declaration
// SSE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
// SSE2
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
FORCE_INLINE __m128d _mm_set_pd(double, double);
// SSE4.1
FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
FORCE_INLINE __m128 _mm_ceil_ps(__m128);
FORCE_INLINE __m128d _mm_floor_pd(__m128d);
FORCE_INLINE __m128 _mm_floor_ps(__m128);
FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE __m128 _mm_round_ps(__m128, int);
// SSE4.2
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
/* Backwards compatibility for compilers with lack of specific type support */
// Older gcc does not define vld1q_u8_x4 type
#if defined(__GNUC__) && !defined(__clang__) && \
((__GNUC__ <= 10 && defined(__arm__)) || \
(__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
(__GNUC__ <= 9 && defined(__aarch64__)))
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
{
uint8x16x4_t ret;
ret.val[0] = vld1q_u8(p + 0);
ret.val[1] = vld1q_u8(p + 16);
ret.val[2] = vld1q_u8(p + 32);
ret.val[3] = vld1q_u8(p + 48);
return ret;
}
#else
// Wraps vld1q_u8_x4
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
{
return vld1q_u8_x4(p);
}
#endif
/* Function Naming Conventions
* The naming convention of SSE intrinsics is straightforward. A generic SSE
* intrinsic function is given as follows:
* _mm_<name>_<data_type>
*
* The parts of this format are given as follows:
* 1. <name> describes the operation performed by the intrinsic
* 2. <data_type> identifies the data type of the function's primary arguments
*
* This last part, <data_type>, is a little complicated. It identifies the
* content of the input values, and can be set to any of the following values:
* + ps - vectors contain floats (ps stands for packed single-precision)
* + pd - vectors cantain doubles (pd stands for packed double-precision)
* + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
* signed integers
* + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
* unsigned integers
* + si128 - unspecified 128-bit vector or 256-bit vector
* + m128/m128i/m128d - identifies input vector types when they are different
* than the type of the returned vector
*
* For example, _mm_setzero_ps. The _mm implies that the function returns
* a 128-bit vector. The _ps at the end implies that the argument vectors
* contain floats.
*
* A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
* // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
* __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
* // Set packed 8-bit integers
* // 128 bits, 16 chars, per 8 bits
* __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
* 4, 5, 12, 13, 6, 7, 14, 15);
* // Shuffle packed 8-bit integers
* __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
*
* Data (Number, Binary, Byte Index):
+------+------+-------------+------+------+-------------+
| 1 | 2 | 3 | 4 | Number
+------+------+------+------+------+------+------+------+
| 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
+------+------+------+------+------+------+------+------+
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
+------+------+------+------+------+------+------+------+
+------+------+------+------+------+------+------+------+
| 5 | 6 | 7 | 8 | Number
+------+------+------+------+------+------+------+------+
| 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
+------+------+------+------+------+------+------+------+
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
+------+------+------+------+------+------+------+------+
* Index (Byte Index):
+------+------+------+------+------+------+------+------+
| 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
+------+------+------+------+------+------+------+------+
+------+------+------+------+------+------+------+------+
| 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
+------+------+------+------+------+------+------+------+
* Result:
+------+------+------+------+------+------+------+------+
| 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
+------+------+------+------+------+------+------+------+
| 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
+------+------+------+------+------+------+------+------+
| 256 | 2 | 5 | 6 | Number
+------+------+------+------+------+------+------+------+
+------+------+------+------+------+------+------+------+
| 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
+------+------+------+------+------+------+------+------+
| 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
+------+------+------+------+------+------+------+------+
| 3 | 7 | 4 | 8 | Number
+------+------+------+------+------+------+-------------+
*/
/* Constants for use with _mm_prefetch. */
enum _mm_hint {
_MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
_MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
_MM_HINT_T1 = 2, /* load data to L2 cache only */
_MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
_MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
_MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
_MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
_MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
};
// The bit field mapping to the FPCR(floating-point control register)
typedef struct {
uint16_t res0;
uint8_t res1 : 6;
uint8_t bit22 : 1;
uint8_t bit23 : 1;
uint8_t res2;
#if defined(__aarch64__)
uint32_t res3;
#endif
} fpcr_bitfield;
// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of b and places it into the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
{
float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
}
// takes the lower two 32-bit values from a and swaps them and places in high
// end of result takes the higher two 32 bit values from b and swaps them and
// places in low end of result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
{
float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
}
FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
{
float32x2_t a21 = vget_high_f32(
vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
float32x2_t b03 = vget_low_f32(
vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
}
FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
{
float32x2_t a03 = vget_low_f32(
vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
float32x2_t b21 = vget_high_f32(
vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
}
FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
{
float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}
FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
{
float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
}
FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
{
float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
}
// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
// high
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
{
float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
}
FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
{
float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
}
FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
{
float32x2_t a22 =
vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
}
FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
{
float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
float32x2_t b22 =
vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
}
FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
{
float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
float32x2_t a22 =
vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
}
FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
{
float32x2_t a33 =
vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
}
FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
{
float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
float32x2_t b20 = vset_lane_f32(b2, b00, 1);
return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
}
FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
{
float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
float32_t b2 = vgetq_lane_f32(b, 2);
float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
float32x2_t b20 = vset_lane_f32(b2, b00, 1);
return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
}
FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
{
float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
float32_t b2 = vgetq_lane_f32(b, 2);
float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
float32x2_t b20 = vset_lane_f32(b2, b00, 1);
return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
}
// Kahan summation for accurate summation of floating-point numbers.
// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
{
y -= *c;
float t = *sum + y;
*c = (t - *sum) - y;
*sum = t;
}
#if defined(__ARM_FEATURE_CRYPTO)
// Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
return vreinterpretq_u64_p128(vmull_p64(a, b));
}
#else // ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
poly8x8_t a = vreinterpret_p8_u64(_a);
poly8x8_t b = vreinterpret_p8_u64(_b);
// Masks
uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
vcreate_u8(0x00000000ffffffff));
uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
vcreate_u8(0x0000000000000000));
// Do the multiplies, rotating with vext to get all combinations
uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
uint8x16_t e =
vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
uint8x16_t f =
vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
uint8x16_t g =
vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
uint8x16_t h =
vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
uint8x16_t i =
vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
uint8x16_t j =
vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
uint8x16_t k =
vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
// Add cross products
uint8x16_t l = veorq_u8(e, f); // L = E + F
uint8x16_t m = veorq_u8(g, h); // M = G + H
uint8x16_t n = veorq_u8(i, j); // N = I + J
// Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
// instructions.
#if defined(__aarch64__)
uint8x16_t lm_p0 = vreinterpretq_u8_u64(
vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
uint8x16_t lm_p1 = vreinterpretq_u8_u64(
vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
uint8x16_t nk_p0 = vreinterpretq_u8_u64(
vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
uint8x16_t nk_p1 = vreinterpretq_u8_u64(
vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
// t0 = (L) (P0 + P1) << 8
// t1 = (M) (P2 + P3) << 16
uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
// t2 = (N) (P4 + P5) << 24
// t3 = (K) (P6 + P7) << 32
uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
// De-interleave
#if defined(__aarch64__)
uint8x16_t t0 = vreinterpretq_u8_u64(
vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
uint8x16_t t1 = vreinterpretq_u8_u64(
vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
uint8x16_t t2 = vreinterpretq_u8_u64(
vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
uint8x16_t t3 = vreinterpretq_u8_u64(
vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
// Shift the cross products
uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
// Accumulate the products
uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
uint8x16_t mix = veorq_u8(d, cross1);
uint8x16_t r = veorq_u8(mix, cross2);
return vreinterpretq_u64_u8(r);
}
#endif // ARMv7 polyfill
// C equivalent:
// __m128i _mm_shuffle_epi32_default(__m128i a,
// __constrange(0, 255) int imm) {
// __m128i ret;
// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
// return ret;
// }
#define _mm_shuffle_epi32_default(a, imm) \
__extension__({ \
int32x4_t ret; \
ret = vmovq_n_s32( \
vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
ret = vsetq_lane_s32( \
vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
ret, 1); \
ret = vsetq_lane_s32( \
vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
ret, 2); \
ret = vsetq_lane_s32( \
vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
ret, 3); \
vreinterpretq_m128i_s32(ret); \
})
// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of a and places it into the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
{
int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
}
// takes the lower two 32-bit values from a and swaps them and places in low end
// of result takes the higher two 32 bit values from a and swaps them and places
// in high end of result.
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
{
int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
}
// rotates the least significant 32 bits into the most significant 32 bits, and
// shifts the rest down
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
{
return vreinterpretq_m128i_s32(
vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
}
// rotates the most significant 32 bits into the least significant 32 bits, and
// shifts the rest up
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
{
return vreinterpretq_m128i_s32(
vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
}
// gets the lower 64 bits of a, and places it in the upper 64 bits
// gets the lower 64 bits of a and places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
{
int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
}
// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
{
int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
}
// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
// places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
{
int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
}
FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
{
int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
}
FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
{
int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
}
FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
{
int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
}
// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
// int imm)
#if defined(__aarch64__)
#define _mm_shuffle_epi32_splat(a, imm) \
__extension__({ \
vreinterpretq_m128i_s32( \
vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
})
#else
#define _mm_shuffle_epi32_splat(a, imm) \
__extension__({ \
vreinterpretq_m128i_s32( \
vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
})
#endif
// NEON does not support a general purpose permute intrinsic
// Selects four specific single-precision, floating-point values from a and b,
// based on the mask i.
//
// C equivalent:
// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
// __constrange(0, 255) int imm) {
// __m128 ret;
// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
// return ret;
// }
//
// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
#define _mm_shuffle_ps_default(a, b, imm) \
__extension__({ \
float32x4_t ret; \
ret = vmovq_n_f32( \
vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
ret = vsetq_lane_f32( \
vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
ret, 1); \
ret = vsetq_lane_f32( \
vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
ret, 2); \
ret = vsetq_lane_f32( \
vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
ret, 3); \
vreinterpretq_m128_f32(ret); \
})
// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
// __constrange(0,255) int
// imm)
#define _mm_shufflelo_epi16_function(a, imm) \
__extension__({ \
int16x8_t ret = vreinterpretq_s16_m128i(a); \
int16x4_t lowBits = vget_low_s16(ret); \
ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1); \
ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
2); \
ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
3); \
vreinterpretq_m128i_s16(ret); \
})
// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
// by imm.
// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
// __constrange(0,255) int
// imm)
#define _mm_shufflehi_epi16_function(a, imm) \
__extension__({ \
int16x8_t ret = vreinterpretq_s16_m128i(a); \
int16x4_t highBits = vget_high_s16(ret); \
ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
5); \
ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
6); \
ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
7); \
vreinterpretq_m128i_s16(ret); \
})
/* SSE */
// Adds the four single-precision, floating-point values of a and b.
//
// r0 := a0 + b0
// r1 := a1 + b1
// r2 := a2 + b2
// r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_f32(
vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}
// adds the scalar single-precision floating point values of a and b.
// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
{
float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
// the upper values in the result must be the remnants of <a>.
return vreinterpretq_m128_f32(vaddq_f32(a, value));
}
// Computes the bitwise AND of the four single-precision, floating-point values
// of a and b.
//
// r0 := a0 & b0
// r1 := a1 & b1
// r2 := a2 & b2
// r3 := a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_s32(
vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}
// Computes the bitwise AND-NOT of the four single-precision, floating-point
// values of a and b.
//
// r0 := ~a0 & b0
// r1 := ~a1 & b1
// r2 := ~a2 & b2
// r3 := ~a3 & b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_s32(
vbicq_s32(vreinterpretq_s32_m128(b),
vreinterpretq_s32_m128(a))); // *NOTE* argument swap
}
// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
//
// FOR j := 0 to 3
// i := j*16
// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
{
return vreinterpret_m64_u16(
vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
}
// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
//
// FOR j := 0 to 7
// i := j*8
// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
{
return vreinterpret_m64_u8(
vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}
// Compares for equality.
// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_u32(
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}
// Compares for equality.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
{
return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
}
// Compares for greater than or equal.
// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_u32(
vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}
// Compares for greater than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
{
return _mm_move_ss(a, _mm_cmpge_ps(a, b));
}
// Compares for greater than.
//
// r0 := (a0 > b0) ? 0xffffffff : 0x0
// r1 := (a1 > b1) ? 0xffffffff : 0x0
// r2 := (a2 > b2) ? 0xffffffff : 0x0
// r3 := (a3 > b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_u32(
vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}
// Compares for greater than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
{
return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
}
// Compares for less than or equal.
//
// r0 := (a0 <= b0) ? 0xffffffff : 0x0
// r1 := (a1 <= b1) ? 0xffffffff : 0x0
// r2 := (a2 <= b2) ? 0xffffffff : 0x0
// r3 := (a3 <= b3) ? 0xffffffff : 0x0
//
// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_u32(
vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}
// Compares for less than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
{
return _mm_move_ss(a, _mm_cmple_ps(a, b));
}
// Compares for less than
// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_u32(
vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}
// Compares for less than
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
{
return _mm_move_ss(a, _mm_cmplt_ps(a, b));
}
// Compares for inequality.
// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_u32(vmvnq_u32(
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}
// Compares for inequality.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
{
return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
}
// Compares for not greater than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
{
return _mm_cmplt_ps(a, b);
}
// Compares for not greater than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
{
return _mm_cmplt_ss(a, b);
}
// Compares for not greater than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
{
return _mm_cmple_ps(a, b);
}
// Compares for not greater than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
{
return _mm_cmple_ss(a, b);
}
// Compares for not less than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
{
return _mm_cmpgt_ps(a, b);
}
// Compares for not less than or equal.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
{
return _mm_cmpgt_ss(a, b);
}
// Compares for not less than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
{
return _mm_cmpge_ps(a, b);
}
// Compares for not less than.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
{
return _mm_cmpge_ss(a, b);
}
// Compares the four 32-bit floats in a and b to check if any values are NaN.
// Ordered compare between each value returns true for "orderable" and false for
// "not orderable" (NaN).
// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
// also:
// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
{
// Note: NEON does not have ordered compare builtin
// Need to compare a eq a and b eq b to check for NaN
// Do AND of results to get final
uint32x4_t ceqaa =
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
uint32x4_t ceqbb =
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
}
// Compares for ordered.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
{
return _mm_move_ss(a, _mm_cmpord_ps(a, b));
}
// Compares for unordered.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
{
uint32x4_t f32a =
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
uint32x4_t f32b =
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
}
// Compares for unordered.
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
{
return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
}
// Compares the lower single-precision floating point scalar values of a and b
// using an equality operation. :
// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
{
// return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
// vreinterpretq_f32_m128(b)), 0);
uint32x4_t a_not_nan =
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
uint32x4_t b_not_nan =
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
uint32x4_t a_eq_b =
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) & 0x1;
}
// Compares the lower single-precision floating point scalar values of a and b
// using a greater than or equal operation. :
// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
{
// return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
// vreinterpretq_f32_m128(b)), 0);
uint32x4_t a_not_nan =
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
uint32x4_t b_not_nan =
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
uint32x4_t a_ge_b =
vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) & 0x1;
}
// Compares the lower single-precision floating point scalar values of a and b
// using a greater than operation. :
// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
{
// return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
// vreinterpretq_f32_m128(b)), 0);
uint32x4_t a_not_nan =
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
uint32x4_t b_not_nan =
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
uint32x4_t a_gt_b =
vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) & 0x1;
}
// Compares the lower single-precision floating point scalar values of a and b
// using a less than or equal operation. :
// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
{
// return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
// vreinterpretq_f32_m128(b)), 0);
uint32x4_t a_not_nan =
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
uint32x4_t b_not_nan =
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
uint32x4_t a_le_b =
vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) & 0x1;
}
// Compares the lower single-precision floating point scalar values of a and b
// using a less than operation. :
// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
// note!! The documentation on MSDN is incorrect! If either of the values is a
// NAN the docs say you will get a one, but in fact, it will return a zero!!
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
{
uint32x4_t a_not_nan =
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
uint32x4_t b_not_nan =
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
uint32x4_t a_lt_b =
vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) & 0x1;
}
// Compares the lower single-precision floating point scalar values of a and b
// using an inequality operation. :
// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
{
// return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
// vreinterpretq_f32_m128(b)), 0);
uint32x4_t a_not_nan =
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
uint32x4_t b_not_nan =
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
uint32x4_t a_neq_b = vmvnq_u32(
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
return vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) & 0x1;
}
// Convert packed signed 32-bit integers in b to packed single-precision
// (32-bit) floating-point elements, store the results in the lower 2 elements
// of dst, and copy the upper 2 packed elements from a to the upper elements of
// dst.
//
// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
// dst[95:64] := a[95:64]
// dst[127:96] := a[127:96]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
{
return vreinterpretq_m128_f32(
vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
vget_high_f32(vreinterpretq_f32_m128(a))));
}
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
//
// FOR j := 0 to 1
// i := 32*j
// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
{
#if defined(__aarch64__)
return vreinterpret_m64_s32(
vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
#else
return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
#endif
}
// Convert the signed 32-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
//
// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
// dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
{
return vreinterpretq_m128_f32(
vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
}
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
{
#if defined(__aarch64__)
return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
0);
#else
float32_t data = vgetq_lane_f32(
vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
return (int32_t) data;
#endif
}
// Convert packed 16-bit integers in a to packed single-precision (32-bit)
// floating-point elements, and store the results in dst.
//
// FOR j := 0 to 3
// i := j*16
// m := j*32
// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
{
return vreinterpretq_m128_f32(
vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
}
// Convert packed 32-bit integers in b to packed single-precision (32-bit)
// floating-point elements, store the results in the lower 2 elements of dst,
// and copy the upper 2 packed elements from a to the upper elements of dst.
//
// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
// dst[95:64] := a[95:64]
// dst[127:96] := a[127:96]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
{
return vreinterpretq_m128_f32(
vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
vget_high_f32(vreinterpretq_f32_m128(a))));
}
// Convert packed signed 32-bit integers in a to packed single-precision
// (32-bit) floating-point elements, store the results in the lower 2 elements
// of dst, then covert the packed signed 32-bit integers in b to
// single-precision (32-bit) floating-point element, and store the results in
// the upper 2 elements of dst.
//
// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
return vreinterpretq_m128_f32(vcvtq_f32_s32(
vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
}
// Convert the lower packed 8-bit integers in a to packed single-precision
// (32-bit) floating-point elements, and store the results in dst.
//
// FOR j := 0 to 3
// i := j*8
// m := j*32
// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
{
return vreinterpretq_m128_f32(vcvtq_f32_s32(
vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
}
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 16-bit integers, and store the results in dst. Note: this intrinsic
// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
// 0x7FFFFFFF.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
{
return vreinterpret_m64_s16(
vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
}
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
//
// FOR j := 0 to 1
// i := 32*j
// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
// Convert packed unsigned 16-bit integers in a to packed single-precision
// (32-bit) floating-point elements, and store the results in dst.
//
// FOR j := 0 to 3
// i := j*16
// m := j*32
// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
{
return vreinterpretq_m128_f32(
vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
}
// Convert the lower packed unsigned 8-bit integers in a to packed
// single-precision (32-bit) floating-point elements, and store the results in
// dst.
//
// FOR j := 0 to 3
// i := j*8
// m := j*32
// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
{
return vreinterpretq_m128_f32(vcvtq_f32_u32(
vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
}
// Convert the signed 32-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
//
// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
// dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
// Convert the signed 64-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
//
// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
// dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
{
return vreinterpretq_m128_f32(
vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
}
// Copy the lower single-precision (32-bit) floating-point element of a to dst.
//
// dst[31:0] := a[31:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
{
return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
//
// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
//
// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
{
#if defined(__aarch64__)
return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
#else
float32_t data = vgetq_lane_f32(
vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
return (int64_t) data;
#endif
}
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
//
// FOR j := 0 to 1
// i := 32*j
// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
{
return vreinterpret_m64_s32(
vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
}
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
//
// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
{
return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
}
// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
//
// FOR j := 0 to 1
// i := 32*j
// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
//
// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
//
// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
{
return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}
// Divides the four single-precision, floating-point values of a and b.
//
// r0 := a0 / b0
// r1 := a1 / b1
// r2 := a2 / b2
// r3 := a3 / b3
//
// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
return vreinterpretq_m128_f32(
vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
#if SSE2NEON_PRECISE_DIV
// Additional Netwon-Raphson iteration for accuracy
recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
#endif
return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
#endif
}
// Divides the scalar single-precision floating point value of a by b.
// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
{
float32_t value =
vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
return vreinterpretq_m128_f32(
vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}
// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
#define _mm_extract_pi16(a, imm) \
(int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
// Free aligned memory that was allocated with _mm_malloc.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
FORCE_INLINE void _mm_free(void *addr)
{
free(addr);
}
// Macro: Get the rounding mode bits from the MXCSR control and status register.
// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
{
union {
fpcr_bitfield field;
#if defined(__aarch64__)
uint64_t value;
#else
uint32_t value;
#endif
} r;
#if defined(__aarch64__)
asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif
if (r.field.bit22) {
return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
} else {
return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
}
}
// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
#define _mm_insert_pi16(a, b, imm) \
__extension__({ \
vreinterpret_m64_s16( \
vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
})
// Loads four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load_ps(const float *p)
{
return vreinterpretq_m128_f32(vld1q_f32(p));
}
// Load a single-precision (32-bit) floating-point element from memory into all
// elements of dst.
//
// dst[31:0] := MEM[mem_addr+31:mem_addr]
// dst[63:32] := MEM[mem_addr+31:mem_addr]
// dst[95:64] := MEM[mem_addr+31:mem_addr]
// dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
#define _mm_load_ps1 _mm_load1_ps
// Loads an single - precision, floating - point value into the low word and
// clears the upper three words.
// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_load_ss(const float *p)
{
return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
}
// Loads a single single-precision, floating-point value, copying it into all
// four words
// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
{
return vreinterpretq_m128_f32(vld1q_dup_f32(p));
}
// Sets the upper two single-precision, floating-point values with 64
// bits of data loaded from the address p; the lower two values are passed
// through from a.
//
// r0 := a0
// r1 := a1
// r2 := *p0
// r3 := *p1
//
// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
{
return vreinterpretq_m128_f32(
vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
}
// Sets the lower two single-precision, floating-point values with 64
// bits of data loaded from the address p; the upper two values are passed
// through from a.
//
// Return Value
// r0 := *p0
// r1 := *p1
// r2 := a2
// r3 := a3
//
// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
{
return vreinterpretq_m128_f32(
vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
}
// Load 4 single-precision (32-bit) floating-point elements from memory into dst
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
//
// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
// dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
{
float32x4_t v = vrev64q_f32(vld1q_f32(p));
return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
}
// Loads four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
{
// for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
// equivalent for neon
return vreinterpretq_m128_f32(vld1q_f32(p));
}
// Load unaligned 16-bit integer from memory into the first element of dst.
//
// dst[15:0] := MEM[mem_addr+15:mem_addr]
// dst[MAX:16] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
{
return vreinterpretq_m128i_s16(
vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
}
// Load unaligned 64-bit integer from memory into the first element of dst.
//
// dst[63:0] := MEM[mem_addr+63:mem_addr]
// dst[MAX:64] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
{
return vreinterpretq_m128i_s64(
vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
}
// Allocate aligned blocks of memory.
// https://software.intel.com/en-us/
// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
{
void *ptr;
if (align == 1)
return malloc(size);
if (align == 2 || (sizeof(void *) == 8 && align == 4))
align = sizeof(void *);
if (!posix_memalign(&ptr, align, size))
return ptr;
return NULL;
}
// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
{
int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
__m128 b = _mm_load_ps((const float *) mem_addr);
int8x8_t masked =
vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
vst1_s8((int8_t *) mem_addr, masked);
}
// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst.
//
// FOR j := 0 to 3
// i := j*16
// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
{
return vreinterpret_m64_s16(
vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
}
// Computes the maximums of the four single-precision, floating-point values of
// a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
{
#if SSE2NEON_PRECISE_MINMAX
float32x4_t _a = vreinterpretq_f32_m128(a);
float32x4_t _b = vreinterpretq_f32_m128(b);
return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
#else
return vreinterpretq_m128_f32(
vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#endif
}
// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst.
//
// FOR j := 0 to 7
// i := j*8
// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
{
return vreinterpret_m64_u8(
vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}
// Computes the maximum of the two lower scalar single-precision floating point
// values of a and b.
// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
{
float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
return vreinterpretq_m128_f32(
vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}
// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst.
//
// FOR j := 0 to 3
// i := j*16
// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
{
return vreinterpret_m64_s16(
vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
}
// Computes the minima of the four single-precision, floating-point values of a
// and b.
// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
{
#if SSE2NEON_PRECISE_MINMAX
float32x4_t _a = vreinterpretq_f32_m128(a);
float32x4_t _b = vreinterpretq_f32_m128(b);
return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
#else
return vreinterpretq_m128_f32(
vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#endif
}
// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
//
// FOR j := 0 to 7
// i := j*8
// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
{
return vreinterpret_m64_u8(
vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}
// Computes the minimum of the two lower scalar single-precision floating point
// values of a and b.
// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
{
float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
return vreinterpretq_m128_f32(
vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}
// Sets the low word to the single-precision, floating-point value of b
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
{
return vreinterpretq_m128_f32(
vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
vreinterpretq_f32_m128(a), 0));
}
// Moves the upper two values of B into the lower two values of A.
//
// r3 := a3
// r2 := a2
// r1 := b3
// r0 := b2
FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
{
float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
}
// Moves the lower two values of B into the upper two values of A.
//
// r3 := b1
// r2 := b0
// r1 := a1
// r0 := a0
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
{
float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}
// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
{
uint8x8_t input = vreinterpret_u8_m64(a);
#if defined(__aarch64__)
static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
uint8x8_t tmp = vshr_n_u8(input, 7);
return vaddv_u8(vshl_u8(tmp, shift));
#else
// Refer the implementation of `_mm_movemask_epi8`
uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
uint32x2_t paired16 =
vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
uint8x8_t paired32 =
vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
#endif
}
// NEON does not provide this method
// Creates a 4-bit mask from the most significant bits of the four
// single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_ps(__m128 a)
{
uint32x4_t input = vreinterpretq_u32_m128(a);
#if defined(__aarch64__)
static const int32x4_t shift = {0, 1, 2, 3};
uint32x4_t tmp = vshrq_n_u32(input, 31);
return vaddvq_u32(vshlq_u32(tmp, shift));
#else
// Uses the exact same method as _mm_movemask_epi8, see that for details.
// Shift out everything but the sign bits with a 32-bit unsigned shift
// right.
uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
// Merge the two pairs together with a 64-bit unsigned shift right + add.
uint8x16_t paired =
vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
// Extract the result.
return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
#endif
}
// Multiplies the four single-precision, floating-point values of a and b.
//
// r0 := a0 * b0
// r1 := a1 * b1
// r2 := a2 * b2
// r3 := a3 * b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_f32(
vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}
// Multiply the lower single-precision (32-bit) floating-point element in a and
// b, store the result in the lower element of dst, and copy the upper 3 packed
// elements from a to the upper elements of dst.
//
// dst[31:0] := a[31:0] * b[31:0]
// dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
{
return _mm_move_ss(a, _mm_mul_ps(a, b));
}
// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
{
return vreinterpret_m64_u16(vshrn_n_u32(
vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
}
// Computes the bitwise OR of the four single-precision, floating-point values
// of a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_s32(
vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}
// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
//
// FOR j := 0 to 7
// i := j*8
// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
//
// FOR j := 0 to 3
// i := j*16
// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
// ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
#define _m_pminsw(a, b) _mm_min_pi16(a, b)
// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
#define _m_pminub(a, b) _mm_min_pu8(a, b)
// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
#define _m_pmovmskb(a) _mm_movemask_pi8(a)
// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
// Loads one cache line of data from address p to a location closer to the
// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
FORCE_INLINE void _mm_prefetch(const void *p, int i)
{
(void) i;
__builtin_prefetch(p);
}
// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
// Compute the approximate reciprocal of packed single-precision (32-bit)
// floating-point elements in a, and store the results in dst. The maximum
// relative error for this approximation is less than 1.5*2^-12.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
{
float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#if SSE2NEON_PRECISE_DIV
// Additional Netwon-Raphson iteration for accuracy
recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#endif
return vreinterpretq_m128_f32(recip);
}
// Compute the approximate reciprocal of the lower single-precision (32-bit)
// floating-point element in a, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a to the upper elements of dst. The
// maximum relative error for this approximation is less than 1.5*2^-12.
//
// dst[31:0] := (1.0 / a[31:0])
// dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
{
return _mm_move_ss(a, _mm_rcp_ps(a));
}
// Computes the approximations of the reciprocal square roots of the four
// single-precision floating point values of in.
// The current precision is 1% error.
// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
{
float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
#if SSE2NEON_PRECISE_SQRT
// Additional Netwon-Raphson iteration for accuracy
out = vmulq_f32(
out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
out = vmulq_f32(
out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
#endif
return vreinterpretq_m128_f32(out);
}
// Compute the approximate reciprocal square root of the lower single-precision
// (32-bit) floating-point element in a, store the result in the lower element
// of dst, and copy the upper 3 packed elements from a to the upper elements of
// dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
{
return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
}
// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
{
uint64x1_t t = vpaddl_u32(vpaddl_u16(
vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
return vreinterpret_m64_u16(
vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
}
// Sets the four single-precision, floating-point values to the four inputs.
// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
{
float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
return vreinterpretq_m128_f32(vld1q_f32(data));
}
// Sets the four single-precision, floating-point values to w.
// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps1(float _w)
{
return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}
// Macro: Set the rounding mode bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. The rounding mode may contain any of
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// _MM_ROUND_TOWARD_ZERO
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
union {
fpcr_bitfield field;
#if defined(__aarch64__)
uint64_t value;
#else
uint32_t value;
#endif
} r;
#if defined(__aarch64__)
asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif
switch (rounding) {
case _MM_ROUND_TOWARD_ZERO:
r.field.bit22 = 1;
r.field.bit23 = 1;
break;
case _MM_ROUND_DOWN:
r.field.bit22 = 0;
r.field.bit23 = 1;
break;
case _MM_ROUND_UP:
r.field.bit22 = 1;
r.field.bit23 = 0;
break;
default: //_MM_ROUND_NEAREST
r.field.bit22 = 0;
r.field.bit23 = 0;
}
#if defined(__aarch64__)
asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
#else
asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}
// Copy single-precision (32-bit) floating-point element a to the lower element
// of dst, and zero the upper 3 elements.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
FORCE_INLINE __m128 _mm_set_ss(float a)
{
float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
return vreinterpretq_m128_f32(vld1q_f32(data));
}
// Sets the four single-precision, floating-point values to w.
//
// r0 := r1 := r2 := r3 := w
//
// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set1_ps(float _w)
{
return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}
FORCE_INLINE void _mm_setcsr(unsigned int a)
{
_MM_SET_ROUNDING_MODE(a);
}
// Sets the four single-precision, floating-point values to the four inputs in
// reverse order.
// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
{
float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
return vreinterpretq_m128_f32(vld1q_f32(data));
}
// Clears the four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setzero_ps(void)
{
return vreinterpretq_m128_f32(vdupq_n_f32(0));
}
// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
#if __has_builtin(__builtin_shufflevector)
#define _mm_shuffle_pi16(a, imm) \
__extension__({ \
vreinterpret_m64_s16(__builtin_shufflevector( \
vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \
})
#else
#define _mm_shuffle_pi16(a, imm) \
__extension__({ \
int16x4_t ret; \
ret = \
vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
ret = vset_lane_s16( \
vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
1); \
ret = vset_lane_s16( \
vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
2); \
ret = vset_lane_s16( \
vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
3); \
vreinterpret_m64_s16(ret); \
})
#endif
// Guarantees that every preceding store is globally visible before any
// subsequent store.
// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
FORCE_INLINE void _mm_sfence(void)
{
__sync_synchronize();
}
// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
// int imm)
#if __has_builtin(__builtin_shufflevector)
#define _mm_shuffle_ps(a, b, imm) \
__extension__({ \
float32x4_t _input1 = vreinterpretq_f32_m128(a); \
float32x4_t _input2 = vreinterpretq_f32_m128(b); \
float32x4_t _shuf = __builtin_shufflevector( \
_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
(((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
vreinterpretq_m128_f32(_shuf); \
})
#else // generic
#define _mm_shuffle_ps(a, b, imm) \
__extension__({ \
__m128 ret; \
switch (imm) { \
case _MM_SHUFFLE(1, 0, 3, 2): \
ret = _mm_shuffle_ps_1032((a), (b)); \
break; \
case _MM_SHUFFLE(2, 3, 0, 1): \
ret = _mm_shuffle_ps_2301((a), (b)); \
break; \
case _MM_SHUFFLE(0, 3, 2, 1): \
ret = _mm_shuffle_ps_0321((a), (b)); \
break; \
case _MM_SHUFFLE(2, 1, 0, 3): \
ret = _mm_shuffle_ps_2103((a), (b)); \
break; \
case _MM_SHUFFLE(1, 0, 1, 0): \
ret = _mm_movelh_ps((a), (b)); \
break; \
case _MM_SHUFFLE(1, 0, 0, 1): \
ret = _mm_shuffle_ps_1001((a), (b)); \
break; \
case _MM_SHUFFLE(0, 1, 0, 1): \
ret = _mm_shuffle_ps_0101((a), (b)); \
break; \
case _MM_SHUFFLE(3, 2, 1, 0): \
ret = _mm_shuffle_ps_3210((a), (b)); \
break; \
case _MM_SHUFFLE(0, 0, 1, 1): \
ret = _mm_shuffle_ps_0011((a), (b)); \
break; \
case _MM_SHUFFLE(0, 0, 2, 2): \
ret = _mm_shuffle_ps_0022((a), (b)); \
break; \
case _MM_SHUFFLE(2, 2, 0, 0): \
ret = _mm_shuffle_ps_2200((a), (b)); \
break; \
case _MM_SHUFFLE(3, 2, 0, 2): \
ret = _mm_shuffle_ps_3202((a), (b)); \
break; \
case _MM_SHUFFLE(3, 2, 3, 2): \
ret = _mm_movehl_ps((b), (a)); \
break; \
case _MM_SHUFFLE(1, 1, 3, 3): \
ret = _mm_shuffle_ps_1133((a), (b)); \
break; \
case _MM_SHUFFLE(2, 0, 1, 0): \
ret = _mm_shuffle_ps_2010((a), (b)); \
break; \
case _MM_SHUFFLE(2, 0, 0, 1): \
ret = _mm_shuffle_ps_2001((a), (b)); \
break; \
case _MM_SHUFFLE(2, 0, 3, 2): \
ret = _mm_shuffle_ps_2032((a), (b)); \
break; \
default: \
ret = _mm_shuffle_ps_default((a), (b), (imm)); \
break; \
} \
ret; \
})
#endif
// Computes the approximations of square roots of the four single-precision,
// floating-point values of a. First computes reciprocal square roots and then
// reciprocals of the four values.
//
// r0 := sqrt(a0)
// r1 := sqrt(a1)
// r2 := sqrt(a2)
// r3 := sqrt(a3)
//
// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
{
#if SSE2NEON_PRECISE_SQRT
float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
// Test for vrsqrteq_f32(0) -> positive infinity case.
// Change to zero, so that s * 1/sqrt(s) result is zero too.
const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
const uint32x4_t div_by_zero =
vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
recip = vreinterpretq_f32_u32(
vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
// Additional Netwon-Raphson iteration for accuracy
recip = vmulq_f32(
vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
recip);
recip = vmulq_f32(
vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
recip);
// sqrt(s) = s * 1/sqrt(s)
return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
#elif defined(__aarch64__)
return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
#else
float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
float32x4_t sq = vrecpeq_f32(recipsq);
return vreinterpretq_m128_f32(sq);
#endif
}
// Computes the approximation of the square root of the scalar single-precision
// floating point value of in.
// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
{
float32_t value =
vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
return vreinterpretq_m128_f32(
vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
}
// Stores four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
{
vst1q_f32(p, vreinterpretq_f32_m128(a));
}
// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
//
// MEM[mem_addr+31:mem_addr] := a[31:0]
// MEM[mem_addr+63:mem_addr+32] := a[31:0]
// MEM[mem_addr+95:mem_addr+64] := a[31:0]
// MEM[mem_addr+127:mem_addr+96] := a[31:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
{
float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
vst1q_f32(p, vdupq_n_f32(a0));
}
// Stores the lower single - precision, floating - point value.
// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
{
vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
}
// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
//
// MEM[mem_addr+31:mem_addr] := a[31:0]
// MEM[mem_addr+63:mem_addr+32] := a[31:0]
// MEM[mem_addr+95:mem_addr+64] := a[31:0]
// MEM[mem_addr+127:mem_addr+96] := a[31:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
#define _mm_store1_ps _mm_store_ps1
// Stores the upper two single-precision, floating-point values of a to the
// address p.
//
// *p0 := a2
// *p1 := a3
//
// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
{
*p = vreinterpret_m64_f32(vget_high_f32(a));
}
// Stores the lower two single-precision floating point values of a to the
// address p.
//
// *p0 := a0
// *p1 := a1
//
// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
{
*p = vreinterpret_m64_f32(vget_low_f32(a));
}
// Store 4 single-precision (32-bit) floating-point elements from a into memory
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
//
// MEM[mem_addr+31:mem_addr] := a[127:96]
// MEM[mem_addr+63:mem_addr+32] := a[95:64]
// MEM[mem_addr+95:mem_addr+64] := a[63:32]
// MEM[mem_addr+127:mem_addr+96] := a[31:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
{
float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
float32x4_t rev = vextq_f32(tmp, tmp, 2);
vst1q_f32(p, rev);
}
// Stores four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
{
vst1q_f32(p, vreinterpretq_f32_m128(a));
}
// Stores 16-bits of integer data a at the address p.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
{
vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
}
// Stores 64-bits of integer data a at the address p.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
{
vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
}
// Store 64-bits of integer data from a into memory using a non-temporal memory
// hint.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
{
vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
}
// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
// point elements) from a into memory using a non-temporal memory hint.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
{
#if __has_builtin(__builtin_nontemporal_store)
__builtin_nontemporal_store(a, (float32x4_t *) p);
#else
vst1q_f32(p, vreinterpretq_f32_m128(a));
#endif
}
// Subtracts the four single-precision, floating-point values of a and b.
//
// r0 := a0 - b0
// r1 := a1 - b1
// r2 := a2 - b2
// r3 := a3 - b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_f32(
vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}
// Subtract the lower single-precision (32-bit) floating-point element in b from
// the lower single-precision (32-bit) floating-point element in a, store the
// result in the lower element of dst, and copy the upper 3 packed elements from
// a to the upper elements of dst.
//
// dst[31:0] := a[31:0] - b[31:0]
// dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
{
return _mm_move_ss(a, _mm_sub_ps(a, b));
}
// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
// transposed matrix in these vectors (row0 now contains column 0, etc.).
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
vget_low_f32(ROW23.val[0])); \
row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
vget_low_f32(ROW23.val[1])); \
row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
vget_high_f32(ROW23.val[0])); \
row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
vget_high_f32(ROW23.val[1])); \
} while (0)
// according to the documentation, these intrinsics behave the same as the
// non-'u' versions. We'll just alias them here.
#define _mm_ucomieq_ss _mm_comieq_ss
#define _mm_ucomige_ss _mm_comige_ss
#define _mm_ucomigt_ss _mm_comigt_ss
#define _mm_ucomile_ss _mm_comile_ss
#define _mm_ucomilt_ss _mm_comilt_ss
#define _mm_ucomineq_ss _mm_comineq_ss
// Return vector of type __m128i with undefined elements.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
FORCE_INLINE __m128i _mm_undefined_si128(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
__m128i a;
return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
// Return vector of type __m128 with undefined elements.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
__m128 a;
return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
// Selects and interleaves the upper two single-precision, floating-point values
// from a and b.
//
// r0 := a2
// r1 := b2
// r2 := a3
// r3 := b3
//
// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__)
return vreinterpretq_m128_f32(
vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
float32x2x2_t result = vzip_f32(a1, b1);
return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}
// Selects and interleaves the lower two single-precision, floating-point values
// from a and b.
//
// r0 := a0
// r1 := b0
// r2 := a1
// r3 := b1
//
// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__)
return vreinterpretq_m128_f32(
vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
float32x2x2_t result = vzip_f32(a1, b1);
return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}
// Computes bitwise EXOR (exclusive-or) of the four single-precision,
// floating-point values of a and b.
// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
{
return vreinterpretq_m128_s32(
veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}
/* SSE2 */
// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
// unsigned 16-bit integers in b.
// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
{
return vreinterpretq_m128i_s16(
vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}
// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
// unsigned 32-bit integers in b.
//
// r0 := a0 + b0
// r1 := a1 + b1
// r2 := a2 + b2
// r3 := a3 + b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
{
return vreinterpretq_m128i_s32(
vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}
// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or
// unsigned 32-bit integers in b.
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
{
return vreinterpretq_m128i_s64(
vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}
// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
// unsigned 8-bit integers in b.
// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
{
return vreinterpretq_m128i_s8(
vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}
// Add packed double-precision (64-bit) floating-point elements in a and b, and
// store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
return vreinterpretq_m128d_f64(
vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
double *da = (double *) &a;
double *db = (double *) &b;
double c[2];
c[0] = da[0] + db[0];
c[1] = da[1] + db[1];
return vld1q_f32((float32_t *) c);
#endif
}
// Add the lower double-precision (64-bit) floating-point element in a and b,
// store the result in the lower element of dst, and copy the upper element from
// a to the upper element of dst.
//
// dst[63:0] := a[63:0] + b[63:0]
// dst[127:64] := a[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
return _mm_move_sd(a, _mm_add_pd(a, b));
#else
double *da = (double *) &a;
double *db = (double *) &b;
double c[2];
c[0] = da[0] + db[0];
c[1] = da[1];
return vld1q_f32((float32_t *) c);
#endif
}
// Add 64-bit integers a and b, and store the result in dst.
//
// dst[63:0] := a[63:0] + b[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
{