/* Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2017      Evan Nemerson <evan@nemerson.com>
 *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
 *   2015      Brandon Rowlett <browlett@nvidia.com>
 *   2015      Ken Fast <kfast@gdeb.com>
 *   2017      Hasindu Gamaarachchi <hasindu@unsw.edu.au>
 *   2018      Jeff Daily <jeff.daily@amd.com>
 */

#if !defined(SIMDE__SSE2_H)
#if !defined(SIMDE__SSE2_H)
#define SIMDE__SSE2_H
#endif
#include "sse.h"

#if defined(SIMDE_SSE2_NATIVE)
#undef SIMDE_SSE2_NATIVE
#endif
#if defined(SIMDE_SSE2_FORCE_NATIVE)
#define SIMDE_SSE2_NATIVE
#elif defined(__SSE2__) && !defined(SIMDE_SSE2_NO_NATIVE) && \
	!defined(SIMDE_NO_NATIVE)
#define SIMDE_SSE2_NATIVE
#elif defined(__ARM_NEON) && !defined(SIMDE_SSE2_NO_NEON) && \
	!defined(SIMDE_NO_NEON)
#define SIMDE_SSE2_NEON
#endif

#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_SSE_NATIVE)
#if defined(SIMDE_SSE2_FORCE_NATIVE)
#error Native SSE2 support requires native SSE support
#else
#warning Native SSE2 support requires native SSE support, disabling
#undef SIMDE_SSE2_NATIVE
#endif
#elif defined(SIMDE_SSE2_NEON) && !defined(SIMDE_SSE_NEON)
#warning SSE2 NEON support requires SSE NEON support, disabling
#undef SIMDE_SSE2_NEON
#endif

#if defined(SIMDE_SSE2_NATIVE)
#include <emmintrin.h>
#else
#if defined(SIMDE_SSE2_NEON)
#include <arm_neon.h>
#endif
#endif

#include <stdint.h>
#include <limits.h>
#include <string.h>

#define vreinterpretq_m128i_s32(v) \
	(simde__m128i) { .neon_i32 = v }
#define vreinterpretq_m128i_u64(v) \
	(simde__m128i) { .neon_u64 = v }

#define vreinterpretq_s32_m128i(a) a.neon_i32
#define vreinterpretq_u64_m128i(a) a.neon_u64

SIMDE__BEGIN_DECLS

typedef SIMDE_ALIGN(16) union {
#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
	int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
	int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
	int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
	int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
	uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
	uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
	uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
	uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
#if defined(SIMDE__HAVE_INT128)
	simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__));
	simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__));
#endif
	simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
	simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
#else
	int8_t i8[16];
	int16_t i16[8];
	int32_t i32[4];
	int64_t i64[2];
	uint8_t u8[16];
	uint16_t u16[8];
	uint32_t u32[4];
	uint64_t u64[2];
#if defined(SIMDE__HAVE_INT128)
	simde_int128 i128[1];
	simde_uint128 u128[1];
#endif
	simde_float32 f32[4];
	simde_float64 f64[2];
#endif

#if defined(SIMDE_SSE2_NATIVE)
	__m128i n;
#elif defined(SIMDE_SSE2_NEON)
	int8x16_t neon_i8;
	int16x8_t neon_i16;
	int32x4_t neon_i32;
	int64x2_t neon_i64;
	uint8x16_t neon_u8;
	uint16x8_t neon_u16;
	uint32x4_t neon_u32;
	uint64x2_t neon_u64;
	float32x4_t neon_f32;
#if defined(SIMDE_ARCH_AMD64)
	float64x2_t neon_f64;
#endif
#endif
} simde__m128i;

typedef SIMDE_ALIGN(16) union {
#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
	int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
	int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
	int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
	int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
	uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
	uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
	uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
	uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
	simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
	simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
#else
	int8_t i8[16];
	int16_t i16[8];
	int32_t i32[4];
	int64_t i64[2];
	uint8_t u8[16];
	uint16_t u16[8];
	uint32_t u32[4];
	uint64_t u64[2];
	simde_float32 f32[4];
	simde_float64 f64[2];
#endif

#if defined(SIMDE_SSE2_NATIVE)
	__m128d n;
#elif defined(SIMDE_SSE2_NEON)
	int8x16_t neon_i8;
	int16x8_t neon_i16;
	int32x4_t neon_i32;
	int64x2_t neon_i64;
	uint8x16_t neon_u8;
	uint16x8_t neon_u16;
	uint32x4_t neon_u32;
	uint64x2_t neon_u64;
	float32x4_t neon_f32;
#if defined(SIMDE_ARCH_AMD64)
	float64x2_t neon_f64;
#endif
#endif
} simde__m128d;

#if defined(SIMDE_SSE2_NATIVE)
HEDLEY_STATIC_ASSERT(sizeof(__m128i) == sizeof(simde__m128i),
		     "__m128i size doesn't match simde__m128i size");
HEDLEY_STATIC_ASSERT(sizeof(__m128d) == sizeof(simde__m128d),
		     "__m128d size doesn't match simde__m128d size");
SIMDE__FUNCTION_ATTRIBUTES simde__m128i SIMDE__M128I_C(__m128i v)
{
	simde__m128i r;
	r.n = v;
	return r;
}
SIMDE__FUNCTION_ATTRIBUTES simde__m128d SIMDE__M128D_C(__m128d v)
{
	simde__m128d r;
	r.n = v;
	return r;
}
#elif defined(SIMDE_SSE2_NEON)
#define SIMDE__M128I_NEON_C(T, expr) \
	(simde__m128i) { .neon_##T = expr }
#define SIMDE__M128D_NEON_C(T, expr) \
	(simde__m128d) { .neon_##T = expr }
#endif
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");

SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_add_epi8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
	return SIMDE__M128I_C(_mm_add_epi8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
	return SIMDE__M128I_NEON_C(i8, vaddq_s8(a.neon_i8, b.neon_i8));
#else
	simde__m128i r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
		r.i8[i] = a.i8[i] + b.i8[i];
	}
	return r;
#endif
}
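/* Every wrapper in this file follows the same three-way pattern seen above:
 * use the native SSE2 intrinsic when SIMDE_SSE2_NATIVE is defined, fall back
 * to a NEON equivalent when SIMDE_SSE2_NEON is defined, and otherwise emulate
 * the operation with a portable per-lane loop over the union members.
 * Illustrative usage (simde_mm_set1_epi8 is provided elsewhere in SIMDe; the
 * values are hypothetical):
 *
 *   simde__m128i a = simde_mm_set1_epi8(100);
 *   simde__m128i b = simde_mm_set1_epi8(30);
 *   simde__m128i sum = simde_mm_add_epi8(a, b); // each lane wraps to -126
 */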
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_add_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_add_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i16, vaddq_s16(a.neon_i16, b.neon_i16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = a.i16[i] + b.i16[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_add_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_add_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, vaddq_s32(a.neon_i32, b.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i] + b.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_add_epi64(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_add_epi64(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i64, vaddq_s64(a.neon_i64, b.neon_i64));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] + b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_add_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_add_pd(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON) && defined(SIMDE_ARCH_AMD64)
|
|
	return SIMDE__M128D_NEON_C(f64, vaddq_f64(a.neon_f64, b.neon_f64));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = a.f64[i] + b.f64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_add_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_add_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.f64[0] = a.f64[0] + b.f64[0];
|
|
r.f64[1] = a.f64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m64 simde_mm_add_si64(simde__m64 a, simde__m64 b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M64_C(_mm_add_si64(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M64_NEON_C(i64, vadd_s64(a.neon_i64, b.neon_i64));
|
|
#else
|
|
simde__m64 r;
|
|
r.i64[0] = a.i64[0] + b.i64[0];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_adds_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_adds_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i8, vqaddq_s8(a.neon_i8, b.neon_i8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) {
|
|
r.i8[i] = INT8_MAX;
|
|
} else if ((((b.i8[i]) < 0) &&
|
|
((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) {
|
|
r.i8[i] = INT8_MIN;
|
|
} else {
|
|
r.i8[i] = (a.i8[i]) + (b.i8[i]);
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
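/* Saturation logic above: the signed sum a + b overflows upward only when
 * b > 0 and a > INT8_MAX - b, and downward only when b < 0 and
 * a < INT8_MIN - b; testing before adding avoids signed overflow, which
 * would be undefined behaviour in C. Example: a = 100, b = 50 gives
 * 100 > 127 - 50, so the lane saturates to 127 instead of wrapping. */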
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_adds_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_adds_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i16, vqaddq_s16(a.neon_i16, b.neon_i16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
if ((((b.i16[i]) > 0) &&
|
|
((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) {
|
|
r.i16[i] = INT16_MAX;
|
|
} else if ((((b.i16[i]) < 0) &&
|
|
((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) {
|
|
r.i16[i] = INT16_MIN;
|
|
} else {
|
|
r.i16[i] = (a.i16[i]) + (b.i16[i]);
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_adds_epu8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_adds_epu8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u8, vqaddq_u8(a.neon_u8, b.neon_u8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
|
|
r.u8[i] = ((UINT8_MAX - a.u8[i]) > b.u8[i])
|
|
? (a.u8[i] + b.u8[i])
|
|
: UINT8_MAX;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_adds_epu16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_adds_epu16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u16, vqaddq_u16(a.neon_u16, b.neon_u16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
|
|
r.u16[i] = ((UINT16_MAX - a.u16[i]) > b.u16[i])
|
|
? (a.u16[i] + b.u16[i])
|
|
: UINT16_MAX;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
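/* Unsigned saturation above: a lane clamps to UINT8_MAX / UINT16_MAX whenever
 * b >= MAX - a (the boundary case a + b == MAX yields MAX either way), and is
 * the plain sum otherwise. Example: 200 + 100 saturates to 255 for epu8. */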
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_and_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_and_pd(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128D_NEON_C(i32, vandq_s32(a.neon_i32, b.neon_i32));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
|
|
r.u64[i] = a.u64[i] & b.u64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_and_si128(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_and_si128(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, vandq_s32(b.neon_i32, a.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] & b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_andnot_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_andnot_pd(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128D_NEON_C(i32, vbicq_s32(a.neon_i32, b.neon_i32));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
|
|
r.u64[i] = ~a.u64[i] & b.u64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_andnot_si128(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_andnot_si128(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, vbicq_s32(b.neon_i32, a.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = ~(a.i64[i]) & b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_avg_epu8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_avg_epu8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u8, vrhaddq_u8(b.neon_u8, a.neon_u8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
|
|
r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
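/* The scalar fallbacks above compute the same rounded average as the
 * intrinsics: (a + b + 1) >> 1, i.e. the mean rounded half up.
 * Example: (6 + 9 + 1) >> 1 == 8. */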
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_avg_epu16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_avg_epu16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u16, vrhaddq_u16(b.neon_u16, a.neon_u16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
|
|
r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_bslli_si128(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
|
|
if (HEDLEY_UNLIKELY(imm8 > 15)) {
|
|
r.u64[0] = 0;
|
|
r.u64[1] = 0;
|
|
return r;
|
|
}
|
|
|
|
const int s = imm8 * 8;
|
|
|
|
#if defined(SIMDE__HAVE_INT128)
|
|
r.u128[0] = a.u128[0] << s;
|
|
#else
|
|
if (s < 64) {
|
|
r.u64[0] = (a.u64[0] << s);
|
|
r.u64[1] = (a.u64[1] << s) | (a.u64[0] >> (64 - s));
|
|
} else {
|
|
r.u64[0] = 0;
|
|
r.u64[1] = a.u64[0] << (s - 64);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
#define simde_mm_bslli_si128(a, imm8) SIMDE__M128I_C(_mm_slli_si128(a.n, imm8))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_bslli_si128(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
i8, \
|
|
(((imm8) <= 0) ? ((a).neon_i8) \
|
|
: (((imm8) > 15) ? (vdupq_n_s8(0)) \
|
|
: (vextq_s8(vdupq_n_s8(0), \
|
|
(a).neon_i8, \
|
|
16 - (imm8))))))
|
|
#endif
|
|
#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
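/* simde_mm_bslli_si128 shifts the whole 128-bit value left by imm8 *bytes*
 * (hence s = imm8 * 8 bits); counts above 15 bytes produce zero. Without a
 * 128-bit integer type the shift is emulated on two uint64_t halves, carrying
 * the bits that cross the 64-bit boundary. Example: shifting by one byte
 * turns u64[0] == 0x01 into u64[0] == 0x100. */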
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_bsrli_si128(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
|
|
if (HEDLEY_UNLIKELY(imm8 > 15)) {
|
|
r.u64[0] = 0;
|
|
r.u64[1] = 0;
|
|
return r;
|
|
}
|
|
|
|
const int s = imm8 * 8;
|
|
|
|
#if defined(SIMDE__HAVE_INT128)
|
|
r.u128[0] = a.u128[0] >> s;
|
|
#else
|
|
if (s < 64) {
|
|
r.u64[0] = (a.u64[0] >> s) | (a.u64[1] << (64 - s));
|
|
r.u64[1] = (a.u64[1] >> s);
|
|
} else {
|
|
r.u64[0] = a.u64[1] >> (s - 64);
|
|
r.u64[1] = 0;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
#define simde_mm_bsrli_si128(a, imm8) SIMDE__M128I_C(_mm_srli_si128(a.n, imm8))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_bsrli_si128(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
i8, \
|
|
((imm8) <= 0) \
|
|
? ((a).neon_i8) \
|
|
: (((imm8) > 15) ? (vdupq_n_s8(0)) \
|
|
: (vextq_s8((a).neon_i8, \
|
|
vdupq_n_s8(0), (imm8)))))
|
|
#endif
|
|
#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128(a, imm8)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_clflush(void const *p)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_clflush(p);
|
|
#else
|
|
(void)p;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comieq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comieq_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] == b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comige_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comige_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] >= b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comigt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comigt_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] > b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comile_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comile_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] <= b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comilt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comilt_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] < b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_comineq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_comineq_sd(a.n, b.n);
|
|
#else
|
|
return a.f64[0] != b.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128 simde_mm_castpd_ps(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128_C(_mm_castpd_ps(a.n));
|
|
#else
|
|
union {
|
|
simde__m128d pd;
|
|
simde__m128 ps;
|
|
} r;
|
|
r.pd = a;
|
|
return r.ps;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_castpd_si128(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_castpd_si128(a.n));
|
|
#else
|
|
union {
|
|
simde__m128d pd;
|
|
simde__m128i si128;
|
|
} r;
|
|
r.pd = a;
|
|
return r.si128;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_castps_pd(simde__m128 a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_castps_pd(a.n));
|
|
#else
|
|
union {
|
|
simde__m128 ps;
|
|
simde__m128d pd;
|
|
} r;
|
|
r.ps = a;
|
|
return r.pd;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_castps_si128(simde__m128 a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_castps_si128(a.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, a.neon_i32);
|
|
#else
|
|
union {
|
|
simde__m128 ps;
|
|
simde__m128i si128;
|
|
} r;
|
|
r.ps = a;
|
|
return r.si128;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_castsi128_pd(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_castsi128_pd(a.n));
|
|
#else
|
|
union {
|
|
simde__m128i si128;
|
|
simde__m128d pd;
|
|
} r;
|
|
r.si128 = a;
|
|
return r.pd;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128 simde_mm_castsi128_ps(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128_C(_mm_castsi128_ps(a.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128_NEON_C(f32, a.neon_f32);
|
|
#else
|
|
union {
|
|
simde__m128i si128;
|
|
simde__m128 ps;
|
|
} r;
|
|
r.si128 = a;
|
|
return r.ps;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpeq_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpeq_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i8, vreinterpretq_s8_u8(vceqq_s8(b.neon_i8, a.neon_i8)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
r.i8[i] = (a.i8[i] == b.i8[i]) ? 0xff : 0x00;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
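/* Comparison wrappers return a per-lane mask in the SSE2 convention: every
 * bit of a lane is set (0xff, 0xffff, ...) when the comparison holds and
 * cleared otherwise. Illustrative use (x and y are hypothetical vectors):
 *
 *   simde__m128i eq = simde_mm_cmpeq_epi8(x, y);
 *   int any_equal = simde_mm_movemask_epi8(eq) != 0;
 */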
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpeq_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpeq_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i16, vreinterpretq_s16_u16(vceqq_s16(b.neon_i16, a.neon_i16)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = (a.i16[i] == b.i16[i]) ? 0xffff : 0x0000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpeq_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpeq_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i32, vreinterpretq_s32_u32(vceqq_s32(b.neon_i32, a.neon_i32)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (a.i32[i] == b.i32[i]) ? 0xffffffff : 0x00000000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpeq_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpeq_pd(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128D_NEON_C(
|
|
i32, vreinterpretq_s32_u32(
|
|
vceqq_s32(vreinterpretq_s32_f32(b.neon_f32),
|
|
vreinterpretq_s32_f32(a.neon_f32))));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] == b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpeq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpeq_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] == b.f64[0]) ? ~UINT64_C(0) : 0;
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpneq_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpneq_pd(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128D_NEON_C(f32,
|
|
vreinterpretq_f32_u16(vmvnq_u16(
|
|
vceqq_s16(b.neon_i16, a.neon_i16))));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] != b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpneq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpneq_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] != b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmplt_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmplt_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i8, vreinterpretq_s8_u8(vcltq_s8(a.neon_i8, b.neon_i8)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
r.i8[i] = (a.i8[i] < b.i8[i]) ? 0xff : 0x00;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmplt_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmplt_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i16, vreinterpretq_s16_u16(vcltq_s16(a.neon_i16, b.neon_i16)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = (a.i16[i] < b.i16[i]) ? 0xffff : 0x0000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmplt_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmplt_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i32, vreinterpretq_s32_u32(vcltq_s32(a.neon_i32, b.neon_i32)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (a.i32[i] < b.i32[i]) ? 0xffffffff : 0x00000000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmplt_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmplt_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] < b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmplt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmplt_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] < b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmple_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmple_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] <= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmple_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmple_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] <= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpgt_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpgt_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i8, vreinterpretq_s8_u8(vcgtq_s8(a.neon_i8, b.neon_i8)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
r.i8[i] = (a.i8[i] > b.i8[i]) ? 0xff : 0x00;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpgt_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpgt_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i16, vreinterpretq_s16_u16(vcgtq_s16(a.neon_i16, b.neon_i16)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = (a.i16[i] > b.i16[i]) ? 0xffff : 0x0000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cmpgt_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cmpgt_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(
|
|
i32, vreinterpretq_s32_u32(vcgtq_s32(a.neon_i32, b.neon_i32)));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (a.i32[i] > b.i32[i]) ? 0xffffffff : 0x00000000;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpgt_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpgt_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] > b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpgt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
return SIMDE__M128D_C(_mm_cmpgt_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] > b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpge_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpge_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (a.f64[i] >= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpge_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
return SIMDE__M128D_C(_mm_cmpge_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (a.f64[0] >= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnge_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpnge_pd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmplt_pd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnge_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
return SIMDE__M128D_C(_mm_cmpnge_sd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmplt_sd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnlt_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpnlt_pd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmpge_pd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnlt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpnlt_sd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmpge_sd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnle_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpnle_pd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmpgt_pd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpnle_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpnle_sd(a.n, b.n));
|
|
#else
|
|
return simde_mm_cmpgt_sd(a, b);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpord_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpord_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (!isnan(a.f64[i]) && !isnan(b.f64[i])) ? ~UINT64_C(0)
|
|
: UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpord_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpord_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (!isnan(a.f64[0]) && !isnan(b.f64[0])) ? ~UINT64_C(0)
|
|
: UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpunord_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpunord_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.u64[i] = (isnan(a.f64[i]) || isnan(b.f64[i])) ? ~UINT64_C(0)
|
|
: UINT64_C(0);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cmpunord_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cmpunord_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.u64[0] = (isnan(a.f64[0]) || isnan(b.f64[0])) ? ~UINT64_C(0)
|
|
: UINT64_C(0);
|
|
r.u64[1] = a.u64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtepi32_pd(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cvtepi32_pd(a.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = (simde_float64)a.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128 simde_mm_cvtepi32_ps(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128_C(_mm_cvtepi32_ps(a.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128_NEON_C(f32, vcvtq_f32_s32(a.neon_i32));
|
|
#else
|
|
simde__m128 r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
|
|
r.f32[i] = (simde_float32)a.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvtpd_epi32(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cvtpd_epi32(a.n));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.i32[i] = (int32_t)a.f64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m64 simde_mm_cvtpd_pi32(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M64_C(_mm_cvtpd_pi32(a.n));
|
|
#else
|
|
simde__m64 r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (int32_t)a.f64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128 simde_mm_cvtpd_ps(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128_C(_mm_cvtpd_ps(a.n));
|
|
#else
|
|
	simde__m128 r;
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
		r.f32[i] = (simde_float32)a.f64[i];
	}
	/* _mm_cvtpd_ps zeroes the upper two float lanes of the result. */
	r.f32[2] = 0.0f;
	r.f32[3] = 0.0f;
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtpi32_pd(simde__m64 a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cvtpi32_pd(a.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = (simde_float64)a.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvtps_epi32(simde__m128 a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_cvtps_epi32(a.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
	/* The default rounding mode on SSE is 'round to even', which ARMv7
	   does not support; it is supported on ARMv8, however. */
|
|
#if defined(SIMDE_ARCH_AARCH64)
|
|
return SIMDE__M128I_NEON_C(i32, vcvtnq_s32_f32(a.neon_f32));
|
|
#else
|
|
uint32x4_t signmask = vdupq_n_u32(0x80000000);
|
|
float32x4_t half = vbslq_f32(signmask, a.neon_f32,
|
|
vdupq_n_f32(0.5f)); /* +/- 0.5 */
|
|
int32x4_t r_normal = vcvtq_s32_f32(
|
|
vaddq_f32(a.neon_f32, half)); /* round to integer: [a + 0.5]*/
|
|
int32x4_t r_trunc =
|
|
vcvtq_s32_f32(a.neon_f32); /* truncate to integer: [a] */
|
|
int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */
|
|
int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
|
|
vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
|
|
float32x4_t delta = vsubq_f32(
|
|
a.neon_f32,
|
|
vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
|
|
uint32x4_t is_delta_half =
|
|
vceqq_f32(delta, half); /* delta == +/- 0.5 */
|
|
return SIMDE__M128I_NEON_C(i32,
|
|
vbslq_s32(is_delta_half, r_even, r_normal));
|
|
#endif
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (int32_t)a.f32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
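/* The ARMv7 path above emulates round-to-nearest-even: values are rounded by
 * adding +/-0.5 and truncating, except when the fractional part is exactly
 * 0.5, in which case the truncated value rounded to the nearest even integer
 * is selected instead. Example: 2.5 -> 2 and 3.5 -> 4, matching the default
 * SSE rounding mode. */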
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtps_pd(simde__m128 a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cvtps_pd(a.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = a.f32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
double simde_mm_cvtsd_f64(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
return _mm_cvtsd_f64(a.n);
|
|
#else
|
|
return a.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_cvtsd_si32(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_cvtsd_si32(a.n);
|
|
#else
|
|
return (int32_t)a.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int64_t simde_mm_cvtsd_si64(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if defined(__PGI)
|
|
return _mm_cvtsd_si64x(a.n);
|
|
#else
|
|
return _mm_cvtsd_si64(a.n);
|
|
#endif
|
|
#else
|
|
	return (int64_t)a.f64[0];
|
|
#endif
|
|
}
|
|
#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128 simde_mm_cvtsd_ss(simde__m128 a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128_C(_mm_cvtsd_ss(a.n, b.n));
|
|
#else
|
|
simde__m128 r;
|
|
|
|
r.f32[0] = (simde_float32)b.f64[0];
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 1; i < (sizeof(r) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i];
|
|
}
|
|
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_cvtsi128_si32(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_cvtsi128_si32(a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return vgetq_lane_s32(a.neon_i32, 0);
|
|
#else
|
|
return a.i32[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int64_t simde_mm_cvtsi128_si64(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if defined(__PGI)
|
|
return _mm_cvtsi128_si64x(a.n);
|
|
#else
|
|
return _mm_cvtsi128_si64(a.n);
|
|
#endif
|
|
#else
|
|
return a.i64[0];
|
|
#endif
|
|
}
|
|
#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtsi32_sd(simde__m128d a, int32_t b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_cvtsi32_sd(a.n, b));
|
|
#else
|
|
simde__m128d r;
|
|
|
|
r.f64[0] = (simde_float64)b;
|
|
r.i64[1] = a.i64[1];
|
|
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvtsi32_si128(int32_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_cvtsi32_si128(a);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
|
|
#else
|
|
r.i32[0] = a;
|
|
r.i32[1] = 0;
|
|
r.i32[2] = 0;
|
|
r.i32[3] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtsi64_sd(simde__m128d a, int64_t b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if !defined(__PGI)
|
|
r.n = _mm_cvtsi64_sd(a.n, b);
|
|
#else
|
|
r.n = _mm_cvtsi64x_sd(a.n, b);
|
|
#endif
|
|
#else
|
|
r.f64[0] = (simde_float64)b;
|
|
r.f64[1] = a.f64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvtsi64_si128(int64_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if !defined(__PGI)
|
|
r.n = _mm_cvtsi64_si128(a);
|
|
#else
|
|
r.n = _mm_cvtsi64x_si128(a);
|
|
#endif
|
|
#else
|
|
r.i64[0] = a;
|
|
r.i64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_cvtss_sd(simde__m128d a, simde__m128 b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_cvtss_sd(a.n, b.n);
|
|
#else
|
|
r.f64[0] = b.f32[0];
|
|
r.i64[1] = a.i64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvttpd_epi32(simde__m128d a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_cvttpd_epi32(a.n);
|
|
#else
|
|
for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
|
|
r.i32[i] = (int32_t)trunc(a.f64[i]);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m64 simde_mm_cvttpd_pi32(simde__m128d a)
|
|
{
|
|
simde__m64 r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_cvttpd_pi32(a.n);
|
|
#else
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (int32_t)trunc(a.f64[i]);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_cvttps_epi32(simde__m128 a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_cvttps_epi32(a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vcvtq_s32_f32(a.neon_f32);
|
|
#else
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = (int32_t)truncf(a.f32[i]);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_cvttsd_si32(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_cvttsd_si32(a.n);
|
|
#else
|
|
return (int32_t)trunc(a.f64[0]);
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int64_t simde_mm_cvttsd_si64(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if !defined(__PGI)
|
|
return _mm_cvttsd_si64(a.n);
|
|
#else
|
|
return _mm_cvttsd_si64x(a.n);
|
|
#endif
|
|
#else
|
|
return (int64_t)trunc(a.f64[0]);
|
|
#endif
|
|
}
|
|
#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_div_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_div_pd(a.n, b.n);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = a.f64[i] / b.f64[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_div_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_div_sd(a.n, b.n);
|
|
#else
|
|
r.f64[0] = a.f64[0] / b.f64[0];
|
|
r.f64[1] = a.f64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_extract_epi16(simde__m128i a, const int imm8)
|
|
{
|
|
return a.u16[imm8 & 7];
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE) && \
|
|
(!defined(SIMDE__REALLY_GCC) || HEDLEY_GCC_VERSION_CHECK(4, 6, 0))
|
|
#define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a.n, imm8)
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_extract_epi16(a, imm8) \
|
|
(vgetq_lane_s16((a).neon_i16, (imm8)) & ((int32_t)UINT32_C(0x0000ffff)))
|
|
#endif
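/* imm8 is masked with & 7, so only lanes 0-7 are addressable; the extracted
 * 16-bit lane is zero-extended into the int32_t result, which is why the NEON
 * macro masks with 0x0000ffff. Illustrative usage (simde_mm_set1_epi16 is
 * provided elsewhere in SIMDe; the value is hypothetical):
 *
 *   simde__m128i v = simde_mm_set1_epi16(-1);
 *   int x = simde_mm_extract_epi16(v, 3); // x == 0xffff (65535)
 */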
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_insert_epi16(simde__m128i a, int32_t i, const int imm8)
|
|
{
|
|
a.u16[imm8 & 7] = (int16_t)i;
|
|
return a;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
#define simde_mm_insert_epi16(a, i, imm8) \
|
|
SIMDE__M128I_C(_mm_insert_epi16((a).n, (i), (imm8)))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_insert_epi16(a, i, imm8) \
|
|
SIMDE__M128I_NEON_C(i16, vsetq_lane_s16((i), a.neon_i16, (imm8)))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d
|
|
simde_mm_load_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
|
|
{
|
|
simde__m128d r;
|
|
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_load_pd(mem_addr);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_u32 = vld1q_u32((uint32_t const *)mem_addr);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(&r, mem_addr, sizeof(r));
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_load_pd1(simde_float64 const *mem_addr)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_load_pd1(mem_addr);
|
|
#else
|
|
r.f64[0] = *mem_addr;
|
|
r.f64[1] = *mem_addr;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
#define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_load_sd(simde_float64 const *mem_addr)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_load_sd(mem_addr);
|
|
#else
|
|
memcpy(&r, mem_addr, sizeof(simde_float64));
|
|
r.u64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_load_si128(simde__m128i const *mem_addr)
|
|
{
|
|
simde__m128i r;
|
|
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_load_si128(&(mem_addr->n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(&r, mem_addr, sizeof(r));
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_loadh_pd(simde__m128d a, simde_float64 const *mem_addr)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadh_pd(a.n, mem_addr);
|
|
#else
|
|
simde_float64 t;
|
|
memcpy(&t, mem_addr, sizeof(t));
|
|
r.f64[0] = a.f64[0];
|
|
r.f64[1] = t;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_loadl_epi64(simde__m128i const *mem_addr)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadl_epi64(&mem_addr->n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vcombine_s32(vld1_s32((int32_t const *)mem_addr),
|
|
vcreate_s32(0));
|
|
#else
|
|
r.u64[0] = mem_addr->u64[0];
|
|
r.u64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_loadl_pd(simde__m128d a, simde_float64 const *mem_addr)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadl_pd(a.n, mem_addr);
|
|
#else
|
|
memcpy(&r, mem_addr, sizeof(simde_float64));
|
|
r.u64[1] = a.u64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d
|
|
simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
|
|
{
|
|
simde__m128d r;
|
|
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadr_pd(mem_addr);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
r.f64[0] = mem_addr[1];
|
|
r.f64[1] = mem_addr[0];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d
|
|
simde_mm_loadu_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadu_pd(mem_addr);
|
|
#else
|
|
simde_float64 l, h;
|
|
memcpy(&l, &mem_addr[0], sizeof(l));
|
|
memcpy(&h, &mem_addr[1], sizeof(h));
|
|
r.f64[0] = l;
|
|
r.f64[1] = h;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_loadu_si128(simde__m128i const *mem_addr)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_loadu_si128(&((*mem_addr).n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
|
|
#else
|
|
memcpy(&r, mem_addr, sizeof(r));
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_madd_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_madd_epi16(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int32x4_t pl =
|
|
vmull_s16(vget_low_s16(a.neon_i16), vget_low_s16(b.neon_i16));
|
|
int32x4_t ph =
|
|
vmull_s16(vget_high_s16(a.neon_i16), vget_high_s16(b.neon_i16));
|
|
int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
|
|
int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
|
|
r.neon_i32 = vcombine_s32(rl, rh);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i += 2) {
|
|
r.i32[i / 2] =
|
|
(a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
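/* _mm_madd_epi16 multiplies the eight pairs of signed 16-bit lanes and adds
 * adjacent products, producing four signed 32-bit lanes:
 *   r.i32[k] = a.i16[2k] * b.i16[2k] + a.i16[2k + 1] * b.i16[2k + 1]
 * The NEON path above does the same with two widening multiplies followed by
 * pairwise additions. */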
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_maskmoveu_si128(simde__m128i a, simde__m128i mask,
|
|
int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)])
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_maskmoveu_si128(a.n, mask.n, (char *)mem_addr);
|
|
#else
|
|
for (size_t i = 0; i < 16; i++) {
|
|
if (mask.u8[i] & 0x80) {
|
|
mem_addr[i] = a.i8[i];
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_movemask_epi8(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_movemask_epi8(a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
uint8x16_t input = a.neon_u8;
|
|
SIMDE_ALIGN(16)
|
|
static const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
|
|
uint8x8_t mask_and = vdup_n_u8(0x80);
|
|
int8x8_t mask_shift = vld1_s8(xr);
|
|
|
|
uint8x8_t lo = vget_low_u8(input);
|
|
uint8x8_t hi = vget_high_u8(input);
|
|
|
|
lo = vand_u8(lo, mask_and);
|
|
lo = vshl_u8(lo, mask_shift);
|
|
|
|
hi = vand_u8(hi, mask_and);
|
|
hi = vshl_u8(hi, mask_shift);
|
|
|
|
lo = vpadd_u8(lo, lo);
|
|
lo = vpadd_u8(lo, lo);
|
|
lo = vpadd_u8(lo, lo);
|
|
|
|
hi = vpadd_u8(hi, hi);
|
|
hi = vpadd_u8(hi, hi);
|
|
hi = vpadd_u8(hi, hi);
|
|
|
|
return ((hi[0] << 8) | (lo[0] & 0xFF));
|
|
#else
|
|
int32_t r = 0;
|
|
SIMDE__VECTORIZE_REDUCTION(| : r)
|
|
for (size_t i = 0; i < 16; i++) {
|
|
r |= (a.u8[15 - i] >> 7) << (15 - i);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
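/* movemask_epi8 packs the most significant bit of each of the 16 bytes into
 * the low 16 bits of the result (byte 0 -> bit 0, ..., byte 15 -> bit 15).
 * The NEON version isolates the sign bits, shifts each byte so its sign bit
 * lands in the right position within a 64-bit half, then folds each half
 * together with pairwise adds. Example: a vector whose bytes are all 0x80
 * yields 0xffff. */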
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int32_t simde_mm_movemask_pd(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_movemask_pd(a.n);
|
|
#else
|
|
int32_t r = 0;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(a.u64) / sizeof(a.u64[0])); i++) {
|
|
r |= (a.u64[i] >> 63) << i;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m64 simde_mm_movepi64_pi64(simde__m128i a)
|
|
{
|
|
simde__m64 r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_movepi64_pi64(a.n);
|
|
#else
|
|
r.i64[0] = a.i64[0];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_movpi64_epi64(simde__m64 a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_movpi64_epi64(a.n);
|
|
#else
|
|
r.i64[0] = a.i64[0];
|
|
r.i64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_min_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_min_epi16(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i16 = vminq_s16(a.neon_i16, b.neon_i16);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_min_epu8(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_min_epu8(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_u8 = vminq_u8(a.neon_u8, b.neon_u8);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
|
|
r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_min_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_min_pd(a.n, b.n);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = (a.f64[i] < b.f64[i]) ? a.f64[i] : b.f64[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_min_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_min_sd(a.n, b.n);
|
|
#else
|
|
r.f64[0] = (a.f64[0] < b.f64[0]) ? a.f64[0] : b.f64[0];
|
|
r.f64[1] = a.f64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_max_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_max_epi16(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i16 = vmaxq_s16(a.neon_i16, b.neon_i16);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_max_epu8(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_max_epu8(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_u8 = vmaxq_u8(a.neon_u8, b.neon_u8);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
|
|
r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_max_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_max_pd(a.n, b.n);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = (a.f64[i] > b.f64[i]) ? a.f64[i] : b.f64[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_max_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_max_sd(a.n, b.n);
|
|
#else
|
|
r.f64[0] = (a.f64[0] > b.f64[0]) ? a.f64[0] : b.f64[0];
|
|
r.f64[1] = a.f64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_move_epi64(simde__m128i a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_move_epi64(a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i64 = vsetq_lane_s64(0, a.neon_i64, 1);
|
|
#else
|
|
r.i64[0] = a.i64[0];
|
|
r.i64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_move_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_move_sd(a.n, b.n);
|
|
#else
|
|
r.f64[0] = b.f64[0];
|
|
r.f64[1] = a.f64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_mul_epu32(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_mul_epu32(a.n, b.n);
|
|
#else
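/* Portable fallback: each 64-bit result lane is the widening product of
 * the even-numbered 32-bit lanes (elements 0 and 2) of a and b. */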
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
|
|
r.u64[i] = ((uint64_t)a.u32[i * 2]) * ((uint64_t)b.u32[i * 2]);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_mul_epi64(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
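/* simde_x_* functions have no native SSE2 counterpart; this lane-wise
 * 64-bit multiply is always computed with the scalar loop below. */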
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] * b.i64[i];
|
|
}
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_mod_epi64(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] % b.i64[i];
|
|
}
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_mul_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_mul_pd(a.n, b.n);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = a.f64[i] * b.f64[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_mul_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_mul_sd(a.n, b.n);
|
|
#else
|
|
r.f64[0] = a.f64[0] * b.f64[0];
|
|
r.f64[1] = a.f64[1];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m64 simde_mm_mul_su32(simde__m64 a, simde__m64 b)
|
|
{
|
|
simde__m64 r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
r.n = _mm_mul_su32(a.n, b.n);
|
|
#else
|
|
r.u64[0] = ((uint64_t)a.u32[0]) * ((uint64_t)b.u32[0]);
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_mulhi_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_mulhi_epi16(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
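/* Widen each half of the inputs with vmull_s16 (eight 32-bit products in
 * total), then de-interleave the 16-bit halves with vuzpq_u16; on a
 * little-endian target val[1] holds the high 16 bits of every product. */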
int16x4_t a3210 = vget_low_s16(a.neon_i16);
|
|
int16x4_t b3210 = vget_low_s16(b.neon_i16);
|
|
int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
|
|
int16x4_t a7654 = vget_high_s16(a.neon_i16);
|
|
int16x4_t b7654 = vget_high_s16(b.neon_i16);
|
|
int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
|
|
uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210),
|
|
vreinterpretq_u16_s32(ab7654));
|
|
r.neon_u16 = rv.val[1];
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
|
|
((int32_t)b.i16[i]))) >>
|
|
16);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_mulhi_epu16(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
r.n = _mm_mulhi_epu16(a.n, b.n);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
|
|
r.u16[i] = (uint16_t)(
|
|
(((uint32_t)a.u16[i]) * ((uint32_t)b.u16[i])) >> 16);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_mullo_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_mullo_epi16(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i16 = vmulq_s16(a.neon_i16, b.neon_i16);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
|
|
((int32_t)b.i16[i]))) &
|
|
0xffff);
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_or_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_or_pd(a.n, b.n);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] | b.i64[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_or_si128(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_or_si128(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] | b.i64[i];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_packs_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_packs_epi16(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
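/* vqmovn_s16 narrows each 16-bit lane to 8 bits with signed saturation;
 * the two narrowed halves are then concatenated. */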
r.neon_i8 = vcombine_s8(vqmovn_s16(a.neon_i16), vqmovn_s16(b.neon_i16));
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i8[i] = (a.i16[i] > INT8_MAX)
|
|
? INT8_MAX
|
|
: ((a.i16[i] < INT8_MIN)
|
|
? INT8_MIN
|
|
: ((int8_t)a.i16[i]));
|
|
r.i8[i + 8] = (b.i16[i] > INT8_MAX)
|
|
? INT8_MAX
|
|
: ((b.i16[i] < INT8_MIN)
|
|
? INT8_MIN
|
|
: ((int8_t)b.i16[i]));
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_packs_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_packs_epi32(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i16 =
|
|
vcombine_s16(vqmovn_s32(a.neon_i32), vqmovn_s32(b.neon_i32));
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i16[i] = (a.i32[i] > INT16_MAX)
|
|
? INT16_MAX
|
|
: ((a.i32[i] < INT16_MIN)
|
|
? INT16_MIN
|
|
: ((int16_t)a.i32[i]));
|
|
r.i16[i + 4] = (b.i32[i] > INT16_MAX)
|
|
? INT16_MAX
|
|
: ((b.i32[i] < INT16_MIN)
|
|
? INT16_MIN
|
|
: ((int16_t)b.i32[i]));
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_packus_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_packus_epi16(a.n, b.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_u8 =
|
|
vcombine_u8(vqmovun_s16(a.neon_i16), vqmovun_s16(b.neon_i16));
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.u8[i] = (a.i16[i] > UINT8_MAX)
? UINT8_MAX
: ((a.i16[i] < 0) ? 0 : ((uint8_t)a.i16[i]));
r.u8[i + 8] =
(b.i16[i] > UINT8_MAX)
? UINT8_MAX
: ((b.i16[i] < 0) ? 0 : ((uint8_t)b.i16[i]));
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_pause(void)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_pause();
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sad_epu8(simde__m128i a, simde__m128i b)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_sad_epu8(a.n, b.n);
|
|
#else
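/* Sum of absolute differences: each 64-bit result lane receives the sum
 * of |a.u8[e] - b.u8[e]| over its own 8-byte half. */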
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
uint16_t tmp = 0;
|
|
SIMDE__VECTORIZE_REDUCTION(+ : tmp)
|
|
for (size_t j = 0; j < ((sizeof(r.u8) / sizeof(r.u8[0])) / 2);
|
|
j++) {
|
|
const size_t e = j + (i * 8);
|
|
tmp += (a.u8[e] > b.u8[e]) ? (a.u8[e] - b.u8[e])
|
|
: (b.u8[e] - a.u8[e]);
|
|
}
|
|
r.i64[i] = tmp;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
|
|
int8_t e11, int8_t e10, int8_t e9, int8_t e8,
|
|
int8_t e7, int8_t e6, int8_t e5, int8_t e4,
|
|
int8_t e3, int8_t e2, int8_t e1, int8_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4,
|
|
e3, e2, e1, e0);
|
|
#else
|
|
r.i8[0] = e0;
|
|
r.i8[1] = e1;
|
|
r.i8[2] = e2;
|
|
r.i8[3] = e3;
|
|
r.i8[4] = e4;
|
|
r.i8[5] = e5;
|
|
r.i8[6] = e6;
|
|
r.i8[7] = e7;
|
|
r.i8[8] = e8;
|
|
r.i8[9] = e9;
|
|
r.i8[10] = e10;
|
|
r.i8[11] = e11;
|
|
r.i8[12] = e12;
|
|
r.i8[13] = e13;
|
|
r.i8[14] = e14;
|
|
r.i8[15] = e15;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
|
|
int16_t e3, int16_t e2, int16_t e1, int16_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
SIMDE_ALIGN(16) int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7};
|
|
r.neon_i16 = vld1q_s16(data);
|
|
#else
|
|
r.i16[0] = e0;
|
|
r.i16[1] = e1;
|
|
r.i16[2] = e2;
|
|
r.i16[3] = e3;
|
|
r.i16[4] = e4;
|
|
r.i16[5] = e5;
|
|
r.i16[6] = e6;
|
|
r.i16[7] = e7;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_epi32(e3, e2, e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
SIMDE_ALIGN(16) int32_t data[4] = {e0, e1, e2, e3};
|
|
r.neon_i32 = vld1q_s32(data);
|
|
#else
|
|
r.i32[0] = e0;
|
|
r.i32[1] = e1;
|
|
r.i32[2] = e2;
|
|
r.i32[3] = e3;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set_epi64(simde__m64 e1, simde__m64 e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_epi64(e1.n, e0.n);
|
|
#else
|
|
r.i64[0] = e0.i64[0];
|
|
r.i64[1] = e1.i64[0];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set_epi64x(int64_t e1, int64_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_epi64x(e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r = SIMDE__M128I_NEON_C(i64,
|
|
vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1)));
|
|
#else
|
|
r.i64[0] = e0;
|
|
r.i64[1] = e1;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_set_epu8(uint8_t e15, uint8_t e14, uint8_t e13,
|
|
uint8_t e12, uint8_t e11, uint8_t e10,
|
|
uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6,
|
|
uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2,
|
|
uint8_t e1, uint8_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
r.u8[0] = e0;
|
|
r.u8[1] = e1;
|
|
r.u8[2] = e2;
|
|
r.u8[3] = e3;
|
|
r.u8[4] = e4;
|
|
r.u8[5] = e5;
|
|
r.u8[6] = e6;
|
|
r.u8[7] = e7;
|
|
r.u8[8] = e8;
|
|
r.u8[9] = e9;
|
|
r.u8[10] = e10;
|
|
r.u8[11] = e11;
|
|
r.u8[12] = e12;
|
|
r.u8[13] = e13;
|
|
r.u8[14] = e14;
|
|
r.u8[15] = e15;
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_set_epu16(uint16_t e7, uint16_t e6, uint16_t e5,
|
|
uint16_t e4, uint16_t e3, uint16_t e2,
|
|
uint16_t e1, uint16_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
r.u16[0] = e0;
|
|
r.u16[1] = e1;
|
|
r.u16[2] = e2;
|
|
r.u16[3] = e3;
|
|
r.u16[4] = e4;
|
|
r.u16[5] = e5;
|
|
r.u16[6] = e6;
|
|
r.u16[7] = e7;
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_set_epu32(uint32_t e3, uint32_t e2, uint32_t e1,
|
|
uint32_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
r.u32[0] = e0;
|
|
r.u32[1] = e1;
|
|
r.u32[2] = e2;
|
|
r.u32[3] = e3;
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_set_epu64x(uint64_t e1, uint64_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
r.u64[0] = e0;
|
|
r.u64[1] = e1;
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_set_pd(simde_float64 e1, simde_float64 e0)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_pd(e1, e0);
|
|
#else
|
|
r.f64[0] = e0;
|
|
r.f64[1] = e1;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_set_pd1(simde_float64 a)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_pd(a);
|
|
#else
|
|
r.f64[0] = a;
|
|
r.f64[1] = a;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_set_sd(simde_float64 a)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set_sd(a);
|
|
#else
|
|
r.f64[0] = a;
|
|
r.u64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set1_epi8(int8_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_epi8(a);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i8 = vdupq_n_s8(a);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
r.i8[i] = a;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set1_epi16(int16_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_epi16(a);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i16 = vdupq_n_s16(a);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = a;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set1_epi32(int32_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_epi32(a);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vdupq_n_s32(a);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set1_epi64x(int64_t a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_epi64x(a);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i64 = vmovq_n_s64(a);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_set1_epi64(simde__m64 a)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_epi64(a.n);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[0];
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_set1_pd(simde_float64 a)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_set1_pd(a);
|
|
#else
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.f64[i] = a;
|
|
}
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_setr_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
|
|
int8_t e11, int8_t e10, int8_t e9, int8_t e8,
|
|
int8_t e7, int8_t e6, int8_t e5, int8_t e4,
|
|
int8_t e3, int8_t e2, int8_t e1, int8_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,
|
|
e4, e3, e2, e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int8_t t[] = {e15, e14, e13, e12, e11, e10, e9, e8,
|
|
e7, e6, e5, e4, e3, e2, e1, e0};
|
|
r.neon_i8 = vld1q_s8(t);
|
|
#else
|
|
r.i8[0] = e15;
|
|
r.i8[1] = e14;
|
|
r.i8[2] = e13;
|
|
r.i8[3] = e12;
|
|
r.i8[4] = e11;
|
|
r.i8[5] = e10;
|
|
r.i8[6] = e9;
|
|
r.i8[7] = e8;
|
|
r.i8[8] = e7;
|
|
r.i8[9] = e6;
|
|
r.i8[10] = e5;
|
|
r.i8[11] = e4;
|
|
r.i8[12] = e3;
|
|
r.i8[13] = e2;
|
|
r.i8[14] = e1;
|
|
r.i8[15] = e0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_setr_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
|
|
int16_t e3, int16_t e2, int16_t e1, int16_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int16_t t[] = {e7, e6, e5, e4, e3, e2, e1, e0};
|
|
r.neon_i16 = vld1q_s16(t);
|
|
#else
|
|
r.i16[0] = e7;
|
|
r.i16[1] = e6;
|
|
r.i16[2] = e5;
|
|
r.i16[3] = e4;
|
|
r.i16[4] = e3;
|
|
r.i16[5] = e2;
|
|
r.i16[6] = e1;
|
|
r.i16[7] = e0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_setr_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setr_epi32(e3, e2, e1, e0);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int32_t t[] = {e3, e2, e1, e0};
|
|
r.neon_i32 = vld1q_s32(t);
|
|
#else
|
|
r.i32[0] = e3;
|
|
r.i32[1] = e2;
|
|
r.i32[2] = e1;
|
|
r.i32[3] = e0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_setr_epi64(simde__m64 e1, simde__m64 e0)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setr_epi64(e1.n, e0.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i64 = vcombine_s64(e1.neon_i64, e0.neon_i64);
|
|
#else
|
|
r.i64[0] = e1.i64[0];
|
|
r.i64[1] = e0.i64[0];
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_setr_pd(simde_float64 e1, simde_float64 e0)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setr_pd(e1, e0);
|
|
#else
|
|
r.f64[0] = e1;
|
|
r.f64[1] = e0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_setzero_pd(void)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setzero_pd();
|
|
#else
|
|
r.u64[0] = 0;
|
|
r.u64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_setzero_si128(void)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
r.n = _mm_setzero_si128();
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
r.neon_i32 = vdupq_n_s32(0);
|
|
#else
|
|
r.u64[0] = 0;
|
|
r.u64[1] = 0;
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_shuffle_epi32(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
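/* Each 2-bit field of imm8 selects which 32-bit lane of a is copied to
 * the corresponding result lane. */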
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[(imm8 >> (i * 2)) & 3];
|
|
}
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_shuffle_epi32(a, imm8) \
|
|
SIMDE__M128I_C(_mm_shuffle_epi32((a).n, (imm8)))
|
|
#elif defined(SIMDE__SHUFFLE_VECTOR)
|
|
#define simde_mm_shuffle_epi32(a, imm8) \
|
|
({ \
|
|
const simde__m128i simde__tmp_a_ = a; \
|
|
(simde__m128i){.i32 = SIMDE__SHUFFLE_VECTOR( \
|
|
32, 16, (simde__tmp_a_).i32, \
|
|
(simde__tmp_a_).i32, ((imm8)) & 3, \
|
|
((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \
|
|
((imm8) >> 6) & 3)}; \
|
|
})
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_shuffle_pd(simde__m128d a, simde__m128d b, const int imm8)
|
|
{
|
|
simde__m128d r;
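/* Bit 0 of imm8 selects which lane of a becomes result lane 0; bit 1
 * selects which lane of b becomes result lane 1. */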
r.f64[0] = ((imm8 & 1) == 0) ? a.f64[0] : a.f64[1];
|
|
r.f64[1] = ((imm8 & 2) == 0) ? b.f64[0] : b.f64[1];
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
|
|
#define simde_mm_shuffle_pd(a, b, imm8) \
|
|
SIMDE__M128D_C(_mm_shuffle_pd((a).n, (b).n, (imm8)))
|
|
#elif defined(SIMDE__SHUFFLE_VECTOR)
|
|
#define simde_mm_shuffle_pd(a, b, imm8) \
|
|
({ \
|
|
(simde__m128d){.f64 = SIMDE__SHUFFLE_VECTOR( \
|
|
64, 16, (a).f64, (b).f64, \
|
|
(((imm8)) & 1), \
|
|
(((imm8) >> 1) & 1) + 2)}; \
|
|
})
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_shufflehi_epi16(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
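/* The low 64 bits pass through unchanged; each 2-bit field of imm8
 * selects one of a's upper four 16-bit lanes. */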
r.i64[0] = a.i64[0];
|
|
for (size_t i = 4; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = a.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
|
|
}
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_shufflehi_epi16(a, imm8) \
|
|
SIMDE__M128I_C(_mm_shufflehi_epi16((a).n, (imm8)))
|
|
#elif defined(SIMDE__SHUFFLE_VECTOR)
|
|
#define simde_mm_shufflehi_epi16(a, imm8) \
|
|
({ \
|
|
const simde__m128i simde__tmp_a_ = a; \
|
|
(simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \
|
|
16, 16, (simde__tmp_a_).i16, \
|
|
(simde__tmp_a_).i16, 0, 1, 2, 3, \
|
|
(((imm8)) & 3) + 4, \
|
|
(((imm8) >> 2) & 3) + 4, \
|
|
(((imm8) >> 4) & 3) + 4, \
|
|
(((imm8) >> 6) & 3) + 4)}; \
|
|
})
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_shufflelo_epi16(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
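/* Each 2-bit field of imm8 selects one of a's lower four 16-bit lanes;
 * the upper 64 bits pass through unchanged. */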
for (size_t i = 0; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2); i++) {
|
|
r.i16[i] = a.i16[((imm8 >> (i * 2)) & 3)];
|
|
}
|
|
r.i64[1] = a.i64[1];
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_shufflelo_epi16(a, imm8) \
|
|
SIMDE__M128I_C(_mm_shufflelo_epi16((a).n, (imm8)))
|
|
#elif defined(SIMDE__SHUFFLE_VECTOR)
|
|
#define simde_mm_shufflelo_epi16(a, imm8) \
|
|
({ \
|
|
const simde__m128i simde__tmp_a_ = a; \
|
|
(simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \
|
|
16, 16, (simde__tmp_a_).i16, \
|
|
(simde__tmp_a_).i16, (((imm8)) & 3), \
|
|
(((imm8) >> 2) & 3), \
|
|
(((imm8) >> 4) & 3), \
|
|
(((imm8) >> 6) & 3), 4, 5, 6, 7)}; \
|
|
})
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sll_epi16(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sll_epi16(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 15)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
|
|
r.u16[i] = a.u16[i] << s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sll_epi32(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sll_epi32(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 31)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i] << s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sll_epi64(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sll_epi64(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 63)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] << s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_sqrt_pd(simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_sqrt_pd(a.n));
|
|
#else
|
|
simde__m128d r;
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = sqrt(a.f64[i]);
|
|
}
|
|
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_sqrt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_sqrt_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.f64[0] = sqrt(b.f64[0]);
|
|
r.f64[1] = a.f64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srl_epi16(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_srl_epi16(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 15)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
|
|
r.u16[i] = a.u16[i] >> s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srl_epi32(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_srl_epi32(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 31)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
|
|
r.u32[i] = a.u32[i] >> s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srl_epi64(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_srl_epi64(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
|
|
if (count.u64[0] > 63)
|
|
return simde_mm_setzero_si128();
|
|
const int s = (int)(count.u64[0]);
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
|
|
r.u64[i] = a.u64[i] >> s;
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srai_epi16(simde__m128i a, int imm8)
|
|
{
|
|
simde__m128i r;
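/* Arithmetic shift emulated with unsigned operations: m has the top imm8
 * bits set and is OR-ed in only for negative lanes, replicating the sign
 * bit into the vacated positions. */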
const uint16_t m =
|
|
(uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - imm8));
|
|
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.u16[0])); i++) {
|
|
const uint16_t is_neg = ((uint16_t)(
|
|
((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1))));
|
|
r.u16[i] = (a.u16[i] >> imm8) | (m * is_neg);
|
|
}
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_srai_epi16(a, imm8) \
|
|
SIMDE__M128I_C(_mm_srai_epi16((a).n, (imm8)))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srai_epi32(simde__m128i a, int imm8)
|
|
{
|
|
simde__m128i r;
|
|
|
|
const uint32_t m =
|
|
(uint32_t)((~0U) << ((sizeof(int32_t) * CHAR_BIT) - imm8));
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.u32[0])); i++) {
|
|
uint32_t is_neg = ((uint32_t)(
|
|
((a.u32[i]) >> ((sizeof(int32_t) * CHAR_BIT) - 1))));
|
|
r.u32[i] = (a.u32[i] >> imm8) | (m * is_neg);
|
|
}
|
|
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_srai_epi32(a, imm8) \
|
|
SIMDE__M128I_C(_mm_srai_epi32((a).n, (imm8)))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_srai_epi32(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
i32, \
|
|
((imm8) <= 0) \
|
|
? (a.neon_i32) \
|
|
: (((imm8) > 31) \
|
|
? (vshrq_n_s32(vshrq_n_s32(a.neon_i32, 16), \
|
|
16)) \
|
|
: (vshrq_n_s32(a.neon_i32, (imm8)))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sra_epi16(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sra_epi16(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
const uint64_t cnt = count.u64[0];

if (cnt > 15) {
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
|
|
i++) {
|
|
r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
|
|
}
|
|
} else {
|
|
const uint16_t m = (uint16_t)(
|
|
(~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
|
|
i++) {
|
|
const uint16_t is_neg = a.i16[i] < 0;
|
|
r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
|
|
}
|
|
}
|
|
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sra_epi32(simde__m128i a, simde__m128i count)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
|
|
return SIMDE__M128I_C(_mm_sra_epi32(a.n, count.n));
|
|
#else
|
|
simde__m128i r;
|
|
const uint64_t cnt = count.u64[0];
|
|
|
|
if (cnt > 31) {
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
|
|
i++) {
|
|
r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
|
|
}
|
|
} else if (cnt == 0) {
|
|
memcpy(&r, &a, sizeof(r));
|
|
} else {
|
|
const uint32_t m = (uint32_t)(
|
|
(~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt));
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
|
|
i++) {
|
|
const uint32_t is_neg = a.i32[i] < 0;
|
|
r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
|
|
}
|
|
}
|
|
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_slli_epi16(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0
|
|
: imm8;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = a.i16[i] << s;
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_slli_epi16(a, imm8) SIMDE__M128I_C(_mm_slli_epi16((a).n, (imm8)))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_slli_epi16(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
i16, ((imm8) <= 0) \
|
|
? ((a).neon_i16) \
|
|
: (((imm8) > 15) ? (vdupq_n_s16(0)) \
|
|
: (vshlq_n_s16((a).neon_i16, \
|
|
(imm8)))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_slli_epi32(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0
|
|
: imm8;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i] << s;
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_slli_epi32(a, imm8) SIMDE__M128I_C(_mm_slli_epi32((a).n, (imm8)))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_slli_epi32(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
i32, ((imm8) <= 0) \
|
|
? ((a).neon_i32) \
|
|
: (((imm8) > 31) ? (vdupq_n_s32(0)) \
|
|
: (vshlq_n_s32((a).neon_i32, \
|
|
(imm8)))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_slli_epi64(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const int s = (imm8 > ((int)sizeof(r.i64[0]) * CHAR_BIT) - 1) ? 0
|
|
: imm8;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] << s;
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_slli_epi64(a, imm8) SIMDE__M128I_C(_mm_slli_epi64((a).n, (imm8)))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srli_epi16(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0
|
|
: imm8;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.u16[i] = a.u16[i] >> s;
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_srli_epi16(a, imm8) SIMDE__M128I_C(_mm_srli_epi16((a).n, (imm8)))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_srli_epi16(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
u16, ((imm8) <= 0) \
|
|
? ((a).neon_u16) \
|
|
: (((imm8) > 15) ? (vdupq_n_u16(0)) \
|
|
: (vshrq_n_u16((a).neon_u16, \
|
|
(imm8)))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srli_epi32(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0
|
|
: imm8;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.u32[i] = a.u32[i] >> s;
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_srli_epi32(a, imm8) SIMDE__M128I_C(_mm_srli_epi32(a.n, imm8))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_srli_epi32(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
u32, ((imm8) <= 0) \
|
|
? ((a).neon_u32) \
|
|
: (((imm8) > 31) ? (vdupq_n_u32(0)) \
|
|
: (vshrq_n_u32((a).neon_u32, \
|
|
(imm8)))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_srli_epi64(simde__m128i a, const int imm8)
|
|
{
|
|
simde__m128i r;
|
|
const unsigned char s = imm8 & 255;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
if (s > 63) {
|
|
r.u64[i] = 0;
|
|
} else {
|
|
r.u64[i] = a.u64[i] >> s;
|
|
}
|
|
}
|
|
return r;
|
|
}
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
#define simde_mm_srli_epi64(a, imm8) SIMDE__M128I_C(_mm_srli_epi64(a.n, imm8))
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
#define simde_mm_srli_epi64(a, imm8) \
|
|
SIMDE__M128I_NEON_C( \
|
|
u64, \
|
|
(((imm8) & 255) > 63) \
|
|
? (vdupq_n_u64(0)) \
|
|
: ((((imm8)&255) == 0) \
|
|
? (a.neon_u64) \
|
|
: (vshrq_n_u64((a).neon_u64, (imm8)&255))))
|
|
#endif
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_store_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
|
|
simde__m128d a)
|
|
{
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_store_pd(mem_addr, a.n);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
|
|
simde__m128d a)
|
|
{
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_store1_pd(mem_addr, a.n);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
mem_addr[0] = a.f64[0];
|
|
mem_addr[1] = a.f64[0];
|
|
#endif
|
|
}
|
|
#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(mem_addr, a)
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_store_sd(simde_float64 *mem_addr, simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_store_sd(mem_addr, a.n);
|
|
#else
|
|
memcpy(mem_addr, &a, sizeof(a.f64[0]));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_store_si128(simde__m128i *mem_addr, simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_store_si128(&mem_addr->n, a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
vst1q_s32((int32_t *)mem_addr, a.neon_i32);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storeh_pd(simde_float64 *mem_addr, simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storeh_pd(mem_addr, a.n);
|
|
#else
|
|
*mem_addr = a.f64[1];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storel_epi64(simde__m128i *mem_addr, simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storel_epi64(&(mem_addr->n), a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
mem_addr->i64[0] = vgetq_lane_s64(a.neon_i64, 0);
|
|
#else
|
|
mem_addr->i64[0] = a.i64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storel_pd(simde_float64 *mem_addr, simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storel_pd(mem_addr, a.n);
|
|
#else
|
|
*mem_addr = a.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storer_pd(simde_float64 mem_addr[2], simde__m128d a)
|
|
{
|
|
simde_assert_aligned(16, mem_addr);
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storer_pd(mem_addr, a.n);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
mem_addr[0] = a.f64[1];
|
|
mem_addr[1] = a.f64[0];
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storeu_pd(simde_float64 *mem_addr, simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storeu_pd(mem_addr, a.n);
|
|
#else
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_storeu_si128(simde__m128i *mem_addr, simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_storeu_si128(&mem_addr->n, a.n);
|
|
#elif defined(SIMDE_SSE2_NEON)
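/* Store through a local buffer and memcpy to the destination, presumably
 * so the (possibly unaligned) mem_addr is written without any alignment
 * assumptions. */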
int32_t v[4];
|
|
vst1q_s32(v, a.neon_i32);
|
|
memcpy(mem_addr, v, sizeof(v));
|
|
#else
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_stream_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
|
|
simde__m128d a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_stream_pd(mem_addr, a.n);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_stream_si128(simde__m128i *mem_addr, simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_stream_si128(&mem_addr->n, a.n);
|
|
#else
|
|
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
|
|
memcpy(mem_addr, &a, sizeof(a));
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_stream_si32(int32_t *mem_addr, int32_t a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_stream_si32(mem_addr, a);
|
|
#else
|
|
*mem_addr = a;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_stream_si64(int64_t *mem_addr, int64_t a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
|
|
#if defined(SIMDE__REALLY_GCC) && !HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
|
|
*mem_addr = a;
|
|
#elif defined(__GNUC__)
|
|
_mm_stream_si64((long long *)mem_addr, a);
|
|
#else
|
|
_mm_stream_si64(mem_addr, a);
|
|
#endif
|
|
#else
|
|
*mem_addr = a;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sub_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sub_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i8, vsubq_s8(a.neon_i8, b.neon_i8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
|
|
r.i8[i] = a.i8[i] - b.i8[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sub_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sub_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i16, vsubq_s16(a.neon_i16, b.neon_i16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
|
|
r.i16[i] = a.i16[i] - b.i16[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sub_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sub_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, vsubq_s32(a.neon_i32, b.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i] - b.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_sub_epi64(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_sub_epi64(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i64, vsubq_s64(a.neon_i64, b.neon_i64));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] - b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_sub_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_sub_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
|
|
r.f64[i] = a.f64[i] - b.f64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_sub_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_sub_sd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
r.f64[0] = a.f64[0] - b.f64[0];
|
|
r.f64[1] = a.f64[1];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m64 simde_mm_sub_si64(simde__m64 a, simde__m64 b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M64_C(_mm_sub_si64(a.n, b.n));
|
|
#else
|
|
simde__m64 r;
|
|
r.i64[0] = a.i64[0] - b.i64[0];
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_subs_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_subs_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i8, vqsubq_s8(a.neon_i8, b.neon_i8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
|
|
if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) {
|
|
r.i8[i] = INT8_MIN;
|
|
} else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) {
|
|
r.i8[i] = INT8_MAX;
|
|
} else {
|
|
r.i8[i] = (a.i8[i]) - (b.i8[i]);
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_subs_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_subs_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i16, vqsubq_s16(a.neon_i16, b.neon_i16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
|
|
if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) {
|
|
r.i16[i] = INT16_MIN;
|
|
} else if ((b.i16[i]) < 0 &&
|
|
(a.i16[i]) > INT16_MAX + (b.i16[i])) {
|
|
r.i16[i] = INT16_MAX;
|
|
} else {
|
|
r.i16[i] = (a.i16[i]) - (b.i16[i]);
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_subs_epu8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_subs_epu8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u8, vqsubq_u8(a.neon_u8, b.neon_u8));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
|
|
const int32_t x = a.u8[i] - b.u8[i];
|
|
if (x < 0) {
|
|
r.u8[i] = 0;
|
|
} else if (x > UINT8_MAX) {
|
|
r.u8[i] = UINT8_MAX;
|
|
} else {
|
|
r.u8[i] = (uint8_t)x;
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_subs_epu16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_subs_epu16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(u16, vqsubq_u16(a.neon_u16, b.neon_u16));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
|
|
const int32_t x = a.u16[i] - b.u16[i];
|
|
if (x < 0) {
|
|
r.u16[i] = 0;
|
|
} else if (x > UINT16_MAX) {
|
|
r.u16[i] = UINT16_MAX;
|
|
} else {
|
|
r.u16[i] = (uint16_t)x;
|
|
}
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomieq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomieq_sd(a.n, b.n);
|
|
#else
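/* The floating-point environment is saved with feholdexcept and restored
 * afterwards so any exception flags raised by the comparison are
 * discarded, mimicking the quiet (non-signalling) ucomi* semantics.
 * The same pattern is used by the other ucomi* fallbacks below. */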
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] == b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomige_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomige_sd(a.n, b.n);
|
|
#else
|
|
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] >= b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomigt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomigt_sd(a.n, b.n);
|
|
#else
|
|
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] > b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomile_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomile_sd(a.n, b.n);
|
|
#else
|
|
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] <= b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomilt_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomilt_sd(a.n, b.n);
|
|
#else
|
|
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] < b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
int simde_mm_ucomineq_sd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return _mm_ucomineq_sd(a.n, b.n);
|
|
#else
|
|
fenv_t envp;
|
|
int x = feholdexcept(&envp);
|
|
int r = a.f64[0] != b.f64[0];
|
|
if (HEDLEY_LIKELY(x == 0))
|
|
fesetenv(&envp);
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_undefined_pd(void)
|
|
{
|
|
simde__m128d r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
|
|
r.n = _mm_undefined_pd();
|
|
#else
|
|
r = simde_mm_setzero_pd();
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_undefined_si128(void)
|
|
{
|
|
simde__m128i r;
|
|
|
|
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
|
|
r.n = _mm_undefined_si128();
|
|
#else
|
|
r = simde_mm_setzero_si128();
|
|
#endif
|
|
|
|
return r;
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_lfence(void)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_lfence();
|
|
#else
|
|
simde_mm_sfence();
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
void simde_mm_mfence(void)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
_mm_mfence();
|
|
#else
|
|
simde_mm_sfence();
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpackhi_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpackhi_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a.neon_i16));
|
|
int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b.neon_i16));
|
|
int8x8x2_t result = vzip_s8(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i8,
|
|
vcombine_s8(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
|
|
r.i8[(i * 2)] = a.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
|
|
r.i8[(i * 2) + 1] =
|
|
b.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpackhi_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpackhi_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int16x4_t a1 = vget_high_s16(a.neon_i16);
|
|
int16x4_t b1 = vget_high_s16(b.neon_i16);
|
|
int16x4x2_t result = vzip_s16(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i16,
|
|
vcombine_s16(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
|
|
r.i16[(i * 2)] =
|
|
a.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
|
|
r.i16[(i * 2) + 1] =
|
|
b.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpackhi_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpackhi_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int32x2_t a1 = vget_high_s32(a.neon_i32);
|
|
int32x2_t b1 = vget_high_s32(b.neon_i32);
|
|
int32x2x2_t result = vzip_s32(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i32,
|
|
vcombine_s32(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
|
|
r.i32[(i * 2)] =
|
|
a.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
|
|
r.i32[(i * 2) + 1] =
|
|
b.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpackhi_epi64(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpackhi_epi64(a.n, b.n));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
|
|
r.i64[(i * 2)] =
|
|
a.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
|
|
r.i64[(i * 2) + 1] =
|
|
b.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_unpackhi_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_unpackhi_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
|
|
r.f64[(i * 2)] =
|
|
a.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
|
|
r.f64[(i * 2) + 1] =
|
|
b.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpacklo_epi8(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpacklo_epi8(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a.neon_i16));
|
|
int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b.neon_i16));
|
|
int8x8x2_t result = vzip_s8(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i8,
|
|
vcombine_s8(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
|
|
r.i8[(i * 2)] = a.i8[i];
|
|
r.i8[(i * 2) + 1] = b.i8[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpacklo_epi16(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpacklo_epi16(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int16x4_t a1 = vget_low_s16(a.neon_i16);
|
|
int16x4_t b1 = vget_low_s16(b.neon_i16);
|
|
int16x4x2_t result = vzip_s16(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i16,
|
|
vcombine_s16(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
|
|
r.i16[(i * 2)] = a.i16[i];
|
|
r.i16[(i * 2) + 1] = b.i16[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpacklo_epi32(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpacklo_epi32(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
int32x2_t a1 = vget_low_s32(a.neon_i32);
|
|
int32x2_t b1 = vget_low_s32(b.neon_i32);
|
|
int32x2x2_t result = vzip_s32(a1, b1);
|
|
return SIMDE__M128I_NEON_C(i32,
|
|
vcombine_s32(result.val[0], result.val[1]));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
|
|
r.i32[(i * 2)] = a.i32[i];
|
|
r.i32[(i * 2) + 1] = b.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_unpacklo_epi64(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_unpacklo_epi64(a.n, b.n));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
|
|
r.i64[(i * 2)] = a.i64[i];
|
|
r.i64[(i * 2) + 1] = b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_unpacklo_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_unpacklo_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
|
|
r.f64[(i * 2)] = a.f64[i];
|
|
r.f64[(i * 2) + 1] = b.f64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128d simde_mm_xor_pd(simde__m128d a, simde__m128d b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128D_C(_mm_xor_pd(a.n, b.n));
|
|
#else
|
|
simde__m128d r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
|
|
r.i64[i] = a.i64[i] ^ b.i64[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_mm_xor_si128(simde__m128i a, simde__m128i b)
|
|
{
|
|
#if defined(SIMDE_SSE2_NATIVE)
|
|
return SIMDE__M128I_C(_mm_xor_si128(a.n, b.n));
|
|
#elif defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, veorq_s32(a.neon_i32, b.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = a.i32[i] ^ b.i32[i];
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__FUNCTION_ATTRIBUTES
|
|
simde__m128i simde_x_mm_not_si128(simde__m128i a)
|
|
{
|
|
#if defined(SIMDE_SSE2_NEON)
|
|
return SIMDE__M128I_NEON_C(i32, vmvnq_s32(a.neon_i32));
|
|
#else
|
|
simde__m128i r;
|
|
SIMDE__VECTORIZE
|
|
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
|
|
r.i32[i] = ~(a.i32[i]);
|
|
}
|
|
return r;
|
|
#endif
|
|
}
|
|
|
|
SIMDE__END_DECLS
|
|
|
|
#endif /* !defined(SIMDE__SSE2_H) */
|