/* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Copyright:
* 2017 Evan Nemerson <evan@nemerson.com>
* 2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
* 2015 Brandon Rowlett <browlett@nvidia.com>
* 2015 Ken Fast <kfast@gdeb.com>
* 2017 Hasindu Gamaarachchi <hasindu@unsw.edu.au>
* 2018 Jeff Daily <jeff.daily@amd.com>
*/
#if !defined(SIMDE__SSE2_H)
#if !defined(SIMDE__SSE2_H)
#define SIMDE__SSE2_H
#endif
#include "sse.h"
#if defined(SIMDE_SSE2_NATIVE)
#undef SIMDE_SSE2_NATIVE
#endif
#if defined(SIMDE_SSE2_FORCE_NATIVE)
#define SIMDE_SSE2_NATIVE
#elif defined(__SSE2__) && !defined(SIMDE_SSE2_NO_NATIVE) && \
!defined(SIMDE_NO_NATIVE)
#define SIMDE_SSE2_NATIVE
#elif defined(__ARM_NEON) && !defined(SIMDE_SSE2_NO_NEON) && \
!defined(SIMDE_NO_NEON)
#define SIMDE_SSE2_NEON
#endif
#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_SSE_NATIVE)
#if defined(SIMDE_SSE2_FORCE_NATIVE)
#error Native SSE2 support requires native SSE support
#else
#warning Native SSE2 support requires native SSE support, disabling
#undef SIMDE_SSE2_NATIVE
#endif
#elif defined(SIMDE_SSE2_NEON) && !defined(SIMDE_SSE_NEON)
#warning SSE2 NEON support requires SSE NEON support, disabling
#undef SIMDE_SSE2_NEON
#endif
#if defined(SIMDE_SSE2_NATIVE)
#include <emmintrin.h>
#else
#if defined(SIMDE_SSE2_NEON)
#include <arm_neon.h>
#endif
#endif
#include <stdint.h>
#include <limits.h>
#include <string.h>
#include <math.h> /* isnan(), trunc(), truncf() in the portable fallbacks */
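/* Reinterpret helpers that mirror NEON's vreinterpretq_* naming; they
 * simply select the matching member of the simde__m128i union. */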
#define vreinterpretq_m128i_s32(v) \
(simde__m128i) { .neon_i32 = v }
#define vreinterpretq_m128i_u64(v) \
(simde__m128i) { .neon_u64 = v }
#define vreinterpretq_s32_m128i(a) a.neon_i32
#define vreinterpretq_u64_m128i(a) a.neon_u64
SIMDE__BEGIN_DECLS
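/* Each 128-bit type is a union: plain lane arrays (or GCC vector
 * extensions) back the portable fallback, while the native __m128i /
 * __m128d and NEON members give each backend a view of the same
 * 16 bytes. */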
typedef SIMDE_ALIGN(16) union {
#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
#if defined(SIMDE__HAVE_INT128)
simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__));
simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__));
#endif
simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
#else
int8_t i8[16];
int16_t i16[8];
int32_t i32[4];
int64_t i64[2];
uint8_t u8[16];
uint16_t u16[8];
uint32_t u32[4];
uint64_t u64[2];
#if defined(SIMDE__HAVE_INT128)
simde_int128 i128[1];
simde_uint128 u128[1];
#endif
simde_float32 f32[4];
simde_float64 f64[2];
#endif
#if defined(SIMDE_SSE2_NATIVE)
__m128i n;
#elif defined(SIMDE_SSE2_NEON)
int8x16_t neon_i8;
int16x8_t neon_i16;
int32x4_t neon_i32;
int64x2_t neon_i64;
uint8x16_t neon_u8;
uint16x8_t neon_u16;
uint32x4_t neon_u32;
uint64x2_t neon_u64;
float32x4_t neon_f32;
#if defined(SIMDE_ARCH_AMD64)
float64x2_t neon_f64;
#endif
#endif
} simde__m128i;
typedef SIMDE_ALIGN(16) union {
#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
#else
int8_t i8[16];
int16_t i16[8];
int32_t i32[4];
int64_t i64[2];
uint8_t u8[16];
uint16_t u16[8];
uint32_t u32[4];
uint64_t u64[2];
simde_float32 f32[4];
simde_float64 f64[2];
#endif
#if defined(SIMDE_SSE2_NATIVE)
__m128d n;
#elif defined(SIMDE_SSE2_NEON)
int8x16_t neon_i8;
int16x8_t neon_i16;
int32x4_t neon_i32;
int64x2_t neon_i64;
uint8x16_t neon_u8;
uint16x8_t neon_u16;
uint32x4_t neon_u32;
uint64x2_t neon_u64;
float32x4_t neon_f32;
#if defined(SIMDE_ARCH_AMD64)
float64x2_t neon_f64;
#endif
#endif
} simde__m128d;
#if defined(SIMDE_SSE2_NATIVE)
HEDLEY_STATIC_ASSERT(sizeof(__m128i) == sizeof(simde__m128i),
"__m128i size doesn't match simde__m128i size");
HEDLEY_STATIC_ASSERT(sizeof(__m128d) == sizeof(simde__m128d),
"__m128d size doesn't match simde__m128d size");
SIMDE__FUNCTION_ATTRIBUTES simde__m128i SIMDE__M128I_C(__m128i v)
{
simde__m128i r;
r.n = v;
return r;
}
SIMDE__FUNCTION_ATTRIBUTES simde__m128d SIMDE__M128D_C(__m128d v)
{
simde__m128d r;
r.n = v;
return r;
}
#elif defined(SIMDE_SSE2_NEON)
#define SIMDE__M128I_NEON_C(T, expr) \
(simde__m128i) { .neon_##T = expr }
#define SIMDE__M128D_NEON_C(T, expr) \
(simde__m128d) { .neon_##T = expr }
#endif
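/* SIMDE__M128I_C / SIMDE__M128D_C wrap a native vector in the simde
 * union; the *_NEON_C macros do the same for a named NEON member. */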
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_add_epi8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_add_epi8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i8, vaddq_s8(a.neon_i8, b.neon_i8));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
r.i8[i] = a.i8[i] + b.i8[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_add_epi16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_add_epi16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i16, vaddq_s16(a.neon_i16, b.neon_i16));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.i16[i] = a.i16[i] + b.i16[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_add_epi32(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_add_epi32(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i32, vaddq_s32(a.neon_i32, b.neon_i32));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = a.i32[i] + b.i32[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_add_epi64(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_add_epi64(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i64, vaddq_s64(a.neon_i64, b.neon_i64));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = a.i64[i] + b.i64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_add_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_add_pd(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON) && defined(SIMDE_ARCH_AMD64)
	return SIMDE__M128D_NEON_C(f64, vaddq_f64(a.neon_f64, b.neon_f64));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.f64[i] = a.f64[i] + b.f64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_add_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_add_sd(a.n, b.n));
#else
simde__m128d r;
r.f64[0] = a.f64[0] + b.f64[0];
r.f64[1] = a.f64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_add_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M64_C(_mm_add_si64(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M64_NEON_C(i64, vadd_s64(a.neon_i64, b.neon_i64));
#else
simde__m64 r;
r.i64[0] = a.i64[0] + b.i64[0];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_adds_epi8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_adds_epi8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i8, vqaddq_s8(a.neon_i8, b.neon_i8));
#else
simde__m128i r;
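	/* Portable saturating add: test against the INT8_MAX/INT8_MIN
	 * bounds before adding, since signed overflow is undefined
	 * behavior in C. */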
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) {
r.i8[i] = INT8_MAX;
} else if ((((b.i8[i]) < 0) &&
((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) {
r.i8[i] = INT8_MIN;
} else {
r.i8[i] = (a.i8[i]) + (b.i8[i]);
}
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_adds_epi16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_adds_epi16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i16, vqaddq_s16(a.neon_i16, b.neon_i16));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
if ((((b.i16[i]) > 0) &&
((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) {
r.i16[i] = INT16_MAX;
} else if ((((b.i16[i]) < 0) &&
((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) {
r.i16[i] = INT16_MIN;
} else {
r.i16[i] = (a.i16[i]) + (b.i16[i]);
}
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_adds_epu8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_adds_epu8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(u8, vqaddq_u8(a.neon_u8, b.neon_u8));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
r.u8[i] = ((UINT8_MAX - a.u8[i]) > b.u8[i])
? (a.u8[i] + b.u8[i])
: UINT8_MAX;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_adds_epu16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_adds_epu16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(u16, vqaddq_u16(a.neon_u16, b.neon_u16));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
r.u16[i] = ((UINT16_MAX - a.u16[i]) > b.u16[i])
? (a.u16[i] + b.u16[i])
: UINT16_MAX;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_and_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_and_pd(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128D_NEON_C(i32, vandq_s32(a.neon_i32, b.neon_i32));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
r.u64[i] = a.u64[i] & b.u64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_and_si128(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_and_si128(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i32, vandq_s32(b.neon_i32, a.neon_i32));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = a.i64[i] & b.i64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_andnot_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_andnot_pd(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
	/* vbicq(x, y) computes x & ~y, so ~a & b is vbicq(b, a). */
	return SIMDE__M128D_NEON_C(i32, vbicq_s32(b.neon_i32, a.neon_i32));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
r.u64[i] = ~a.u64[i] & b.u64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_andnot_si128(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_andnot_si128(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i32, vbicq_s32(b.neon_i32, a.neon_i32));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = ~(a.i64[i]) & b.i64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_avg_epu8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_avg_epu8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(u8, vrhaddq_u8(b.neon_u8, a.neon_u8));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_avg_epu16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_avg_epu16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(u16, vrhaddq_u16(b.neon_u16, a.neon_u16));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_bslli_si128(simde__m128i a, const int imm8)
{
simde__m128i r;
	if (HEDLEY_UNLIKELY(imm8 > 15)) {
		r.u64[0] = 0;
		r.u64[1] = 0;
		return r;
	}
	if (imm8 == 0) {
		/* Avoid the undefined 64 - s == 64 bit shift below. */
		return a;
	}
const int s = imm8 * 8;
#if defined(SIMDE__HAVE_INT128)
r.u128[0] = a.u128[0] << s;
#else
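	/* Without a 128-bit type, build the shift from the two 64-bit
	 * halves, carrying the low half's top bits into the high half. */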
if (s < 64) {
r.u64[0] = (a.u64[0] << s);
r.u64[1] = (a.u64[1] << s) | (a.u64[0] >> (64 - s));
} else {
r.u64[0] = 0;
r.u64[1] = a.u64[0] << (s - 64);
}
#endif
return r;
}
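/* When a native or NEON form is available, shadow the function above
 * with a macro so that a compile-time imm8 reaches the intrinsic,
 * which requires an immediate operand. */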
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
#define simde_mm_bslli_si128(a, imm8) SIMDE__M128I_C(_mm_slli_si128(a.n, imm8))
#elif defined(SIMDE_SSE2_NEON)
#define simde_mm_bslli_si128(a, imm8) \
SIMDE__M128I_NEON_C( \
i8, \
(((imm8) <= 0) ? ((a).neon_i8) \
: (((imm8) > 15) ? (vdupq_n_s8(0)) \
: (vextq_s8(vdupq_n_s8(0), \
(a).neon_i8, \
16 - (imm8))))))
#endif
#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_bsrli_si128(simde__m128i a, const int imm8)
{
simde__m128i r;
	if (HEDLEY_UNLIKELY(imm8 > 15)) {
		r.u64[0] = 0;
		r.u64[1] = 0;
		return r;
	}
	if (imm8 == 0) {
		/* Avoid the undefined 64 - s == 64 bit shift below. */
		return a;
	}
const int s = imm8 * 8;
#if defined(SIMDE__HAVE_INT128)
r.u128[0] = a.u128[0] >> s;
#else
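	/* Mirror of the left shift: carry the high half's low bits down
	 * into the low half. */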
if (s < 64) {
r.u64[0] = (a.u64[0] >> s) | (a.u64[1] << (64 - s));
r.u64[1] = (a.u64[1] >> s);
} else {
r.u64[0] = a.u64[1] >> (s - 64);
r.u64[1] = 0;
}
#endif
return r;
}
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
#define simde_mm_bsrli_si128(a, imm8) SIMDE__M128I_C(_mm_srli_si128(a.n, imm8))
#elif defined(SIMDE_SSE2_NEON)
#define simde_mm_bsrli_si128(a, imm8) \
SIMDE__M128I_NEON_C( \
i8, \
((imm8) <= 0) \
? ((a).neon_i8) \
: (((imm8) > 15) ? (vdupq_n_s8(0)) \
: (vextq_s8((a).neon_i8, \
vdupq_n_s8(0), (imm8)))))
#endif
#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128(a, imm8)
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_clflush(void const *p)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_clflush(p);
#else
(void)p;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_comieq_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_comieq_sd(a.n, b.n);
#else
return a.f64[0] == b.f64[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_comige_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_comige_sd(a.n, b.n);
#else
return a.f64[0] >= b.f64[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_comigt_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_comigt_sd(a.n, b.n);
#else
return a.f64[0] > b.f64[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_comile_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_comile_sd(a.n, b.n);
#else
return a.f64[0] <= b.f64[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_comilt_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_comilt_sd(a.n, b.n);
#else
return a.f64[0] < b.f64[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_comineq_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_comineq_sd(a.n, b.n);
#else
return a.f64[0] != b.f64[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128 simde_mm_castpd_ps(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128_C(_mm_castpd_ps(a.n));
#else
union {
simde__m128d pd;
simde__m128 ps;
} r;
r.pd = a;
return r.ps;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_castpd_si128(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_castpd_si128(a.n));
#else
union {
simde__m128d pd;
simde__m128i si128;
} r;
r.pd = a;
return r.si128;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_castps_pd(simde__m128 a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_castps_pd(a.n));
#else
union {
simde__m128 ps;
simde__m128d pd;
} r;
r.ps = a;
return r.pd;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_castps_si128(simde__m128 a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_castps_si128(a.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i32, a.neon_i32);
#else
union {
simde__m128 ps;
simde__m128i si128;
} r;
r.ps = a;
return r.si128;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_castsi128_pd(simde__m128i a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_castsi128_pd(a.n));
#else
union {
simde__m128i si128;
simde__m128d pd;
} r;
r.si128 = a;
return r.pd;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128 simde_mm_castsi128_ps(simde__m128i a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128_C(_mm_castsi128_ps(a.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128_NEON_C(f32, a.neon_f32);
#else
union {
simde__m128i si128;
simde__m128 ps;
} r;
r.si128 = a;
return r.ps;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cmpeq_epi8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_cmpeq_epi8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(
i8, vreinterpretq_s8_u8(vceqq_s8(b.neon_i8, a.neon_i8)));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
r.i8[i] = (a.i8[i] == b.i8[i]) ? 0xff : 0x00;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cmpeq_epi16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_cmpeq_epi16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(
i16, vreinterpretq_s16_u16(vceqq_s16(b.neon_i16, a.neon_i16)));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.i16[i] = (a.i16[i] == b.i16[i]) ? 0xffff : 0x0000;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cmpeq_epi32(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_cmpeq_epi32(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(
i32, vreinterpretq_s32_u32(vceqq_s32(b.neon_i32, a.neon_i32)));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = (a.i32[i] == b.i32[i]) ? 0xffffffff : 0x00000000;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpeq_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpeq_pd(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
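	/* Note: this compares bit patterns in 32-bit lanes, which is not
	 * IEEE equality; NaN and +0.0/-0.0 behave differently than with
	 * _mm_cmpeq_pd. */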
return SIMDE__M128D_NEON_C(
i32, vreinterpretq_s32_u32(
vceqq_s32(vreinterpretq_s32_f32(b.neon_f32),
vreinterpretq_s32_f32(a.neon_f32))));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.u64[i] = (a.f64[i] == b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpeq_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpeq_sd(a.n, b.n));
#else
simde__m128d r;
r.u64[0] = (a.f64[0] == b.f64[0]) ? ~UINT64_C(0) : 0;
r.u64[1] = a.u64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpneq_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpneq_pd(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
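	/* Note: like the cmpeq case, this is a bitwise 16-bit-lane
	 * compare, not IEEE inequality; NaN and signed-zero semantics
	 * differ from _mm_cmpneq_pd. */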
return SIMDE__M128D_NEON_C(f32,
vreinterpretq_f32_u16(vmvnq_u16(
vceqq_s16(b.neon_i16, a.neon_i16))));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.u64[i] = (a.f64[i] != b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpneq_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpneq_sd(a.n, b.n));
#else
simde__m128d r;
r.u64[0] = (a.f64[0] != b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
r.u64[1] = a.u64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cmplt_epi8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_cmplt_epi8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(
i8, vreinterpretq_s8_u8(vcltq_s8(a.neon_i8, b.neon_i8)));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
r.i8[i] = (a.i8[i] < b.i8[i]) ? 0xff : 0x00;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cmplt_epi16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_cmplt_epi16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(
i16, vreinterpretq_s16_u16(vcltq_s16(a.neon_i16, b.neon_i16)));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.i16[i] = (a.i16[i] < b.i16[i]) ? 0xffff : 0x0000;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cmplt_epi32(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_cmplt_epi32(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(
i32, vreinterpretq_s32_u32(vcltq_s32(a.neon_i32, b.neon_i32)));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = (a.i32[i] < b.i32[i]) ? 0xffffffff : 0x00000000;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmplt_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmplt_pd(a.n, b.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.u64[i] = (a.f64[i] < b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmplt_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmplt_sd(a.n, b.n));
#else
simde__m128d r;
r.u64[0] = (a.f64[0] < b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
r.u64[1] = a.u64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmple_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmple_pd(a.n, b.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.u64[i] = (a.f64[i] <= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmple_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmple_sd(a.n, b.n));
#else
simde__m128d r;
r.u64[0] = (a.f64[0] <= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
r.u64[1] = a.u64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cmpgt_epi8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_cmpgt_epi8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(
i8, vreinterpretq_s8_u8(vcgtq_s8(a.neon_i8, b.neon_i8)));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
r.i8[i] = (a.i8[i] > b.i8[i]) ? 0xff : 0x00;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cmpgt_epi16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_cmpgt_epi16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(
i16, vreinterpretq_s16_u16(vcgtq_s16(a.neon_i16, b.neon_i16)));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.i16[i] = (a.i16[i] > b.i16[i]) ? 0xffff : 0x0000;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cmpgt_epi32(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_cmpgt_epi32(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(
i32, vreinterpretq_s32_u32(vcgtq_s32(a.neon_i32, b.neon_i32)));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = (a.i32[i] > b.i32[i]) ? 0xffffffff : 0x00000000;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpgt_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpgt_pd(a.n, b.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.u64[i] = (a.f64[i] > b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpgt_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
return SIMDE__M128D_C(_mm_cmpgt_sd(a.n, b.n));
#else
simde__m128d r;
r.u64[0] = (a.f64[0] > b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
r.u64[1] = a.u64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpge_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpge_pd(a.n, b.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.u64[i] = (a.f64[i] >= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpge_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
return SIMDE__M128D_C(_mm_cmpge_sd(a.n, b.n));
#else
simde__m128d r;
r.u64[0] = (a.f64[0] >= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
r.u64[1] = a.u64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpnge_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpnge_pd(a.n, b.n));
#else
return simde_mm_cmplt_pd(a, b);
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpnge_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
return SIMDE__M128D_C(_mm_cmpnge_sd(a.n, b.n));
#else
return simde_mm_cmplt_sd(a, b);
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpnlt_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpnlt_pd(a.n, b.n));
#else
return simde_mm_cmpge_pd(a, b);
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpnlt_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpnlt_sd(a.n, b.n));
#else
return simde_mm_cmpge_sd(a, b);
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpnle_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpnle_pd(a.n, b.n));
#else
return simde_mm_cmpgt_pd(a, b);
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpnle_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpnle_sd(a.n, b.n));
#else
return simde_mm_cmpgt_sd(a, b);
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpord_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpord_pd(a.n, b.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.u64[i] = (!isnan(a.f64[i]) && !isnan(b.f64[i])) ? ~UINT64_C(0)
: UINT64_C(0);
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpord_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpord_sd(a.n, b.n));
#else
simde__m128d r;
r.u64[0] = (!isnan(a.f64[0]) && !isnan(b.f64[0])) ? ~UINT64_C(0)
: UINT64_C(0);
r.u64[1] = a.u64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpunord_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpunord_pd(a.n, b.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.u64[i] = (isnan(a.f64[i]) || isnan(b.f64[i])) ? ~UINT64_C(0)
: UINT64_C(0);
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cmpunord_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cmpunord_sd(a.n, b.n));
#else
simde__m128d r;
r.u64[0] = (isnan(a.f64[0]) || isnan(b.f64[0])) ? ~UINT64_C(0)
: UINT64_C(0);
r.u64[1] = a.u64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cvtepi32_pd(simde__m128i a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cvtepi32_pd(a.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.f64[i] = (simde_float64)a.i32[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128 simde_mm_cvtepi32_ps(simde__m128i a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128_C(_mm_cvtepi32_ps(a.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128_NEON_C(f32, vcvtq_f32_s32(a.neon_i32));
#else
simde__m128 r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
r.f32[i] = (simde_float32)a.i32[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cvtpd_epi32(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_cvtpd_epi32(a.n));
#else
simde__m128i r;
SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
		r.i32[i] = (int32_t)a.f64[i];
	}
	/* _mm_cvtpd_epi32 zeroes the upper two lanes of the result. */
	r.i64[1] = 0;
	return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cvtpd_pi32(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M64_C(_mm_cvtpd_pi32(a.n));
#else
simde__m64 r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = (int32_t)a.f64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128 simde_mm_cvtpd_ps(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128_C(_mm_cvtpd_ps(a.n));
#else
simde__m128 r;
SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
		r.f32[i] = (simde_float32)a.f64[i];
	}
	/* _mm_cvtpd_ps zeroes the upper two lanes of the result. */
	r.f32[2] = 0.0f;
	r.f32[3] = 0.0f;
	return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cvtpi32_pd(simde__m64 a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cvtpi32_pd(a.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.f64[i] = (simde_float64)a.i32[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cvtps_epi32(simde__m128 a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_cvtps_epi32(a.n));
#elif defined(SIMDE_SSE2_NEON)
	/* The default rounding mode on SSE is 'round to even', which
	   ARMv7 does not support!  It is supported on ARMv8, however. */
#if defined(SIMDE_ARCH_AARCH64)
return SIMDE__M128I_NEON_C(i32, vcvtnq_s32_f32(a.neon_f32));
#else
uint32x4_t signmask = vdupq_n_u32(0x80000000);
float32x4_t half = vbslq_f32(signmask, a.neon_f32,
vdupq_n_f32(0.5f)); /* +/- 0.5 */
int32x4_t r_normal = vcvtq_s32_f32(
vaddq_f32(a.neon_f32, half)); /* round to integer: [a + 0.5]*/
int32x4_t r_trunc =
vcvtq_s32_f32(a.neon_f32); /* truncate to integer: [a] */
int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */
int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
float32x4_t delta = vsubq_f32(
a.neon_f32,
vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
uint32x4_t is_delta_half =
vceqq_f32(delta, half); /* delta == +/- 0.5 */
return SIMDE__M128I_NEON_C(i32,
vbslq_s32(is_delta_half, r_even, r_normal));
#endif
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = (int32_t)a.f32[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cvtps_pd(simde__m128 a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cvtps_pd(a.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.f64[i] = a.f32[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
double simde_mm_cvtsd_f64(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
return _mm_cvtsd_f64(a.n);
#else
return a.f64[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int32_t simde_mm_cvtsd_si32(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_cvtsd_si32(a.n);
#else
return (int32_t)a.f64[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int64_t simde_mm_cvtsd_si64(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if defined(__PGI)
return _mm_cvtsd_si64x(a.n);
#else
return _mm_cvtsd_si64(a.n);
#endif
#else
	return (int64_t)a.f64[0];
#endif
}
#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
SIMDE__FUNCTION_ATTRIBUTES
simde__m128 simde_mm_cvtsd_ss(simde__m128 a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128_C(_mm_cvtsd_ss(a.n, b.n));
#else
simde__m128 r;
r.f32[0] = (simde_float32)b.f64[0];
SIMDE__VECTORIZE
for (size_t i = 1; i < (sizeof(r) / sizeof(r.i32[0])); i++) {
r.i32[i] = a.i32[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int32_t simde_mm_cvtsi128_si32(simde__m128i a)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_cvtsi128_si32(a.n);
#elif defined(SIMDE_SSE2_NEON)
return vgetq_lane_s32(a.neon_i32, 0);
#else
return a.i32[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int64_t simde_mm_cvtsi128_si64(simde__m128i a)
{
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if defined(__PGI)
return _mm_cvtsi128_si64x(a.n);
#else
return _mm_cvtsi128_si64(a.n);
#endif
#else
return a.i64[0];
#endif
}
#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cvtsi32_sd(simde__m128d a, int32_t b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_cvtsi32_sd(a.n, b));
#else
simde__m128d r;
r.f64[0] = (simde_float64)b;
r.i64[1] = a.i64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cvtsi32_si128(int32_t a)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_cvtsi32_si128(a);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
#else
r.i32[0] = a;
r.i32[1] = 0;
r.i32[2] = 0;
r.i32[3] = 0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cvtsi64_sd(simde__m128d a, int64_t b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if !defined(__PGI)
r.n = _mm_cvtsi64_sd(a.n, b);
#else
r.n = _mm_cvtsi64x_sd(a.n, b);
#endif
#else
r.f64[0] = (simde_float64)b;
r.f64[1] = a.f64[1];
#endif
return r;
}
#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cvtsi64_si128(int64_t a)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if !defined(__PGI)
r.n = _mm_cvtsi64_si128(a);
#else
r.n = _mm_cvtsi64x_si128(a);
#endif
#else
r.i64[0] = a;
r.i64[1] = 0;
#endif
return r;
}
#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_cvtss_sd(simde__m128d a, simde__m128 b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_cvtss_sd(a.n, b.n);
#else
r.f64[0] = b.f32[0];
r.i64[1] = a.i64[1];
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cvttpd_epi32(simde__m128d a)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_cvttpd_epi32(a.n);
#else
	for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
		r.i32[i] = (int32_t)trunc(a.f64[i]);
	}
	/* _mm_cvttpd_epi32 zeroes the upper two lanes of the result. */
	r.i64[1] = 0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_cvttpd_pi32(simde__m128d a)
{
simde__m64 r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_cvttpd_pi32(a.n);
#else
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = (int32_t)trunc(a.f64[i]);
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_cvttps_epi32(simde__m128 a)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_cvttps_epi32(a.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i32 = vcvtq_s32_f32(a.neon_f32);
#else
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = (int32_t)truncf(a.f32[i]);
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
int32_t simde_mm_cvttsd_si32(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_cvttsd_si32(a.n);
#else
return (int32_t)trunc(a.f64[0]);
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int64_t simde_mm_cvttsd_si64(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if !defined(__PGI)
return _mm_cvttsd_si64(a.n);
#else
return _mm_cvttsd_si64x(a.n);
#endif
#else
return (int64_t)trunc(a.f64[0]);
#endif
}
#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_div_pd(simde__m128d a, simde__m128d b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_div_pd(a.n, b.n);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.f64[i] = a.f64[i] / b.f64[i];
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_div_sd(simde__m128d a, simde__m128d b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_div_sd(a.n, b.n);
#else
r.f64[0] = a.f64[0] / b.f64[0];
r.f64[1] = a.f64[1];
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
int32_t simde_mm_extract_epi16(simde__m128i a, const int imm8)
{
return a.u16[imm8 & 7];
}
#if defined(SIMDE_SSE2_NATIVE) && \
(!defined(SIMDE__REALLY_GCC) || HEDLEY_GCC_VERSION_CHECK(4, 6, 0))
#define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a.n, imm8)
#elif defined(SIMDE_SSE2_NEON)
#define simde_mm_extract_epi16(a, imm8) \
(vgetq_lane_s16((a).neon_i16, (imm8)) & ((int32_t)UINT32_C(0x0000ffff)))
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_insert_epi16(simde__m128i a, int32_t i, const int imm8)
{
a.u16[imm8 & 7] = (int16_t)i;
return a;
}
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
#define simde_mm_insert_epi16(a, i, imm8) \
SIMDE__M128I_C(_mm_insert_epi16((a).n, (i), (imm8)))
#elif defined(SIMDE_SSE2_NEON)
#define simde_mm_insert_epi16(a, i, imm8) \
SIMDE__M128I_NEON_C(i16, vsetq_lane_s16((i), a.neon_i16, (imm8)))
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_load_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
{
simde__m128d r;
simde_assert_aligned(16, mem_addr);
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_load_pd(mem_addr);
#elif defined(SIMDE_SSE2_NEON)
r.neon_u32 = vld1q_u32((uint32_t const *)mem_addr);
#else
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
memcpy(&r, mem_addr, sizeof(r));
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_load_pd1(simde_float64 const *mem_addr)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_load_pd1(mem_addr);
#else
r.f64[0] = *mem_addr;
r.f64[1] = *mem_addr;
#endif
return r;
}
#define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr)
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_load_sd(simde_float64 const *mem_addr)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_load_sd(mem_addr);
#else
memcpy(&r, mem_addr, sizeof(simde_float64));
r.u64[1] = 0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_load_si128(simde__m128i const *mem_addr)
{
simde__m128i r;
simde_assert_aligned(16, mem_addr);
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_load_si128(&(mem_addr->n));
#elif defined(SIMDE_SSE2_NEON)
r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
#else
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
memcpy(&r, mem_addr, sizeof(r));
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_loadh_pd(simde__m128d a, simde_float64 const *mem_addr)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_loadh_pd(a.n, mem_addr);
#else
simde_float64 t;
memcpy(&t, mem_addr, sizeof(t));
r.f64[0] = a.f64[0];
r.f64[1] = t;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_loadl_epi64(simde__m128i const *mem_addr)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_loadl_epi64(&mem_addr->n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i32 = vcombine_s32(vld1_s32((int32_t const *)mem_addr),
vcreate_s32(0));
#else
r.u64[0] = mem_addr->u64[0];
r.u64[1] = 0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_loadl_pd(simde__m128d a, simde_float64 const *mem_addr)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_loadl_pd(a.n, mem_addr);
#else
memcpy(&r, mem_addr, sizeof(simde_float64));
r.u64[1] = a.u64[1];
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
{
simde__m128d r;
simde_assert_aligned(16, mem_addr);
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_loadr_pd(mem_addr);
#else
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
r.f64[0] = mem_addr[1];
r.f64[1] = mem_addr[0];
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d
simde_mm_loadu_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_loadu_pd(mem_addr);
#else
simde_float64 l, h;
memcpy(&l, &mem_addr[0], sizeof(l));
memcpy(&h, &mem_addr[1], sizeof(h));
r.f64[0] = l;
r.f64[1] = h;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_loadu_si128(simde__m128i const *mem_addr)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_loadu_si128(&((*mem_addr).n));
#elif defined(SIMDE_SSE2_NEON)
r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
#else
memcpy(&r, mem_addr, sizeof(r));
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_madd_epi16(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_madd_epi16(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
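	/* Widen each 16-bit half to 32-bit products with vmull, then
	 * pairwise-add adjacent products to form pmaddwd's horizontal
	 * sums. */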
int32x4_t pl =
vmull_s16(vget_low_s16(a.neon_i16), vget_low_s16(b.neon_i16));
int32x4_t ph =
vmull_s16(vget_high_s16(a.neon_i16), vget_high_s16(b.neon_i16));
int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
r.neon_i32 = vcombine_s32(rl, rh);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i += 2) {
r.i32[i / 2] =
(a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]);
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_maskmoveu_si128(simde__m128i a, simde__m128i mask,
int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)])
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_maskmoveu_si128(a.n, mask.n, (char *)mem_addr);
#else
for (size_t i = 0; i < 16; i++) {
if (mask.u8[i] & 0x80) {
mem_addr[i] = a.i8[i];
}
}
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int32_t simde_mm_movemask_epi8(simde__m128i a)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_movemask_epi8(a.n);
#elif defined(SIMDE_SSE2_NEON)
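	/* Shift each byte's sign bit to a unique bit position (vshl with
	 * negative counts shifts right), then pairwise-add three times so
	 * each 8-byte half accumulates into a single mask byte. */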
uint8x16_t input = a.neon_u8;
SIMDE_ALIGN(16)
static const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
uint8x8_t mask_and = vdup_n_u8(0x80);
int8x8_t mask_shift = vld1_s8(xr);
uint8x8_t lo = vget_low_u8(input);
uint8x8_t hi = vget_high_u8(input);
lo = vand_u8(lo, mask_and);
lo = vshl_u8(lo, mask_shift);
hi = vand_u8(hi, mask_and);
hi = vshl_u8(hi, mask_shift);
lo = vpadd_u8(lo, lo);
lo = vpadd_u8(lo, lo);
lo = vpadd_u8(lo, lo);
hi = vpadd_u8(hi, hi);
hi = vpadd_u8(hi, hi);
hi = vpadd_u8(hi, hi);
return ((hi[0] << 8) | (lo[0] & 0xFF));
#else
int32_t r = 0;
SIMDE__VECTORIZE_REDUCTION(| : r)
for (size_t i = 0; i < 16; i++) {
r |= (a.u8[15 - i] >> 7) << (15 - i);
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int32_t simde_mm_movemask_pd(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_movemask_pd(a.n);
#else
int32_t r = 0;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(a.u64) / sizeof(a.u64[0])); i++) {
r |= (a.u64[i] >> 63) << i;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_movepi64_pi64(simde__m128i a)
{
simde__m64 r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_movepi64_pi64(a.n);
#else
r.i64[0] = a.i64[0];
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_movpi64_epi64(simde__m64 a)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_movpi64_epi64(a.n);
#else
r.i64[0] = a.i64[0];
r.i64[1] = 0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_min_epi16(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_min_epi16(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i16 = vminq_s16(a.neon_i16, b.neon_i16);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i];
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_min_epu8(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_min_epu8(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_u8 = vminq_u8(a.neon_u8, b.neon_u8);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i];
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_min_pd(simde__m128d a, simde__m128d b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_min_pd(a.n, b.n);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.f64[i] = (a.f64[i] < b.f64[i]) ? a.f64[i] : b.f64[i];
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_min_sd(simde__m128d a, simde__m128d b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_min_sd(a.n, b.n);
#else
r.f64[0] = (a.f64[0] < b.f64[0]) ? a.f64[0] : b.f64[0];
r.f64[1] = a.f64[1];
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_max_epi16(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_max_epi16(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i16 = vmaxq_s16(a.neon_i16, b.neon_i16);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i];
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_max_epu8(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_max_epu8(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_u8 = vmaxq_u8(a.neon_u8, b.neon_u8);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i];
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_max_pd(simde__m128d a, simde__m128d b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_max_pd(a.n, b.n);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.f64[i] = (a.f64[i] > b.f64[i]) ? a.f64[i] : b.f64[i];
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_max_sd(simde__m128d a, simde__m128d b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_max_sd(a.n, b.n);
#else
r.f64[0] = (a.f64[0] > b.f64[0]) ? a.f64[0] : b.f64[0];
r.f64[1] = a.f64[1];
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_move_epi64(simde__m128i a)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_move_epi64(a.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i64 = vsetq_lane_s64(0, a.neon_i64, 1);
#else
r.i64[0] = a.i64[0];
r.i64[1] = 0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_move_sd(simde__m128d a, simde__m128d b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_move_sd(a.n, b.n);
#else
r.f64[0] = b.f64[0];
r.f64[1] = a.f64[1];
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_mul_epu32(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_mul_epu32(a.n, b.n);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
r.u64[i] = ((uint64_t)a.u32[i * 2]) * ((uint64_t)b.u32[i * 2]);
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_x_mm_mul_epi64(simde__m128i a, simde__m128i b)
{
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = a.i64[i] * b.i64[i];
}
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_x_mm_mod_epi64(simde__m128i a, simde__m128i b)
{
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = a.i64[i] % b.i64[i];
}
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_mul_pd(simde__m128d a, simde__m128d b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_mul_pd(a.n, b.n);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.f64[i] = a.f64[i] * b.f64[i];
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_mul_sd(simde__m128d a, simde__m128d b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_mul_sd(a.n, b.n);
#else
r.f64[0] = a.f64[0] * b.f64[0];
r.f64[1] = a.f64[1];
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_mul_su32(simde__m64 a, simde__m64 b)
{
simde__m64 r;
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
r.n = _mm_mul_su32(a.n, b.n);
#else
r.u64[0] = ((uint64_t)a.u32[0]) * ((uint64_t)b.u32[0]);
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_mulhi_epi16(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_mulhi_epi16(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
int16x4_t a3210 = vget_low_s16(a.neon_i16);
int16x4_t b3210 = vget_low_s16(b.neon_i16);
int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
int16x4_t a7654 = vget_high_s16(a.neon_i16);
int16x4_t b7654 = vget_high_s16(b.neon_i16);
int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
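	/* Unzip the 32-bit products into even (low) and odd (high) 16-bit
	 * halves; val[1] holds the high halves, which is the pmulhw
	 * result. */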
uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210),
vreinterpretq_u16_s32(ab7654));
r.neon_u16 = rv.val[1];
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
((int32_t)b.i16[i]))) >>
16);
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_mulhi_epu16(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
r.n = _mm_mulhi_epu16(a.n, b.n);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
r.u16[i] = (uint16_t)(
(((uint32_t)a.u16[i]) * ((uint32_t)b.u16[i])) >> 16);
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_mullo_epi16(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_mullo_epi16(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i16 = vmulq_s16(a.neon_i16, b.neon_i16);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
((int32_t)b.i16[i]))) &
0xffff);
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_or_pd(simde__m128d a, simde__m128d b)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_or_pd(a.n, b.n);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = a.i64[i] | b.i64[i];
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_or_si128(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_or_si128(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = a.i64[i] | b.i64[i];
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_packs_epi16(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_packs_epi16(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i8 = vcombine_s8(vqmovn_s16(a.neon_i16), vqmovn_s16(b.neon_i16));
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.i8[i] = (a.i16[i] > INT8_MAX)
? INT8_MAX
: ((a.i16[i] < INT8_MIN)
? INT8_MIN
: ((int8_t)a.i16[i]));
r.i8[i + 8] = (b.i16[i] > INT8_MAX)
? INT8_MAX
: ((b.i16[i] < INT8_MIN)
? INT8_MIN
: ((int8_t)b.i16[i]));
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_packs_epi32(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_packs_epi32(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i16 =
vcombine_s16(vqmovn_s32(a.neon_i32), vqmovn_s32(b.neon_i32));
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i16[i] = (a.i32[i] > INT16_MAX)
? INT16_MAX
: ((a.i32[i] < INT16_MIN)
? INT16_MIN
: ((int16_t)a.i32[i]));
r.i16[i + 4] = (b.i32[i] > INT16_MAX)
? INT16_MAX
: ((b.i32[i] < INT16_MIN)
? INT16_MIN
: ((int16_t)b.i32[i]));
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_packus_epi16(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_packus_epi16(a.n, b.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_u8 =
vcombine_u8(vqmovun_s16(a.neon_i16), vqmovun_s16(b.neon_i16));
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
		r.u8[i] = (a.i16[i] > UINT8_MAX)
				  ? UINT8_MAX
				  : ((a.i16[i] < 0) ? 0 : ((uint8_t)a.i16[i]));
		r.u8[i + 8] =
			(b.i16[i] > UINT8_MAX)
				? UINT8_MAX
				: ((b.i16[i] < 0) ? 0 : ((uint8_t)b.i16[i]));
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_pause(void)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_pause();
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_sad_epu8(simde__m128i a, simde__m128i b)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_sad_epu8(a.n, b.n);
#else
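	/* Sum of absolute differences over each 8-byte half; each 16-bit
	 * total widens into the corresponding 64-bit lane. */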
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
uint16_t tmp = 0;
SIMDE__VECTORIZE_REDUCTION(+ : tmp)
for (size_t j = 0; j < ((sizeof(r.u8) / sizeof(r.u8[0])) / 2);
j++) {
const size_t e = j + (i * 8);
tmp += (a.u8[e] > b.u8[e]) ? (a.u8[e] - b.u8[e])
: (b.u8[e] - a.u8[e]);
}
r.i64[i] = tmp;
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_set_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
int8_t e11, int8_t e10, int8_t e9, int8_t e8,
int8_t e7, int8_t e6, int8_t e5, int8_t e4,
int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4,
e3, e2, e1, e0);
#else
r.i8[0] = e0;
r.i8[1] = e1;
r.i8[2] = e2;
r.i8[3] = e3;
r.i8[4] = e4;
r.i8[5] = e5;
r.i8[6] = e6;
r.i8[7] = e7;
r.i8[8] = e8;
r.i8[9] = e9;
r.i8[10] = e10;
r.i8[11] = e11;
r.i8[12] = e12;
r.i8[13] = e13;
r.i8[14] = e14;
r.i8[15] = e15;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_set_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
int16_t e3, int16_t e2, int16_t e1, int16_t e0)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
#elif defined(SIMDE_SSE2_NEON)
SIMDE_ALIGN(16) int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7};
r.neon_i16 = vld1q_s16(data);
#else
r.i16[0] = e0;
r.i16[1] = e1;
r.i16[2] = e2;
r.i16[3] = e3;
r.i16[4] = e4;
r.i16[5] = e5;
r.i16[6] = e6;
r.i16[7] = e7;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set_epi32(e3, e2, e1, e0);
#elif defined(SIMDE_SSE2_NEON)
SIMDE_ALIGN(16) int32_t data[4] = {e0, e1, e2, e3};
r.neon_i32 = vld1q_s32(data);
#else
r.i32[0] = e0;
r.i32[1] = e1;
r.i32[2] = e2;
r.i32[3] = e3;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_set_epi64(simde__m64 e1, simde__m64 e0)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set_epi64(e1.n, e0.n);
#else
r.i64[0] = e0.i64[0];
r.i64[1] = e1.i64[0];
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_set_epi64x(int64_t e1, int64_t e0)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set_epi64x(e1, e0);
#elif defined(SIMDE_SSE2_NEON)
r = SIMDE__M128I_NEON_C(i64,
vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1)));
#else
r.i64[0] = e0;
r.i64[1] = e1;
#endif
return r;
}
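/* The simde_x_* setters below are SIMDe-specific helpers for the
 * unsigned element types; they have no direct Intel intrinsic
 * equivalent, which is why they carry no SIMDE_SSE2_NATIVE path. */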
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_x_mm_set_epu8(uint8_t e15, uint8_t e14, uint8_t e13,
uint8_t e12, uint8_t e11, uint8_t e10,
uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6,
uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2,
uint8_t e1, uint8_t e0)
{
simde__m128i r;
r.u8[0] = e0;
r.u8[1] = e1;
r.u8[2] = e2;
r.u8[3] = e3;
r.u8[4] = e4;
r.u8[5] = e5;
r.u8[6] = e6;
r.u8[7] = e7;
r.u8[8] = e8;
r.u8[9] = e9;
r.u8[10] = e10;
r.u8[11] = e11;
r.u8[12] = e12;
r.u8[13] = e13;
r.u8[14] = e14;
r.u8[15] = e15;
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_x_mm_set_epu16(uint16_t e7, uint16_t e6, uint16_t e5,
uint16_t e4, uint16_t e3, uint16_t e2,
uint16_t e1, uint16_t e0)
{
simde__m128i r;
r.u16[0] = e0;
r.u16[1] = e1;
r.u16[2] = e2;
r.u16[3] = e3;
r.u16[4] = e4;
r.u16[5] = e5;
r.u16[6] = e6;
r.u16[7] = e7;
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_x_mm_set_epu32(uint32_t e3, uint32_t e2, uint32_t e1,
uint32_t e0)
{
simde__m128i r;
r.u32[0] = e0;
r.u32[1] = e1;
r.u32[2] = e2;
r.u32[3] = e3;
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_x_mm_set_epu64x(uint64_t e1, uint64_t e0)
{
simde__m128i r;
r.u64[0] = e0;
r.u64[1] = e1;
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_set_pd(simde_float64 e1, simde_float64 e0)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set_pd(e1, e0);
#else
r.f64[0] = e0;
r.f64[1] = e1;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_set_pd1(simde_float64 a)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set1_pd(a);
#else
r.f64[0] = a;
r.f64[1] = a;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_set_sd(simde_float64 a)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set_sd(a);
#else
r.f64[0] = a;
r.u64[1] = 0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_set1_epi8(int8_t a)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set1_epi8(a);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i8 = vdupq_n_s8(a);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
r.i8[i] = a;
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_set1_epi16(int16_t a)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set1_epi16(a);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i16 = vdupq_n_s16(a);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.i16[i] = a;
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_set1_epi32(int32_t a)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set1_epi32(a);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i32 = vdupq_n_s32(a);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = a;
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_set1_epi64x(int64_t a)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set1_epi64x(a);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i64 = vmovq_n_s64(a);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = a;
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_set1_epi64(simde__m64 a)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set1_epi64(a.n);
#else
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = a.i64[0];
}
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_set1_pd(simde_float64 a)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_set1_pd(a);
#else
SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.f64[i] = a;
}
#endif
return r;
}
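/* The _mm_setr_* family is the reverse of _mm_set_*: arguments are
 * given in memory order, so the *first* argument lands in lane 0. */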
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_setr_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
int8_t e11, int8_t e10, int8_t e9, int8_t e8,
int8_t e7, int8_t e6, int8_t e5, int8_t e4,
int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,
e4, e3, e2, e1, e0);
#elif defined(SIMDE_SSE2_NEON)
int8_t t[] = {e15, e14, e13, e12, e11, e10, e9, e8,
e7, e6, e5, e4, e3, e2, e1, e0};
r.neon_i8 = vld1q_s8(t);
#else
r.i8[0] = e15;
r.i8[1] = e14;
r.i8[2] = e13;
r.i8[3] = e12;
r.i8[4] = e11;
r.i8[5] = e10;
r.i8[6] = e9;
r.i8[7] = e8;
r.i8[8] = e7;
r.i8[9] = e6;
r.i8[10] = e5;
r.i8[11] = e4;
r.i8[12] = e3;
r.i8[13] = e2;
r.i8[14] = e1;
r.i8[15] = e0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_setr_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
int16_t e3, int16_t e2, int16_t e1, int16_t e0)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
#elif defined(SIMDE_SSE2_NEON)
int16_t t[] = {e7, e6, e5, e4, e3, e2, e1, e0};
r.neon_i16 = vld1q_s16(t);
#else
r.i16[0] = e7;
r.i16[1] = e6;
r.i16[2] = e5;
r.i16[3] = e4;
r.i16[4] = e3;
r.i16[5] = e2;
r.i16[6] = e1;
r.i16[7] = e0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_setr_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_setr_epi32(e3, e2, e1, e0);
#elif defined(SIMDE_SSE2_NEON)
int32_t t[] = {e3, e2, e1, e0};
r.neon_i32 = vld1q_s32(t);
#else
r.i32[0] = e3;
r.i32[1] = e2;
r.i32[2] = e1;
r.i32[3] = e0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_setr_epi64(simde__m64 e1, simde__m64 e0)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_setr_epi64(e1.n, e0.n);
#elif defined(SIMDE_SSE2_NEON)
r.neon_i64 = vcombine_s64(e1.neon_i64, e0.neon_i64);
#else
r.i64[0] = e1.i64[0];
r.i64[1] = e0.i64[0];
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_setr_pd(simde_float64 e1, simde_float64 e0)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_setr_pd(e1, e0);
#else
r.f64[0] = e1;
r.f64[1] = e0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_setzero_pd(void)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_setzero_pd();
#else
r.u64[0] = 0;
r.u64[1] = 0;
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_setzero_si128(void)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE)
r.n = _mm_setzero_si128();
#elif defined(SIMDE_SSE2_NEON)
r.neon_i32 = vdupq_n_s32(0);
#else
r.u64[0] = 0;
r.u64[1] = 0;
#endif
return r;
}
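/* Each 2-bit field of imm8 selects one of the four 32-bit lanes of a;
 * for example imm8 = 0x1B (0b00011011) reverses the vector.  The macro
 * versions below replace the function whenever possible so that imm8
 * stays a compile-time constant, as the native intrinsic requires. */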
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_shuffle_epi32(simde__m128i a, const int imm8)
{
simde__m128i r;
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = a.i32[(imm8 >> (i * 2)) & 3];
}
return r;
}
#if defined(SIMDE_SSE2_NATIVE)
#define simde_mm_shuffle_epi32(a, imm8) \
SIMDE__M128I_C(_mm_shuffle_epi32((a).n, (imm8)))
#elif defined(SIMDE__SHUFFLE_VECTOR)
#define simde_mm_shuffle_epi32(a, imm8) \
({ \
const simde__m128i simde__tmp_a_ = a; \
(simde__m128i){.i32 = SIMDE__SHUFFLE_VECTOR( \
32, 16, (simde__tmp_a_).i32, \
(simde__tmp_a_).i32, ((imm8)) & 3, \
((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \
((imm8) >> 6) & 3)}; \
})
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_shuffle_pd(simde__m128d a, simde__m128d b, const int imm8)
{
simde__m128d r;
r.f64[0] = ((imm8 & 1) == 0) ? a.f64[0] : a.f64[1];
r.f64[1] = ((imm8 & 2) == 0) ? b.f64[0] : b.f64[1];
return r;
}
#if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
#define simde_mm_shuffle_pd(a, b, imm8) \
SIMDE__M128D_C(_mm_shuffle_pd((a).n, (b).n, (imm8)))
#elif defined(SIMDE__SHUFFLE_VECTOR)
#define simde_mm_shuffle_pd(a, b, imm8) \
({ \
(simde__m128d){.f64 = SIMDE__SHUFFLE_VECTOR( \
64, 16, (a).f64, (b).f64, \
(((imm8)) & 1), \
(((imm8) >> 1) & 1) + 2)}; \
})
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_shufflehi_epi16(simde__m128i a, const int imm8)
{
simde__m128i r;
r.i64[0] = a.i64[0];
for (size_t i = 4; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.i16[i] = a.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
}
return r;
}
#if defined(SIMDE_SSE2_NATIVE)
#define simde_mm_shufflehi_epi16(a, imm8) \
SIMDE__M128I_C(_mm_shufflehi_epi16((a).n, (imm8)))
#elif defined(SIMDE__SHUFFLE_VECTOR)
#define simde_mm_shufflehi_epi16(a, imm8) \
({ \
const simde__m128i simde__tmp_a_ = a; \
(simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \
16, 16, (simde__tmp_a_).i16, \
(simde__tmp_a_).i16, 0, 1, 2, 3, \
(((imm8)) & 3) + 4, \
(((imm8) >> 2) & 3) + 4, \
(((imm8) >> 4) & 3) + 4, \
(((imm8) >> 6) & 3) + 4)}; \
})
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_shufflelo_epi16(simde__m128i a, const int imm8)
{
simde__m128i r;
for (size_t i = 0; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2); i++) {
r.i16[i] = a.i16[((imm8 >> (i * 2)) & 3)];
}
r.i64[1] = a.i64[1];
return r;
}
#if defined(SIMDE_SSE2_NATIVE)
#define simde_mm_shufflelo_epi16(a, imm8) \
SIMDE__M128I_C(_mm_shufflelo_epi16((a).n, (imm8)))
#elif defined(SIMDE__SHUFFLE_VECTOR)
#define simde_mm_shufflelo_epi16(a, imm8) \
({ \
const simde__m128i simde__tmp_a_ = a; \
(simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \
16, 16, (simde__tmp_a_).i16, \
(simde__tmp_a_).i16, (((imm8)) & 3), \
(((imm8) >> 2) & 3), \
(((imm8) >> 4) & 3), \
(((imm8) >> 6) & 3), 4, 5, 6, 7)}; \
})
#endif
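/* The sll/srl (register-count) shifts read the count from the low
 * 64 bits of `count`; a count past the element-width limit (e.g. > 15
 * for 16-bit lanes) zeroes every lane, which the fallbacks check for
 * explicitly. */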
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_sll_epi16(simde__m128i a, simde__m128i count)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_sll_epi16(a.n, count.n));
#else
simde__m128i r;
if (count.u64[0] > 15)
return simde_mm_setzero_si128();
const int s = (int)(count.u64[0]);
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
r.u16[i] = a.u16[i] << s;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_sll_epi32(simde__m128i a, simde__m128i count)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_sll_epi32(a.n, count.n));
#else
simde__m128i r;
if (count.u64[0] > 31)
return simde_mm_setzero_si128();
const int s = (int)(count.u64[0]);
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = a.i32[i] << s;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_sll_epi64(simde__m128i a, simde__m128i count)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_sll_epi64(a.n, count.n));
#else
simde__m128i r;
if (count.u64[0] > 63)
return simde_mm_setzero_si128();
const int s = (int)(count.u64[0]);
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = a.i64[i] << s;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_sqrt_pd(simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_sqrt_pd(a.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.f64[i] = sqrt(a.f64[i]);
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_sqrt_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_sqrt_sd(a.n, b.n));
#else
simde__m128d r;
r.f64[0] = sqrt(b.f64[0]);
r.f64[1] = a.f64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_srl_epi16(simde__m128i a, simde__m128i count)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_srl_epi16(a.n, count.n));
#else
simde__m128i r;
if (count.u64[0] > 15)
return simde_mm_setzero_si128();
const int s = (int)(count.u64[0]);
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
r.u16[i] = a.u16[i] >> s;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_srl_epi32(simde__m128i a, simde__m128i count)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_srl_epi32(a.n, count.n));
#else
simde__m128i r;
if (count.u64[0] > 31)
return simde_mm_setzero_si128();
const int s = (int)(count.u64[0]);
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
r.u32[i] = a.u32[i] >> s;
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_srl_epi64(simde__m128i a, simde__m128i count)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_srl_epi64(a.n, count.n));
#else
simde__m128i r;
	if (count.u64[0] > 63)
return simde_mm_setzero_si128();
const int s = (int)(count.u64[0]);
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
r.u64[i] = a.u64[i] >> s;
}
return r;
#endif
}
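/* Arithmetic right shifts: unlike the logical shifts above, a count at
 * or beyond the element width fills each lane with copies of its sign
 * bit (all-ones for negative inputs, zero otherwise).  The fallbacks
 * implement this by clamping the count and OR-ing in a sign mask. */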
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_srai_epi16(simde__m128i a, int imm8)
{
simde__m128i r;
	/* Counts past the element width behave as a shift by 15: every
	 * bit becomes a copy of the sign bit. */
	const int cnt = (imm8 > 15) ? 15 : imm8;
	const uint16_t m =
		(uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r) / sizeof(r.u16[0])); i++) {
		const uint16_t is_neg = ((uint16_t)(
			((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1))));
		r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
	}
return r;
}
#if defined(SIMDE_SSE2_NATIVE)
#define simde_mm_srai_epi16(a, imm8) \
	SIMDE__M128I_C(_mm_srai_epi16((a).n, (imm8)))
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_srai_epi32(simde__m128i a, int imm8)
{
simde__m128i r;
	/* Counts past the element width behave as a shift by 31.  The
	 * mask is built as ~(~0 >> cnt) so that cnt == 0 does not shift
	 * a 32-bit value by 32, which would be undefined behaviour. */
	const int cnt = (imm8 > 31) ? 31 : imm8;
	const uint32_t m = ~(~UINT32_C(0) >> cnt);
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r) / sizeof(r.u32[0])); i++) {
		const uint32_t is_neg = ((uint32_t)(
			((a.u32[i]) >> ((sizeof(int32_t) * CHAR_BIT) - 1))));
		r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
	}
return r;
}
#if defined(SIMDE_SSE2_NATIVE)
#define simde_mm_srai_epi32(a, imm8) \
SIMDE__M128I_C(_mm_srai_epi32((a).n, (imm8)))
#elif defined(SIMDE_SSE2_NEON)
#define simde_mm_srai_epi32(a, imm8) \
SIMDE__M128I_NEON_C( \
i32, \
((imm8) <= 0) \
? (a.neon_i32) \
: (((imm8) > 31) \
? (vshrq_n_s32(vshrq_n_s32(a.neon_i32, 16), \
16)) \
: (vshrq_n_s32(a.neon_i32, (imm8)))))
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_sra_epi16(simde__m128i a, simde__m128i count)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_sra_epi16(a.n, count.n));
#else
simde__m128i r;
	const uint64_t cnt = count.u64[0];
	if (cnt > 15) {
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
i++) {
r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
}
} else {
const uint16_t m = (uint16_t)(
(~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
i++) {
const uint16_t is_neg = a.i16[i] < 0;
r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
}
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_sra_epi32(simde__m128i a, simde__m128i count)
{
#if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
return SIMDE__M128I_C(_mm_sra_epi32(a.n, count.n));
#else
simde__m128i r;
const uint64_t cnt = count.u64[0];
if (cnt > 31) {
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
i++) {
r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
}
} else if (cnt == 0) {
memcpy(&r, &a, sizeof(r));
} else {
const uint32_t m = (uint32_t)(
(~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt));
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
i++) {
const uint32_t is_neg = a.i32[i] < 0;
r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
}
}
return r;
#endif
}
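/* Immediate shifts: a count greater than (element width - 1) zeroes
 * the destination, matching the native psllw/pslld/psllq and
 * psrlw/psrld instructions (srai, above, is the exception). */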
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_slli_epi16(simde__m128i a, const int imm8)
{
	simde__m128i r;
	/* A count past the element width zeroes every lane. */
	if (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1)
		return simde_mm_setzero_si128();
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
		r.i16[i] = a.i16[i] << imm8;
	}
return r;
}
#if defined(SIMDE_SSE2_NATIVE)
#define simde_mm_slli_epi16(a, imm8) \
	SIMDE__M128I_C(_mm_slli_epi16((a).n, (imm8)))
#elif defined(SIMDE_SSE2_NEON)
#define simde_mm_slli_epi16(a, imm8) \
	SIMDE__M128I_NEON_C( \
		i16, ((imm8) <= 0) \
			     ? ((a).neon_i16) \
			     : (((imm8) > 15) ? (vdupq_n_s16(0)) \
					      : (vshlq_n_s16((a).neon_i16, \
							     (imm8)))))
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_slli_epi32(simde__m128i a, const int imm8)
{
	simde__m128i r;
	/* A count past the element width zeroes every lane. */
	if (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1)
		return simde_mm_setzero_si128();
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
		r.i32[i] = a.i32[i] << imm8;
	}
return r;
}
#if defined(SIMDE_SSE2_NATIVE)
#define simde_mm_slli_epi32(a, imm8) \
	SIMDE__M128I_C(_mm_slli_epi32((a).n, (imm8)))
#elif defined(SIMDE_SSE2_NEON)
#define simde_mm_slli_epi32(a, imm8) \
SIMDE__M128I_NEON_C( \
i32, ((imm8) <= 0) \
? ((a).neon_i32) \
: (((imm8) > 31) ? (vdupq_n_s32(0)) \
: (vshlq_n_s32((a).neon_i32, \
(imm8)))))
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_slli_epi64(simde__m128i a, const int imm8)
{
	simde__m128i r;
	/* A count past the element width zeroes every lane. */
	if (imm8 > ((int)sizeof(r.i64[0]) * CHAR_BIT) - 1)
		return simde_mm_setzero_si128();
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
		r.i64[i] = a.i64[i] << imm8;
	}
return r;
}
#if defined(SIMDE_SSE2_NATIVE)
#define simde_mm_slli_epi64(a, imm8) \
	SIMDE__M128I_C(_mm_slli_epi64((a).n, (imm8)))
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_srli_epi16(simde__m128i a, const int imm8)
{
	simde__m128i r;
	/* A count past the element width zeroes every lane. */
	if (imm8 > ((int)sizeof(r.u16[0]) * CHAR_BIT) - 1)
		return simde_mm_setzero_si128();
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
		r.u16[i] = a.u16[i] >> imm8;
	}
return r;
}
#if defined(SIMDE_SSE2_NATIVE)
#define simde_mm_srli_epi16(a, imm8) \
	SIMDE__M128I_C(_mm_srli_epi16((a).n, (imm8)))
#elif defined(SIMDE_SSE2_NEON)
#define simde_mm_srli_epi16(a, imm8) \
	SIMDE__M128I_NEON_C( \
		u16, ((imm8) <= 0) \
			     ? ((a).neon_u16) \
			     : (((imm8) > 15) ? (vdupq_n_u16(0)) \
					      : (vshrq_n_u16((a).neon_u16, \
							     (imm8)))))
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_srli_epi32(simde__m128i a, const int imm8)
{
	simde__m128i r;
	/* A count past the element width zeroes every lane. */
	if (imm8 > ((int)sizeof(r.u32[0]) * CHAR_BIT) - 1)
		return simde_mm_setzero_si128();
	SIMDE__VECTORIZE
	for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
		r.u32[i] = a.u32[i] >> imm8;
	}
return r;
}
#if defined(SIMDE_SSE2_NATIVE)
#define simde_mm_srli_epi32(a, imm8) \
	SIMDE__M128I_C(_mm_srli_epi32((a).n, (imm8)))
#elif defined(SIMDE_SSE2_NEON)
#define simde_mm_srli_epi32(a, imm8) \
SIMDE__M128I_NEON_C( \
u32, ((imm8) <= 0) \
? ((a).neon_u32) \
: (((imm8) > 31) ? (vdupq_n_u32(0)) \
: (vshrq_n_u32((a).neon_u32, \
(imm8)))))
#endif
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_srli_epi64(simde__m128i a, const int imm8)
{
simde__m128i r;
const unsigned char s = imm8 & 255;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
if (s > 63) {
r.u64[i] = 0;
} else {
r.u64[i] = a.u64[i] >> s;
}
}
return r;
}
#if defined(SIMDE_SSE2_NATIVE)
#define simde_mm_srli_epi64(a, imm8) \
	SIMDE__M128I_C(_mm_srli_epi64((a).n, (imm8)))
#elif defined(SIMDE_SSE2_NEON)
#define simde_mm_srli_epi64(a, imm8) \
	SIMDE__M128I_NEON_C( \
		u64, \
		(((imm8)&255) > 63) \
			? (vdupq_n_u64(0)) \
			: ((((imm8)&255) == 0) \
				   ? ((a).neon_u64) \
				   : (vshrq_n_u64((a).neon_u64, (imm8)&255))))
#endif
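/* Stores.  The plain store variants below require mem_addr to be
 * 16-byte aligned; the *u* (unaligned) variants do not.  The stream
 * variants are only non-temporal cache hints, so the portable
 * fallback is an ordinary store. */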
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_store_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
simde__m128d a)
{
simde_assert_aligned(16, mem_addr);
#if defined(SIMDE_SSE2_NATIVE)
_mm_store_pd(mem_addr, a.n);
#else
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
memcpy(mem_addr, &a, sizeof(a));
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
simde__m128d a)
{
simde_assert_aligned(16, mem_addr);
#if defined(SIMDE_SSE2_NATIVE)
_mm_store1_pd(mem_addr, a.n);
#else
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
mem_addr[0] = a.f64[0];
mem_addr[1] = a.f64[0];
#endif
}
#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(mem_addr, a)
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_store_sd(simde_float64 *mem_addr, simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_store_sd(mem_addr, a.n);
#else
memcpy(mem_addr, &a, sizeof(a.f64[0]));
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_store_si128(simde__m128i *mem_addr, simde__m128i a)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_store_si128(&mem_addr->n, a.n);
#elif defined(SIMDE_SSE2_NEON)
vst1q_s32((int32_t *)mem_addr, a.neon_i32);
#else
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
memcpy(mem_addr, &a, sizeof(a));
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_storeh_pd(simde_float64 *mem_addr, simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_storeh_pd(mem_addr, a.n);
#else
*mem_addr = a.f64[1];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_storel_epi64(simde__m128i *mem_addr, simde__m128i a)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_storel_epi64(&(mem_addr->n), a.n);
#elif defined(SIMDE_SSE2_NEON)
mem_addr->i64[0] = vgetq_lane_s64(a.neon_i64, 0);
#else
mem_addr->i64[0] = a.i64[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_storel_pd(simde_float64 *mem_addr, simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_storel_pd(mem_addr, a.n);
#else
*mem_addr = a.f64[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_storer_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
			simde__m128d a)
{
simde_assert_aligned(16, mem_addr);
#if defined(SIMDE_SSE2_NATIVE)
_mm_storer_pd(mem_addr, a.n);
#else
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
mem_addr[0] = a.f64[1];
mem_addr[1] = a.f64[0];
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_storeu_pd(simde_float64 *mem_addr, simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_storeu_pd(mem_addr, a.n);
#else
memcpy(mem_addr, &a, sizeof(a));
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_storeu_si128(simde__m128i *mem_addr, simde__m128i a)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_storeu_si128(&mem_addr->n, a.n);
#elif defined(SIMDE_SSE2_NEON)
int32_t v[4];
vst1q_s32(v, a.neon_i32);
memcpy(mem_addr, v, sizeof(v));
#else
memcpy(mem_addr, &a, sizeof(a));
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_stream_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
simde__m128d a)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_stream_pd(mem_addr, a.n);
#else
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
memcpy(mem_addr, &a, sizeof(a));
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_stream_si128(simde__m128i *mem_addr, simde__m128i a)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_stream_si128(&mem_addr->n, a.n);
#else
SIMDE__ASSUME_ALIGNED(mem_addr, 16);
memcpy(mem_addr, &a, sizeof(a));
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_stream_si32(int32_t *mem_addr, int32_t a)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_stream_si32(mem_addr, a);
#else
*mem_addr = a;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_stream_si64(int64_t *mem_addr, int64_t a)
{
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
#if defined(SIMDE__REALLY_GCC) && !HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
*mem_addr = a;
#elif defined(__GNUC__)
_mm_stream_si64((long long *)mem_addr, a);
#else
_mm_stream_si64(mem_addr, a);
#endif
#else
*mem_addr = a;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_sub_epi8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_sub_epi8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i8, vsubq_s8(a.neon_i8, b.neon_i8));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
r.i8[i] = a.i8[i] - b.i8[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_sub_epi16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_sub_epi16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i16, vsubq_s16(a.neon_i16, b.neon_i16));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
r.i16[i] = a.i16[i] - b.i16[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_sub_epi32(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_sub_epi32(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i32, vsubq_s32(a.neon_i32, b.neon_i32));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = a.i32[i] - b.i32[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_sub_epi64(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_sub_epi64(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i64, vsubq_s64(a.neon_i64, b.neon_i64));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = a.i64[i] - b.i64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_sub_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_sub_pd(a.n, b.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
r.f64[i] = a.f64[i] - b.f64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_sub_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_sub_sd(a.n, b.n));
#else
simde__m128d r;
r.f64[0] = a.f64[0] - b.f64[0];
r.f64[1] = a.f64[1];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_sub_si64(simde__m64 a, simde__m64 b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M64_C(_mm_sub_si64(a.n, b.n));
#else
simde__m64 r;
r.i64[0] = a.i64[0] - b.i64[0];
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_subs_epi8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_subs_epi8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i8, vqsubq_s8(a.neon_i8, b.neon_i8));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) {
r.i8[i] = INT8_MIN;
} else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) {
r.i8[i] = INT8_MAX;
} else {
r.i8[i] = (a.i8[i]) - (b.i8[i]);
}
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_subs_epi16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_subs_epi16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i16, vqsubq_s16(a.neon_i16, b.neon_i16));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) {
r.i16[i] = INT16_MIN;
} else if ((b.i16[i]) < 0 &&
(a.i16[i]) > INT16_MAX + (b.i16[i])) {
r.i16[i] = INT16_MAX;
} else {
r.i16[i] = (a.i16[i]) - (b.i16[i]);
}
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_subs_epu8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_subs_epu8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(u8, vqsubq_u8(a.neon_u8, b.neon_u8));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
const int32_t x = a.u8[i] - b.u8[i];
if (x < 0) {
r.u8[i] = 0;
} else if (x > UINT8_MAX) {
r.u8[i] = UINT8_MAX;
} else {
r.u8[i] = (uint8_t)x;
}
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_subs_epu16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_subs_epu16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(u16, vqsubq_u16(a.neon_u16, b.neon_u16));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
const int32_t x = a.u16[i] - b.u16[i];
if (x < 0) {
r.u16[i] = 0;
} else if (x > UINT16_MAX) {
r.u16[i] = UINT16_MAX;
} else {
r.u16[i] = (uint16_t)x;
}
}
return r;
#endif
}
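/* The ucomi* comparisons are "quiet": feholdexcept() saves the
 * floating-point environment before the compare and fesetenv()
 * restores it afterwards, so no FP exception state leaks out of the
 * fallback.  (fenv_t, feholdexcept() and fesetenv() come from
 * <fenv.h>.) */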
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_ucomieq_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_ucomieq_sd(a.n, b.n);
#else
fenv_t envp;
int x = feholdexcept(&envp);
int r = a.f64[0] == b.f64[0];
if (HEDLEY_LIKELY(x == 0))
fesetenv(&envp);
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_ucomige_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_ucomige_sd(a.n, b.n);
#else
fenv_t envp;
int x = feholdexcept(&envp);
int r = a.f64[0] >= b.f64[0];
if (HEDLEY_LIKELY(x == 0))
fesetenv(&envp);
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_ucomigt_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_ucomigt_sd(a.n, b.n);
#else
fenv_t envp;
int x = feholdexcept(&envp);
int r = a.f64[0] > b.f64[0];
if (HEDLEY_LIKELY(x == 0))
fesetenv(&envp);
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_ucomile_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_ucomile_sd(a.n, b.n);
#else
fenv_t envp;
int x = feholdexcept(&envp);
int r = a.f64[0] <= b.f64[0];
if (HEDLEY_LIKELY(x == 0))
fesetenv(&envp);
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_ucomilt_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_ucomilt_sd(a.n, b.n);
#else
fenv_t envp;
int x = feholdexcept(&envp);
int r = a.f64[0] < b.f64[0];
if (HEDLEY_LIKELY(x == 0))
fesetenv(&envp);
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
int simde_mm_ucomineq_sd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return _mm_ucomineq_sd(a.n, b.n);
#else
fenv_t envp;
int x = feholdexcept(&envp);
int r = a.f64[0] != b.f64[0];
if (HEDLEY_LIKELY(x == 0))
fesetenv(&envp);
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_undefined_pd(void)
{
simde__m128d r;
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
r.n = _mm_undefined_pd();
#else
r = simde_mm_setzero_pd();
#endif
return r;
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_undefined_si128(void)
{
simde__m128i r;
#if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
r.n = _mm_undefined_si128();
#else
r = simde_mm_setzero_si128();
#endif
return r;
}
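/* Fences: without native SSE2 both lfence and mfence fall back to
 * simde_mm_sfence(), the strongest barrier SIMDe's SSE layer
 * provides. */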
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_lfence(void)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_lfence();
#else
simde_mm_sfence();
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
void simde_mm_mfence(void)
{
#if defined(SIMDE_SSE2_NATIVE)
_mm_mfence();
#else
simde_mm_sfence();
#endif
}
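/* Unpack/interleave: unpackhi interleaves the upper halves of a and
 * b, unpacklo the lower halves.  For example, unpacklo_epi32 on
 * {a0,a1,a2,a3} and {b0,b1,b2,b3} produces {a0,b0,a1,b1}. */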
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_unpackhi_epi8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_unpackhi_epi8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a.neon_i16));
int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b.neon_i16));
int8x8x2_t result = vzip_s8(a1, b1);
return SIMDE__M128I_NEON_C(i8,
vcombine_s8(result.val[0], result.val[1]));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
r.i8[(i * 2)] = a.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
r.i8[(i * 2) + 1] =
b.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_unpackhi_epi16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_unpackhi_epi16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
int16x4_t a1 = vget_high_s16(a.neon_i16);
int16x4_t b1 = vget_high_s16(b.neon_i16);
int16x4x2_t result = vzip_s16(a1, b1);
return SIMDE__M128I_NEON_C(i16,
vcombine_s16(result.val[0], result.val[1]));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
r.i16[(i * 2)] =
a.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
r.i16[(i * 2) + 1] =
b.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_unpackhi_epi32(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_unpackhi_epi32(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
int32x2_t a1 = vget_high_s32(a.neon_i32);
int32x2_t b1 = vget_high_s32(b.neon_i32);
int32x2x2_t result = vzip_s32(a1, b1);
return SIMDE__M128I_NEON_C(i32,
vcombine_s32(result.val[0], result.val[1]));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
r.i32[(i * 2)] =
a.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
r.i32[(i * 2) + 1] =
b.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_unpackhi_epi64(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_unpackhi_epi64(a.n, b.n));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
r.i64[(i * 2)] =
a.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
r.i64[(i * 2) + 1] =
b.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_unpackhi_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_unpackhi_pd(a.n, b.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
r.f64[(i * 2)] =
a.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
r.f64[(i * 2) + 1] =
b.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_unpacklo_epi8(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_unpacklo_epi8(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a.neon_i16));
int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b.neon_i16));
int8x8x2_t result = vzip_s8(a1, b1);
return SIMDE__M128I_NEON_C(i8,
vcombine_s8(result.val[0], result.val[1]));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
r.i8[(i * 2)] = a.i8[i];
r.i8[(i * 2) + 1] = b.i8[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_unpacklo_epi16(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_unpacklo_epi16(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
int16x4_t a1 = vget_low_s16(a.neon_i16);
int16x4_t b1 = vget_low_s16(b.neon_i16);
int16x4x2_t result = vzip_s16(a1, b1);
return SIMDE__M128I_NEON_C(i16,
vcombine_s16(result.val[0], result.val[1]));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
r.i16[(i * 2)] = a.i16[i];
r.i16[(i * 2) + 1] = b.i16[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_unpacklo_epi32(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_unpacklo_epi32(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
int32x2_t a1 = vget_low_s32(a.neon_i32);
int32x2_t b1 = vget_low_s32(b.neon_i32);
int32x2x2_t result = vzip_s32(a1, b1);
return SIMDE__M128I_NEON_C(i32,
vcombine_s32(result.val[0], result.val[1]));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
r.i32[(i * 2)] = a.i32[i];
r.i32[(i * 2) + 1] = b.i32[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_unpacklo_epi64(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_unpacklo_epi64(a.n, b.n));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
r.i64[(i * 2)] = a.i64[i];
r.i64[(i * 2) + 1] = b.i64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_unpacklo_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_unpacklo_pd(a.n, b.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
r.f64[(i * 2)] = a.f64[i];
r.f64[(i * 2) + 1] = b.f64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128d simde_mm_xor_pd(simde__m128d a, simde__m128d b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128D_C(_mm_xor_pd(a.n, b.n));
#else
simde__m128d r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
r.i64[i] = a.i64[i] ^ b.i64[i];
}
return r;
#endif
}
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_mm_xor_si128(simde__m128i a, simde__m128i b)
{
#if defined(SIMDE_SSE2_NATIVE)
return SIMDE__M128I_C(_mm_xor_si128(a.n, b.n));
#elif defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i32, veorq_s32(a.neon_i32, b.neon_i32));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = a.i32[i] ^ b.i32[i];
}
return r;
#endif
}
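/* SIMDe-specific helper (no Intel intrinsic equivalent): bitwise NOT
 * of all 128 bits. */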
SIMDE__FUNCTION_ATTRIBUTES
simde__m128i simde_x_mm_not_si128(simde__m128i a)
{
#if defined(SIMDE_SSE2_NEON)
return SIMDE__M128I_NEON_C(i32, vmvnq_s32(a.neon_i32));
#else
simde__m128i r;
SIMDE__VECTORIZE
for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
r.i32[i] = ~(a.i32[i]);
}
return r;
#endif
}
SIMDE__END_DECLS
#endif /* !defined(SIMDE__SSE2_H) */