111 #include <smmintrin.h>
112 #include <nmmintrin.h>
117 # error "SSE 4.2 must be enabled in the C++ compiler to use this header."
118 #endif // !__SSE4_2__
129 template <
int Lanes,
class T>
137 struct svec<4,int8_t>;
139 struct svec<4,uint8_t>;
141 struct svec<4,int16_t>;
143 struct svec<4,uint16_t>;
145 struct svec<4,int32_t>;
147 struct svec<4,uint32_t>;
149 struct svec<4,int64_t>;
151 struct svec<4,uint64_t>;
153 struct svec<4,float>;
155 struct svec<4,double>;
157 struct svec<4,void*>;
210 v = _mm_castsi128_ps(_mm_set_epi32(d ? -1 : 0, c ? -1 : 0,
211 b ? -1 : 0, a ? -1 : 0));
220 v = (a != 0) ? _mm_castsi128_ps(_mm_set1_epi32(-1)) : _mm_setzero_ps();
252 v = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
253 0, 0, 0, 0, d, c, b, a);
260 if(__builtin_constant_p(a) && a == 0) {
261 v = _mm_setzero_si128 ();
263 v = _mm_set1_epi8(a);
299 v = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
300 0, 0, 0, 0, d, c, b, a);
308 if(__builtin_constant_p(a) && a == 0) {
309 v = _mm_setzero_si128 ();
311 v = _mm_set1_epi8(a);
347 v = _mm_set_epi16(0, 0, 0, 0, d, c, b, a);
355 if(__builtin_constant_p(a) && a == 0) {
356 v = _mm_setzero_si128 ();
358 v = _mm_set1_epi16(a);
395 v = _mm_set_epi16(0, 0, 0, 0, d, c, b, a);
403 if(__builtin_constant_p(a) && a == 0) {
404 v = _mm_setzero_si128 ();
406 v = _mm_set1_epi16(a);
443 v = _mm_set_epi32(d, c, b, a);
451 if(__builtin_constant_p(a) && a == 0) {
452 v = _mm_setzero_si128 ();
454 v = _mm_set1_epi32(a);
461 FORCEINLINE operator __m128()
const {
return _mm_castsi128_ps(v); }
496 v = _mm_set_epi32(d, c, b, a);
504 if(__builtin_constant_p(a) && a == 0) {
505 v = _mm_setzero_si128 ();
507 v = _mm_set1_epi32(a);
514 FORCEINLINE operator __m128()
const {
return _mm_castsi128_ps(v); }
550 v[0] = _mm_set_epi32((b >> 32) & 0xffffffff, b & 0xffffffff,
551 (a >> 32) & 0xffffffff, a & 0xffffffff);
552 v[1] = _mm_set_epi32((d >> 32) & 0xffffffff, d & 0xffffffff,
553 (c >> 32) & 0xffffffff, c & 0xffffffff);
561 if(__builtin_constant_p(a) && a == 0) {
562 v[0] = v[1] = _mm_setzero_si128 ();
564 int a1 = (a >> 32) & 0xffffffff;
565 int a0 = a & 0xffffffff;
566 v[0] = v[1] = _mm_set_epi32(a1, a0, a1, a0);
604 v[0] = _mm_set_epi32((b >> 32) & 0xffffffff, b & 0xffffffff,
605 (a >> 32) & 0xffffffff, a & 0xffffffff);
606 v[1] = _mm_set_epi32((d >> 32) & 0xffffffff, d & 0xffffffff,
607 (c >> 32) & 0xffffffff, c & 0xffffffff);
615 if(__builtin_constant_p(a) && a == 0) {
616 v[0] = v[1] = _mm_setzero_si128 ();
618 int a1 = (a >> 32) & 0xffffffff;
619 int a0 = a & 0xffffffff;
620 v[0] = v[1] = _mm_set_epi32(a1, a0, a1, a0);
656 v = _mm_set_ps(d, c, b, a);
664 if(__builtin_constant_p(a) && a == 0) {
665 v = _mm_setzero_ps();
705 v[0] = _mm_set_pd(b, a);
706 v[1] = _mm_set_pd(d, c);
714 if (__builtin_constant_p(a) && a == 0) {
715 v[0] = v[1] = _mm_setzero_pd();
717 v[0] = v[1] = _mm_set1_pd(a);
746 #define INSERT_EXTRACT_SSE(STYPE) \
747 static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
748 return ((STYPE*)&v)[index]; \
750 static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
751 ((STYPE*)v)[index] = val; \
754 #define INSERT_EXTRACT_SSEOPT(STYPE, FUNC) \
755 static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
756 if(__builtin_constant_p(index) && index >=0 && index < 4) { \
757 return (STYPE)_mm_extract_##FUNC(v.v, index); \
759 return ((STYPE*)&v)[index]; \
762 static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
763 if(__builtin_constant_p(index) && index >=0 && index < 4) { \
764 v->v = _mm_insert_##FUNC(v->v, val, index); \
766 ((STYPE*)v)[index] = val; \
770 #define INSERT_EXTRACT_SSEOPT64(STYPE, FUNC) \
771 static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
772 if(__builtin_constant_p(index) && index >=0 && index < 4) { \
773 return (STYPE)_mm_extract_##FUNC(v.v[index>>1], index%2); \
775 return ((STYPE*)&v)[index]; \
778 static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
779 if(__builtin_constant_p(index) && index >=0 && index < 4) { \
780 v->v[index>>1] = _mm_insert_##FUNC(v->v[index>>1], val, index%2); \
782 ((STYPE*)v)[index] = val; \
787 static FORCEINLINE uint32_t svec_extract(svec<4,bool> v,
int index) {
788 if(__builtin_constant_p(index) && index >=0 && index < 4) {
789 return _mm_extract_epi32(_mm_castps_si128(v.v), index);
792 return ((uint32_t*)&v)[index];
796 static FORCEINLINE void svec_insert(svec<4,bool> *v,
int index, uint32_t val) {
797 if(__builtin_constant_p(index) && index >=0 && index < 4) {
798 v->v = _mm_castsi128_ps(_mm_insert_epi32(_mm_castps_si128(v->v), val ? -1 : 0, index));
800 ((uint32_t *)v)[index] = val ? -1 : 0;
827 static FORCEINLINE svec<4,bool> svec_load(
const svec<4,bool> *p) {
828 return svec<4,bool>(_mm_loadu_ps((
float *)(&p->v)));
838 static FORCEINLINE void svec_store(svec<4,bool> *p, svec<4,bool> v) {
839 _mm_storeu_ps((
float *)(&p->v), v.v);
847 static FORCEINLINE svec<4,int8_t> svec_load(
const svec<4,int8_t> *p) {
848 int8_t *ptr = (int8_t *)(&p->v);
849 return svec<4,int8_t>(ptr[0], ptr[1], ptr[2], ptr[3]);
858 static FORCEINLINE void svec_store(svec<4,int8_t> *p, svec<4,int8_t> v) {
859 int8_t *ptr = (int8_t *)(&p->v);
860 ptr[0] = _mm_extract_epi8(v.v, 0);
861 ptr[1] = _mm_extract_epi8(v.v, 1);
862 ptr[2] = _mm_extract_epi8(v.v, 2);
863 ptr[3] = _mm_extract_epi8(v.v, 3);
871 static FORCEINLINE svec<4,uint8_t> svec_load(
const svec<4,uint8_t> *p) {
872 uint8_t *ptr = (uint8_t *)(&p->v);
873 return svec<4,uint8_t>(ptr[0], ptr[1], ptr[2], ptr[3]);
882 static FORCEINLINE void svec_store(svec<4,uint8_t> *p, svec<4,uint8_t> v) {
883 uint8_t *ptr = (uint8_t *)(&p->v);
884 ptr[0] = _mm_extract_epi8(v.v, 0);
885 ptr[1] = _mm_extract_epi8(v.v, 1);
886 ptr[2] = _mm_extract_epi8(v.v, 2);
887 ptr[3] = _mm_extract_epi8(v.v, 3);
895 static FORCEINLINE svec<4,int16_t> svec_load(
const svec<4,int16_t> *p) {
896 int16_t *ptr = (int16_t *)(&p->v);
897 return svec<4,int16_t>(ptr[0], ptr[1], ptr[2], ptr[3]);
906 static FORCEINLINE void svec_store(svec<4,int16_t> *p, svec<4,int16_t> v) {
907 int16_t *ptr = (int16_t *)(&p->v);
908 ptr[0] = _mm_extract_epi16(v.v, 0);
909 ptr[1] = _mm_extract_epi16(v.v, 1);
910 ptr[2] = _mm_extract_epi16(v.v, 2);
911 ptr[3] = _mm_extract_epi16(v.v, 3);
919 static FORCEINLINE svec<4,uint16_t> svec_load(
const svec<4,uint16_t> *p) {
920 uint16_t *ptr = (uint16_t *)(&p->v);
921 return svec<4,uint16_t>(ptr[0], ptr[1], ptr[2], ptr[3]);
930 static FORCEINLINE void svec_store(svec<4,uint16_t> *p, svec<4,uint16_t> v) {
931 uint16_t *ptr = (uint16_t *)(&p->v);
932 ptr[0] = _mm_extract_epi16(v.v, 0);
933 ptr[1] = _mm_extract_epi16(v.v, 1);
934 ptr[2] = _mm_extract_epi16(v.v, 2);
935 ptr[3] = _mm_extract_epi16(v.v, 3);
943 static FORCEINLINE svec<4,int32_t> svec_load(
const svec<4,int32_t> *p) {
944 return svec<4,int32_t>(_mm_loadu_si128((__m128i *)(&p->v)));
953 static FORCEINLINE void svec_store(svec<4,int32_t> *p, svec<4,int32_t> v) {
954 _mm_storeu_si128((__m128i *)(&p->v), v.v);
963 static FORCEINLINE svec<4,uint32_t> svec_load(
const svec<4,uint32_t> *p) {
964 return svec<4,uint32_t>(_mm_loadu_si128((__m128i *)(&p->v)));
973 static FORCEINLINE void svec_store(svec<4,uint32_t> *p, svec<4,uint32_t> v) {
974 _mm_storeu_si128((__m128i *)(&p->v), v.v);
982 static FORCEINLINE svec<4,int64_t> svec_load(
const svec<4,int64_t> *p) {
983 return svec<4,int64_t>(_mm_loadu_si128((__m128i *)(&p->v[0])),
984 _mm_loadu_si128((__m128i *)(&p->v[1])));
993 static FORCEINLINE void svec_store(svec<4,int64_t> *p, svec<4,int64_t> v) {
994 _mm_storeu_si128((__m128i *)(&p->v[0]), v.v[0]);
995 _mm_storeu_si128((__m128i *)(&p->v[1]), v.v[1]);
1004 static FORCEINLINE svec<4,uint64_t> svec_load(
const svec<4,uint64_t> *p) {
1005 return svec<4,uint64_t>(_mm_loadu_si128((__m128i *)(&p->v[0])),
1006 _mm_loadu_si128((__m128i *)(&p->v[1])));
1014 static FORCEINLINE void svec_store(svec<4,uint64_t> *p, svec<4,uint64_t> v) {
1015 _mm_storeu_si128((__m128i *)(&p->v[0]), v.v[0]);
1016 _mm_storeu_si128((__m128i *)(&p->v[1]), v.v[1]);
1024 static FORCEINLINE svec<4,float> svec_load(
const svec<4,float> *p) {
1025 return svec<4,float>(_mm_loadu_ps((
float *)(&p->v)));
1034 static FORCEINLINE void svec_store(svec<4,float> *p, svec<4,float> v) {
1035 _mm_storeu_ps((
float *)(&p->v), v.v);
1044 static FORCEINLINE svec<4,double> svec_load(
const svec<4,double> *p) {
1045 return svec<4,double>(_mm_loadu_pd((
double *)(&p->v[0])),
1046 _mm_loadu_pd((
double *)(&p->v[1])));
1055 static FORCEINLINE void svec_store(svec<4,double> *p, svec<4,double> v) {
1056 _mm_storeu_pd((
double *)(&p->v[0]), v.v[0]);
1057 _mm_storeu_pd((
double *)(&p->v[1]), v.v[1]);
1071 return _mm_blendv_ps(b.
v, a.
v, mask.
v);
1080 return svec<4,int8_t>((_mm_extract_ps(mask.
v, 0) != 0) ? _mm_extract_epi8(a.
v, 0) :
1081 _mm_extract_epi8(b.
v, 0),
1082 (_mm_extract_ps(mask.
v, 1) != 0) ? _mm_extract_epi8(a.
v, 1) :
1083 _mm_extract_epi8(b.
v, 1),
1084 (_mm_extract_ps(mask.
v, 2) != 0) ? _mm_extract_epi8(a.
v, 2) :
1085 _mm_extract_epi8(b.
v, 2),
1086 (_mm_extract_ps(mask.
v, 3) != 0) ? _mm_extract_epi8(a.
v, 3) :
1087 _mm_extract_epi8(b.
v, 3));
1096 return svec<4,uint8_t>((_mm_extract_ps(mask.
v, 0) != 0) ? _mm_extract_epi8(a.
v, 0) :
1097 _mm_extract_epi8(b.
v, 0),
1098 (_mm_extract_ps(mask.
v, 1) != 0) ? _mm_extract_epi8(a.
v, 1) :
1099 _mm_extract_epi8(b.
v, 1),
1100 (_mm_extract_ps(mask.
v, 2) != 0) ? _mm_extract_epi8(a.
v, 2) :
1101 _mm_extract_epi8(b.
v, 2),
1102 (_mm_extract_ps(mask.
v, 3) != 0) ? _mm_extract_epi8(a.
v, 3) :
1103 _mm_extract_epi8(b.
v, 3));
1112 return svec<4,int16_t>((_mm_extract_ps(mask.
v, 0) != 0) ? _mm_extract_epi16(a.
v, 0) :
1113 _mm_extract_epi16(b.
v, 0),
1114 (_mm_extract_ps(mask.
v, 1) != 0) ? _mm_extract_epi16(a.
v, 1) :
1115 _mm_extract_epi16(b.
v, 1),
1116 (_mm_extract_ps(mask.
v, 2) != 0) ? _mm_extract_epi16(a.
v, 2) :
1117 _mm_extract_epi16(b.
v, 2),
1118 (_mm_extract_ps(mask.
v, 3) != 0) ? _mm_extract_epi16(a.
v, 3) :
1119 _mm_extract_epi16(b.
v, 3));
1128 return svec<4,uint16_t>((_mm_extract_ps(mask.
v, 0) != 0) ? _mm_extract_epi16(a.
v, 0) :
1129 _mm_extract_epi16(b.
v, 0),
1130 (_mm_extract_ps(mask.
v, 1) != 0) ? _mm_extract_epi16(a.
v, 1) :
1131 _mm_extract_epi16(b.
v, 1),
1132 (_mm_extract_ps(mask.
v, 2) != 0) ? _mm_extract_epi16(a.
v, 2) :
1133 _mm_extract_epi16(b.
v, 2),
1134 (_mm_extract_ps(mask.
v, 3) != 0) ? _mm_extract_epi16(a.
v, 3) :
1135 _mm_extract_epi16(b.
v, 3));
1143 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b.
v),
1144 _mm_castsi128_ps(a.
v), mask.
v));
1152 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b.
v),
1153 _mm_castsi128_ps(a.
v), mask.
v));
1161 __m128 m0 = _mm_shuffle_ps(mask.
v, mask.
v, _MM_SHUFFLE(1, 1, 0, 0));
1162 __m128 m1 = _mm_shuffle_ps(mask.
v, mask.
v, _MM_SHUFFLE(3, 3, 2, 2));
1163 __m128d m0d = _mm_castps_pd(m0);
1164 __m128d m1d = _mm_castps_pd(m1);
1165 __m128d r0 = _mm_blendv_pd(_mm_castsi128_pd(b.
v[0]), _mm_castsi128_pd(a.
v[0]), m0d);
1166 __m128d r1 = _mm_blendv_pd(_mm_castsi128_pd(b.
v[1]), _mm_castsi128_pd(a.
v[1]), m1d);
1175 __m128 m0 = _mm_shuffle_ps(mask.
v, mask.
v, _MM_SHUFFLE(1, 1, 0, 0));
1176 __m128 m1 = _mm_shuffle_ps(mask.
v, mask.
v, _MM_SHUFFLE(3, 3, 2, 2));
1177 __m128d m0d = _mm_castps_pd(m0);
1178 __m128d m1d = _mm_castps_pd(m1);
1179 __m128d r0 = _mm_blendv_pd(_mm_castsi128_pd(b.
v[0]), _mm_castsi128_pd(a.
v[0]), m0d);
1180 __m128d r1 = _mm_blendv_pd(_mm_castsi128_pd(b.
v[1]), _mm_castsi128_pd(a.
v[1]), m1d);
1189 return _mm_blendv_ps(b.
v, a.
v, mask.
v);
1197 __m128 m0 = _mm_shuffle_ps(mask.
v, mask.
v, _MM_SHUFFLE(1, 1, 0, 0));
1198 __m128 m1 = _mm_shuffle_ps(mask.
v, mask.
v, _MM_SHUFFLE(3, 3, 2, 2));
1199 __m128d m0d = _mm_castps_pd(m0);
1200 __m128d m1d = _mm_castps_pd(m1);
1201 __m128d r0 = _mm_blendv_pd(b.
v[0], a.
v[0], m0d);
1202 __m128d r1 = _mm_blendv_pd(b.
v[1], a.
v[1], m1d);
1220 return _mm_set1_epi8(v[index]);
1222 static FORCEINLINE svec<4,uint8_t> svec_broadcast(svec<4,uint8_t> v,
int index) {
1223 return _mm_set1_epi8(v[index]);
1225 static FORCEINLINE svec<4,int16_t> svec_broadcast(svec<4,int16_t> v,
int index) {
1226 return _mm_set1_epi16(v[index]);
1228 static FORCEINLINE svec<4,uint16_t> svec_broadcast(svec<4,uint16_t> v,
int index) {
1229 return _mm_set1_epi16(v[index]);
1231 static FORCEINLINE svec<4,int32_t> svec_broadcast(svec<4,int32_t> v,
int index) {
1232 return _mm_set1_epi32(v[index]);
1234 static FORCEINLINE svec<4,uint32_t> svec_broadcast(svec<4,uint32_t> v,
int index) {
1235 return _mm_set1_epi32(v[index]);
1238 static FORCEINLINE svec<4,int64_t> svec_broadcast(svec<4,int64_t> v,
int index) {
1239 int64_t val = v[index];
1240 return svec<4,int64_t>(val);
1242 static FORCEINLINE svec<4,uint64_t> svec_broadcast(svec<4,uint64_t> v,
int index) {
1243 uint64_t val = v[index];
1244 return svec<4,uint64_t>(val);
1247 static FORCEINLINE svec<4,float> svec_broadcast(svec<4,float> v,
int index) {
1248 return _mm_set1_ps(v[index]);
1250 static FORCEINLINE svec<4,double> svec_broadcast(svec<4,double> v,
int index) {
1251 return svec<4,double>(_mm_set1_pd(v[index]),
1252 _mm_set1_pd(v[index]));
1280 #define LOAD_CONST_SSE(STYPE) \
1281 template <class RetVecType> static RetVecType svec_load_const(const STYPE* p); \
1283 FORCEINLINE svec<LANES,STYPE> svec_load_const<svec<LANES,STYPE> >(const STYPE* p) { \
1284 return svec<LANES,STYPE>(*p); \
1286 template <class RetVecType> static RetVecType svec_load_and_splat(STYPE* p); \
1288 FORCEINLINE svec<LANES,STYPE> svec_load_and_splat<svec<LANES,STYPE> >(STYPE* p) { \
1289 return svec<LANES,STYPE>(*p);\
1317 #if defined(__x86_64__) || defined(__PPC64__)
1325 svec<4,uint64_t>((uint64_t)(p0),(uint64_t)(p1),(uint64_t)(p2),(uint64_t)(p3)){}
1335 svec<4,uint32_t>((uint32_t)(p0),(uint32_t)(p1),(uint32_t)(p2),(uint32_t)(p3)){}
1339 #ifndef DOXYGEN_SHOULD_SKIP_THIS //not want generate svec_gather*/svec_scatter methods
1341 template <
class RetVecType>
static RetVecType svec_gather(svec<4,uint32_t> ptrs, svec<4,bool> mask);
1349 FORCEINLINE svec<4,int32_t> svec_gather<svec<4,int32_t> >(svec<4,uint32_t> ptrs, svec<4,bool> mask) {
1350 svec<4,int32_t> ret;
1351 if(svec_extract(mask,0)) { svec_insert(&ret, 0, *((int32_t*)svec_extract(ptrs,0)));}
1352 if(svec_extract(mask,1)) { svec_insert(&ret, 1, *((int32_t*)svec_extract(ptrs,1)));}
1353 if(svec_extract(mask,2)) { svec_insert(&ret, 2, *((int32_t*)svec_extract(ptrs,2)));}
1354 if(svec_extract(mask,3)) { svec_insert(&ret, 3, *((int32_t*)svec_extract(ptrs,3)));}
1360 FORCEINLINE svec<4,uint32_t> svec_gather<svec<4,uint32_t> >(svec<4,uint32_t> ptrs, svec<4,bool> mask) {
1361 return svec<4,uint32_t>(svec_gather<svec<4,int32_t> >(ptrs, mask).v);
1368 FORCEINLINE svec<4,float> svec_gather<svec<4,float> >(svec<4,uint32_t> ptrs, svec<4,bool> mask) {
1369 return svec<4,float>(_mm_castsi128_ps(svec_gather<svec<4,int32_t> >(ptrs, mask).v));
1373 template <
class RetVecType>
static RetVecType svec_gather(svec<4,uint64_t> ptrs, svec<4,bool> mask);
1380 FORCEINLINE svec<4,int32_t> svec_gather<svec<4,int32_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
1381 svec<4,int32_t> ret;
1382 if(svec_extract(mask,0)) { svec_insert(&ret, 0, *((int32_t*)svec_extract(ptrs,0)));}
1383 if(svec_extract(mask,1)) { svec_insert(&ret, 1, *((int32_t*)svec_extract(ptrs,1)));}
1384 if(svec_extract(mask,2)) { svec_insert(&ret, 2, *((int32_t*)svec_extract(ptrs,2)));}
1385 if(svec_extract(mask,3)) { svec_insert(&ret, 3, *((int32_t*)svec_extract(ptrs,3)));}
1391 FORCEINLINE svec<4,uint32_t> svec_gather<svec<4,uint32_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
1392 return svec<4,uint32_t>(svec_gather<svec<4,int32_t> >(ptrs, mask).v);
1398 FORCEINLINE svec<4,float> svec_gather<svec<4,float> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
1399 return svec<4,float>(_mm_castsi128_ps(svec_gather<svec<4,int32_t> >(ptrs, mask).v));
1515 #endif //DOXYGEN_SHOULD_SKIP_THIS
// --- 4-lane bool mask reductions -------------------------------------------
// _mm_movemask_ps packs the sign (MSB) of each 32-bit lane into the low 4
// bits of an int; a lane counts as "true" iff its MSB is set.

// Any lane true: the 4-bit movemask is nonzero.
1546 static FORCEINLINE bool svec_any_true(
const svec<4,bool>& mask) {
1547 return (_mm_movemask_ps(mask.v)!=0);
// All four lanes true: movemask equals 0b1111.
1555 static FORCEINLINE bool svec_all_true(
const svec<4,bool>& mask) {
1556 return (_mm_movemask_ps(mask.v)==0xF);
// No lane true: movemask is zero.
1565 static FORCEINLINE bool svec_none_true(
const svec<4,bool>& mask) {
1566 return (_mm_movemask_ps(mask.v)==0);
// --- Bitwise logical operations on 4-lane bool masks -----------------------

// Lane-wise AND of two masks.
1574 static FORCEINLINE svec<4,bool> svec_and(svec<4,bool> a, svec<4,bool> b) {
1575 return _mm_and_ps(a.v, b.v);
// Lane-wise OR of two masks.
1582 static FORCEINLINE svec<4,bool> svec_or(svec<4,bool> a, svec<4,bool> b) {
1583 return _mm_or_ps(a.v, b.v);
// Lane-wise XOR of two masks.
1589 static FORCEINLINE svec<4,bool> svec_xor(svec<4,bool> a, svec<4,bool> b) {
1590 return _mm_xor_ps(a.v, b.v);
// Lane-wise NOT: XOR against an all-ones (per-lane -1) constant.
1596 static FORCEINLINE svec<4,bool> svec_not(svec<4,bool> a) {
1597 __m128 allon = _mm_castsi128_ps(_mm_set1_epi32(-1));
1598 return _mm_xor_ps(a.v, allon);
// Pack the 4 lane sign bits of the mask into the low 4 bits of a uint64_t.
1607 static FORCEINLINE uint64_t svec_movmsk(svec<4,bool> mask) {
1608 return (uint64_t)_mm_movemask_ps(mask.v);
1619 #define UNARY_OP_OPT(STYPE, NAME, OP)\
1620 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a) { \
1627 #define UNARY_OP_OPT64(STYPE, NAME, OP)\
1628 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a) { \
1629 return svec<LANES,STYPE>(OP(a.v[0]), OP(a.v[1])); \
// --- Integer negation, computed as (0 - a) via _mm_sub_epi*. For the
// unsigned variants this is negation modulo 2^width (two's-complement wrap).
1633 static FORCEINLINE svec<4,int8_t> svec_neg(svec<4,int8_t> a) {
1634 return _mm_sub_epi8(_mm_setzero_si128(), (a.v));
1636 static FORCEINLINE svec<4,uint8_t> svec_neg(svec<4,uint8_t> a) {
1637 return _mm_sub_epi8(_mm_setzero_si128(), (a.v));
1639 static FORCEINLINE svec<4,int16_t> svec_neg(svec<4,int16_t> a) {
1640 return _mm_sub_epi16(_mm_setzero_si128(), (a.v));
1642 static FORCEINLINE svec<4,uint16_t> svec_neg(svec<4,uint16_t> a) {
1643 return _mm_sub_epi16(_mm_setzero_si128(), (a.v));
1645 static FORCEINLINE svec<4,int32_t> svec_neg(svec<4,int32_t> a) {
1646 return _mm_sub_epi32(_mm_setzero_si128(), (a.v));
1648 static FORCEINLINE svec<4,uint32_t> svec_neg(svec<4,uint32_t> a) {
1649 return _mm_sub_epi32(_mm_setzero_si128(), (a.v));
// Round each lane to the nearest integer using the SSE4.1 round intrinsic,
// suppressing precision exceptions (_MM_FROUND_NO_EXC).
1659 static FORCEINLINE svec<4,float> svec_round(svec<4,float> a) {
1660 return _mm_round_ps(a.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
// Double version: the 4 lanes live in two __m128d halves, rounded separately.
1662 static FORCEINLINE svec<4,double> svec_round(svec<4,double> a) {
1663 return svec<4,double>(
1664 _mm_round_pd(a.v[0], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
1665 _mm_round_pd(a.v[1], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
// Approximate reciprocal: start from the low-precision _mm_rcp_ps estimate,
// then apply one Newton-Raphson refinement step r' = r * (2 - v*r).
1674 static FORCEINLINE svec<4,float> svec_rcp(svec<4,float> v) {
1675 __m128 rcp = _mm_rcp_ps(v.v);
1677 __m128 m = _mm_mul_ps(v.v, rcp);
1678 __m128 twominus = _mm_sub_ps(_mm_set1_ps(2.f), m);
1679 __m128 r = _mm_mul_ps(rcp, twominus);
// Approximate reciprocal square root: refine the _mm_rsqrt_ps estimate with
// one Newton-Raphson step r' = 0.5 * r * (3 - v*r*r).
1684 static FORCEINLINE svec<4,float> svec_rsqrt(svec<4,float> v) {
1685 __m128 rsqrt = _mm_rsqrt_ps(v.v);
1688 __m128 v_rsqrt = _mm_mul_ps(rsqrt, v.v);
1689 __m128 v_r_r = _mm_mul_ps(v_rsqrt, rsqrt);
1690 __m128 three_sub = _mm_sub_ps(_mm_set1_ps(3.f), v_r_r);
1691 __m128 rs_mul = _mm_mul_ps(rsqrt, three_sub);
1692 __m128 half_scale = _mm_mul_ps(_mm_set1_ps(0.5), rs_mul);
// --- Absolute value. Unsigned lane types are already non-negative, so the
// unsigned overloads are the identity.
1707 static FORCEINLINE svec<4,uint8_t> svec_abs(svec<4,uint8_t> v) {
return v;}
1709 static FORCEINLINE svec<4,uint16_t> svec_abs(svec<4,uint16_t> v) {
return v;}
1711 static FORCEINLINE svec<4,uint32_t> svec_abs(svec<4,uint32_t> v) {
return v;}
1713 static FORCEINLINE svec<4,uint64_t> svec_abs(svec<4,uint64_t> v) {
return v;}
// Float |x|: clear each lane's sign bit by AND-ing with 0x7fffffff.
// NOTE(review): the reference pun below reads an unsigned int through a float
// lvalue, which violates strict aliasing; memcpy, or building the mask with
// _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), would be the well-defined
// spelling. Left byte-identical here — confirm before changing.
1715 static FORCEINLINE svec<4,float> svec_abs(svec<4,float> v) {
1716 unsigned int x = 0x7fffffff;
1717 float &f = * (
float *)( &x );
1718 __m128 tmp = _mm_set1_ps(f);
1719 return _mm_and_ps(v.v, tmp);
1725 #define BINARY_OP_OPT_FUNC(STYPE, STYPE2, NAME, FUNC) \
1726 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE2> b) { \
1727 return svec<LANES,STYPE>(FUNC(a.v, b.v)); \
1730 #define BINARY_OP_OPT_FUNC64(STYPE, STYPE2, NAME, FUNC) \
1731 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
1732 return svec<LANES,STYPE>(FUNC(a.v[0], b.v[0]), FUNC(a.v[1], b.v[1])); \
1785 #define BIN_VEC_SCAL(STYPE) \
1786 static FORCEINLINE svec<LANES,STYPE> svec_add_scalar(svec<LANES,STYPE> a, STYPE s) { \
1787 return svec_add(a, svec<LANES,STYPE>(s)); \
1789 static FORCEINLINE svec<LANES,STYPE> svec_scalar_add(STYPE s, svec<LANES,STYPE> a) { \
1790 return svec_add(svec<LANES,STYPE>(s), a); \
1792 static FORCEINLINE svec<LANES,STYPE> svec_sub_scalar(svec<LANES,STYPE> a, STYPE s) { \
1793 return svec_sub(a, svec<LANES,STYPE>(s)); \
1795 static FORCEINLINE svec<LANES,STYPE> svec_scalar_sub(STYPE s, svec<LANES,STYPE> a) { \
1796 return svec_sub(svec<LANES,STYPE>(s), a); \
1798 static FORCEINLINE svec<LANES,STYPE> svec_mul_scalar(svec<LANES,STYPE> a, STYPE s) { \
1799 return svec_mul(a, svec<LANES,STYPE>(s)); \
1801 static FORCEINLINE svec<LANES,STYPE> svec_scalar_mul(STYPE s, svec<LANES,STYPE> a) { \
1802 return svec_mul(svec<LANES,STYPE>(s), a); \
1804 static FORCEINLINE svec<LANES,STYPE> svec_div_scalar(svec<LANES,STYPE> a, STYPE s) { \
1805 return svec_div(a, svec<LANES,STYPE>(s)); \
1807 static FORCEINLINE svec<LANES,STYPE> svec_scalar_div(STYPE s, svec<LANES,STYPE> a) { \
1808 return svec_div(svec<LANES,STYPE>(s), a); \
1823 #define INT_BINARY_OP_METHODS(STYPE) \
1824 BINARY_OP_OPT_FUNC(STYPE, STYPE, svec_or, _mm_or_si128); \
1825 BINARY_OP_OPT_FUNC(STYPE, STYPE, svec_and, _mm_and_si128); \
1826 BINARY_OP_OPT_FUNC(STYPE, STYPE, svec_xor, _mm_xor_si128); \
1827 BINARY_OP_L4(STYPE, svec_rem, %); \
1828 BINARY_OP_SCALAR_L4(STYPE, STYPE, svec_rem, %);
1830 #define INT_BINARY_OP_METHODS64(STYPE) \
1831 BINARY_OP_OPT_FUNC64(STYPE, STYPE, svec_or, _mm_or_si128); \
1832 BINARY_OP_OPT_FUNC64(STYPE, STYPE, svec_and, _mm_and_si128); \
1833 BINARY_OP_OPT_FUNC64(STYPE, STYPE, svec_xor, _mm_xor_si128); \
1834 BINARY_OP_L4(STYPE, svec_rem, %); \
1835 BINARY_OP_SCALAR_L4(STYPE, STYPE, svec_rem, %);
// --- Shift by a uniform scalar count. The count is placed in the low 64 bits
// of an XMM register (_mm_set_epi32(0, 0, 0, s)) as the _mm_sll/srl/sra
// intrinsics require. Left shift is identical for signed and unsigned lanes.
1878 static FORCEINLINE svec<4,int16_t> svec_shl(svec<4,int16_t> a, int32_t s) {
1879 return svec<4,int16_t>(_mm_sll_epi16(a.v, _mm_set_epi32(0, 0, 0, s))); \
1883 static FORCEINLINE svec<4,uint16_t> svec_shl(svec<4,uint16_t> a, int32_t s) {
1884 return svec<4,uint16_t>(_mm_sll_epi16(a.v, _mm_set_epi32(0, 0, 0, s))); \
1887 static FORCEINLINE svec<4,int32_t> svec_shl(svec<4,int32_t> a, int32_t s) {
1888 return svec<4,int32_t>(_mm_sll_epi32(a.v, _mm_set_epi32(0, 0, 0, s))); \
1891 static FORCEINLINE svec<4,uint32_t> svec_shl(svec<4,uint32_t> a, int32_t s) {
1892 return svec<4,uint32_t>(_mm_sll_epi32(a.v, _mm_set_epi32(0, 0, 0, s))); \
1895 static FORCEINLINE svec<4,int64_t> svec_shl(svec<4,int64_t> a, int32_t s) {
1896 __m128i amt = _mm_set_epi32(0, 0, 0, s);
1897 return svec<4,int64_t>(_mm_sll_epi64(a.v[0], amt),
1898 _mm_sll_epi64(a.v[1], amt));
1901 static FORCEINLINE svec<4,uint64_t> svec_shl(svec<4,uint64_t> a, int32_t s) {
1902 __m128i amt = _mm_set_epi32(0, 0, 0, s);
1903 return svec<4,uint64_t>(_mm_sll_epi64(a.v[0], amt),
1904 _mm_sll_epi64(a.v[1], amt));
// Right shifts: signed lanes use arithmetic shift (_mm_sra_*, sign-extends),
// unsigned lanes use logical shift (_mm_srl_*, zero-fills).
1911 static FORCEINLINE svec<4,int16_t> svec_shr(svec<4,int16_t> a, int32_t s) {
1912 return svec<4,int16_t>(_mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, s))); \
1915 static FORCEINLINE svec<4,uint16_t> svec_shr(svec<4,uint16_t> a, int32_t s) {
1916 return svec<4,uint16_t>(_mm_srl_epi16(a.v, _mm_set_epi32(0, 0, 0, s))); \
1919 static FORCEINLINE svec<4,int32_t> svec_shr(svec<4,int32_t> a, int32_t s) {
1920 return svec<4,int32_t>(_mm_sra_epi32(a.v, _mm_set_epi32(0, 0, 0, s))); \
1923 static FORCEINLINE svec<4,uint32_t> svec_shr(svec<4,uint32_t> a, int32_t s) {
1924 return svec<4,uint32_t>(_mm_srl_epi32(a.v, _mm_set_epi32(0, 0, 0, s))); \
1928 static FORCEINLINE svec<4,uint64_t> svec_shr(svec<4,uint64_t> a, int32_t s) {
1929 __m128i amt = _mm_set_epi32(0, 0, 0, s);
1930 return svec<4,uint64_t>(_mm_srl_epi64(a.v[0], amt),
1931 _mm_srl_epi64(a.v[1], amt));
1937 #define TERNERY_OPT(STYPE) \
1941 FORCEINLINE svec<LANES,STYPE> svec_madd(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
1947 FORCEINLINE svec<LANES,STYPE> svec_msub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
1953 FORCEINLINE svec<LANES,STYPE> svec_nmsub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
1993 #define MAX_MIN_REDUCE_METHODS(STYPE) \
1994 BINARY_OP_REDUCE_FUNC(STYPE, svec_reduce_add, add<STYPE>); \
1995 BINARY_OP_REDUCE_FUNC(STYPE, svec_reduce_max, max<STYPE>); \
1996 BINARY_OP_REDUCE_FUNC(STYPE, svec_reduce_min, min<STYPE>); \
2011 __m128 s0 = _mm_hadd_ps(v0.v,v1.v);
2012 __m128 s1 = _mm_hadd_ps(v2.v,v3.v);
2013 __m128 s = _mm_hadd_ps(s0, s1);
2018 __m128d s00 = _mm_add_pd(v0.
v[0], v0.
v[1]);
2019 __m128d s01 = _mm_add_pd(v1.
v[0], v1.
v[1]);
2020 __m128d s02 = _mm_add_pd(v2.
v[0], v2.
v[1]);
2021 __m128d s03 = _mm_add_pd(v3.
v[0], v3.
v[1]);
2023 __m128d s0 = _mm_hadd_pd(s00, s01);
2024 __m128d s1 = _mm_hadd_pd(s02, s03);
// Mask equality: reinterpret the float-typed masks as integer vectors and
// compare 32-bit lanes; equal lanes become all-ones, unequal all-zeros.
2036 static FORCEINLINE svec<4,bool> svec_equal(svec<4,bool> a, svec<4,bool> b) {
2037 return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v)));
// Mask inequality: lane-wise complement of equality.
2040 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,bool> a, svec<4,bool> b) {
2041 return svec_not(svec_equal(a, b));
2044 static FORCEINLINE svec<4,bool> svec_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
2045 __m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
2046 return svec<4,bool>(_mm_extract_epi8(cmp, 0),
2047 _mm_extract_epi8(cmp, 1),
2048 _mm_extract_epi8(cmp, 2),
2049 _mm_extract_epi8(cmp, 3));
2051 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
2055 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,int8_t> a, svec<4,int8_t> b) {
2056 __m128i cmp = _mm_cmplt_epi8(a.v, b.v);
2057 return svec<4,bool>(_mm_extract_epi8(cmp, 0),
2058 _mm_extract_epi8(cmp, 1),
2059 _mm_extract_epi8(cmp, 2),
2060 _mm_extract_epi8(cmp, 3));
2063 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
2064 return (a < b) | (a == b);
2067 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,int8_t> a, svec<4,int8_t> b) {
2068 __m128i cmp = _mm_cmpgt_epi8(a.v, b.v);
2069 return svec<4,bool>(_mm_extract_epi8(cmp, 0),
2070 _mm_extract_epi8(cmp, 1),
2071 _mm_extract_epi8(cmp, 2),
2072 _mm_extract_epi8(cmp, 3));
2075 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
2076 return (a > b) | (a == b);
2081 __m128i cmp = _mm_cmpeq_epi8(a.
v, b.
v);
2083 _mm_extract_epi8(cmp, 1),
2084 _mm_extract_epi8(cmp, 2),
2085 _mm_extract_epi8(cmp, 3));
2088 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint8_t> a, svec<4,uint8_t> b) {
2100 __m128i cmp = _mm_cmpeq_epi16(a.
v, b.
v);
2102 _mm_extract_epi16(cmp, 1),
2103 _mm_extract_epi16(cmp, 2),
2104 _mm_extract_epi16(cmp, 3));
2106 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int16_t> a, svec<4,int16_t> b) {
2110 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,int16_t> a, svec<4,int16_t> b) {
2111 __m128i cmp = _mm_cmplt_epi16(a.v, b.v);
2112 return svec<4,bool>(_mm_extract_epi16(cmp, 0),
2113 _mm_extract_epi16(cmp, 1),
2114 _mm_extract_epi16(cmp, 2),
2115 _mm_extract_epi16(cmp, 3));
2118 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,int16_t> a, svec<4,int16_t> b) {
2119 return (a < b) | (a == b);
2122 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,int16_t> a, svec<4,int16_t> b) {
2123 __m128i cmp = _mm_cmpgt_epi16(a.v, b.v);
2124 return svec<4,bool>(_mm_extract_epi16(cmp, 0),
2125 _mm_extract_epi16(cmp, 1),
2126 _mm_extract_epi16(cmp, 2),
2127 _mm_extract_epi16(cmp, 3));
2130 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,int16_t> a, svec<4,int16_t> b) {
2131 return (a > b) | (a == b);
2136 __m128i cmp = _mm_cmpeq_epi16(a.
v, b.
v);
2138 _mm_extract_epi16(cmp, 1),
2139 _mm_extract_epi16(cmp, 2),
2140 _mm_extract_epi16(cmp, 3));
2143 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint16_t> a, svec<4,uint16_t> b) {
2155 __m128i cmp = _mm_cmpeq_epi32(a.
v, b.
v);
2157 _mm_extract_epi32(cmp, 1),
2158 _mm_extract_epi32(cmp, 2),
2159 _mm_extract_epi32(cmp, 3));
2161 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2165 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,int32_t> a, svec<4,int32_t> b) {
2166 __m128i cmp = _mm_cmplt_epi32(a.v, b.v);
2167 return svec<4,bool>(_mm_extract_epi32(cmp, 0),
2168 _mm_extract_epi32(cmp, 1),
2169 _mm_extract_epi32(cmp, 2),
2170 _mm_extract_epi32(cmp, 3));
2173 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2174 return (a < b) | (a == b);
2177 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,int32_t> a, svec<4,int32_t> b) {
2178 __m128i cmp = _mm_cmpgt_epi32(a.v, b.v);
2179 return svec<4,bool>(_mm_extract_epi32(cmp, 0),
2180 _mm_extract_epi32(cmp, 1),
2181 _mm_extract_epi32(cmp, 2),
2182 _mm_extract_epi32(cmp, 3));
2185 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2186 return (a > b) | (a == b);
2191 __m128i cmp = _mm_cmpeq_epi32(a.
v, b.
v);
2193 _mm_extract_epi32(cmp, 1),
2194 _mm_extract_epi32(cmp, 2),
2195 _mm_extract_epi32(cmp, 3));
2198 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint32_t> a, svec<4,uint32_t> b) {
2210 __m128i cmp0 = _mm_cmpeq_epi64(a.
v[0], b.
v[0]);
2211 __m128i cmp1 = _mm_cmpeq_epi64(a.
v[1], b.
v[1]);
2212 return svec<4,bool>(_mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1),
2213 _MM_SHUFFLE(2, 0, 2, 0)));
2215 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int64_t> a, svec<4,int64_t> b) {
2219 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,int64_t> a, svec<4,int64_t> b) {
2223 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,int64_t> a, svec<4,int64_t> b) {
2224 return (a < b) | (a == b);
2227 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,int64_t> a, svec<4,int64_t> b) {
2228 __m128i cmp0 = _mm_cmpgt_epi64(a.v[0], b.v[0]);
2229 __m128i cmp1 = _mm_cmpgt_epi64(a.v[1], b.v[1]);
2230 return svec<4,bool>(_mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1),
2231 _MM_SHUFFLE(2, 0, 2, 0)));
2234 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,int64_t> a, svec<4,int64_t> b) {
2235 return (a > b) | (a == b);
2240 __m128i cmp0 = _mm_cmpeq_epi64(a.
v[0], b.
v[0]);
2241 __m128i cmp1 = _mm_cmpeq_epi64(a.
v[1], b.
v[1]);
2242 return svec<4,bool>(_mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1),
2243 _MM_SHUFFLE(2, 0, 2, 0)));
2246 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint64_t> a, svec<4,uint64_t> b) {
2258 return _mm_cmpeq_ps(a.
v, b.
v);
// --- 4 x float comparisons. Each maps to a single SSE compare, which yields
// a per-lane all-ones/all-zeros mask — the svec<4,bool> representation.
2261 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,float> a, svec<4,float> b) {
2262 return _mm_cmpneq_ps(a.v, b.v);
2265 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,float> a, svec<4,float> b) {
2266 return _mm_cmplt_ps(a.v, b.v);
2269 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,float> a, svec<4,float> b) {
2270 return _mm_cmple_ps(a.v, b.v);
2273 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,float> a, svec<4,float> b) {
2274 return _mm_cmpgt_ps(a.v, b.v);
2277 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,float> a, svec<4,float> b) {
2278 return _mm_cmpge_ps(a.v, b.v);
2284 __m128d cmp0 = _mm_cmpeq_pd(a.
v[0], b.
v[0]);
2285 __m128d cmp1 = _mm_cmpeq_pd(a.
v[1], b.
v[1]);
2286 return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2287 _MM_SHUFFLE(2, 0, 2, 0));
// --- 4 x double comparisons. The 4 lanes live in two __m128d halves: each
// half is compared (giving 64-bit per-lane masks), then the two results are
// narrowed into one 4 x 32-bit mask by keeping the low 32 bits of every
// 64-bit lane (_MM_SHUFFLE(2, 0, 2, 0) selects elements 0 and 2 per half).
2290 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,double> a, svec<4,double> b) {
2291 __m128d cmp0 = _mm_cmpneq_pd(a.v[0], b.v[0]);
2292 __m128d cmp1 = _mm_cmpneq_pd(a.v[1], b.v[1]);
2293 return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2294 _MM_SHUFFLE(2, 0, 2, 0));
2297 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,double> a, svec<4,double> b) {
2298 __m128d cmp0 = _mm_cmplt_pd(a.v[0], b.v[0]);
2299 __m128d cmp1 = _mm_cmplt_pd(a.v[1], b.v[1]);
2300 return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2301 _MM_SHUFFLE(2, 0, 2, 0));
2304 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,double> a, svec<4,double> b) {
2305 __m128d cmp0 = _mm_cmple_pd(a.v[0], b.v[0]);
2306 __m128d cmp1 = _mm_cmple_pd(a.v[1], b.v[1]);
2307 return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2308 _MM_SHUFFLE(2, 0, 2, 0));
2311 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,double> a, svec<4,double> b) {
2312 __m128d cmp0 = _mm_cmpgt_pd(a.v[0], b.v[0]);
2313 __m128d cmp1 = _mm_cmpgt_pd(a.v[1], b.v[1]);
2314 return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2315 _MM_SHUFFLE(2, 0, 2 ,0));
2318 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,double> a, svec<4,double> b) {
2319 __m128d cmp0 = _mm_cmpge_pd(a.v[0], b.v[0]);
2320 __m128d cmp1 = _mm_cmpge_pd(a.v[1], b.v[1]);
2321 return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2322 _MM_SHUFFLE(2, 0, 2, 0));
2340 #define CAST_OPT(SFROM, STO) \
2341 template <class T> static T svec_cast(svec<LANES,SFROM> val); \
2345 template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
2346 return svec<LANES,STO>((val.v)); \
2352 #define CAST_OPT64(SFROM, STO) \
2353 template <class T> static T svec_cast(svec<LANES,SFROM> val); \
2357 template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
2358 return svec<LANES,STO>((val.v[0]),(val.v[1])); \
// NOTE(review): Doxygen/extraction residue. These are repeated forward
// declarations of the svec_cast<T>(svec<4,bool>) template — one per emitted
// specialization — with most specialization signatures and braces dropped
// by the extraction. Only two body fragments survive below. Verify the
// full specializations against the original sse4.h.
2365 template <
class T>
static T svec_cast(svec<4,bool> val);
2373 template <
class T>
static T svec_cast(svec<4,bool> val);
2381 template <
class T>
static T svec_cast(svec<4,bool> val);
2389 template <
class T>
static T svec_cast(svec<4,bool> val);
2397 template <
class T>
static T svec_cast(svec<4,bool> val);
// Orphan body fragment: bit-casts the __m128 bool mask to __m128i
// (the enclosing specialization's signature was lost in extraction).
2402 return _mm_castps_si128(val.v);
2405 template <
class T>
static T svec_cast(svec<4,bool> val);
// Orphan body fragment: ANDs the reinterpreted mask with an all-ones
// vector. NOTE(review): _mm_set1_epi32(-1) makes the AND an identity
// operation — presumably kept for uniformity with other cast bodies;
// confirm intent against the original sse4.h.
2410 return _mm_and_si128(_mm_castps_si128(val.v), _mm_set1_epi32(-1));
2415 template <
class T>
static T svec_cast(
svec<4,bool> val);
2423 template <
class T>
static T svec_cast(svec<4,bool> val);
// NOTE(review): extraction residue — body fragments of several numeric
// svec_cast specializations whose signatures were dropped. Comments below
// describe each fragment from its surviving intrinsics; verify the
// enclosing signatures against the original sse4.h.
// Fragment: int32 -> float conversion (_mm_cvtepi32_ps converts 4 lanes).
2499 return _mm_cvtepi32_ps(val.v);
2502 template <
class T>
static T svec_cast(svec<4,int32_t> val);
// Fragment: int32 -> double. r0 converts the low two lanes; the shuffle
// moves lanes {2,3} into the low positions so r1 converts the high pair.
2507 __m128d r0 = _mm_cvtepi32_pd(val.v);
2508 __m128 shuf = _mm_shuffle_ps(_mm_castsi128_ps(val.v),
2509 _mm_castsi128_ps(val.v),
2510 _MM_SHUFFLE(3, 2, 3, 2));
2511 __m128d r1 = _mm_cvtepi32_pd(_mm_castps_si128(shuf));
// Fragment: float -> int32 with truncation toward zero (cvtt).
2566 return _mm_cvttps_epi32(val.v);
// Fragment: float -> double, upper half — the shuffle brings lanes {2,3}
// low before widening (the surrounding expression was lost in extraction).
2579 _mm_cvtps_pd(_mm_shuffle_ps(val.v, val.v,
2580 _MM_SHUFFLE(3, 2, 3, 2))));
// Fragment: double -> int32. Each half converts to two int32 lanes in the
// low half of an __m128i; the shuffle packs both low halves together.
2595 __m128i r0 = _mm_cvtpd_epi32(val.v[0]);
2596 __m128i r1 = _mm_cvtpd_epi32(val.v[1]);
2597 return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1),
2598 _MM_SHUFFLE(1, 0, 1, 0)));
// Fragment: double -> float. Each half narrows to two floats in the low
// half of an __m128; the shuffle merges the two low pairs into 4 lanes.
2610 __m128 r0 = _mm_cvtpd_ps(val.v[0]);
2611 __m128 r1 = _mm_cvtpd_ps(val.v[1]);
2612 return _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 0, 1, 0));
// CAST_BITS_OPT(SFROM, STO, func): declares the generic svec_cast_bits
// template and a specialization that reinterprets (does not convert) the
// register via the supplied intrinsic `func` (e.g. a _mm_cast* routine).
// NOTE(review): extraction artifact — original line numbers are fused into
// the text and the macro's closing "}" continuation line appears to have
// been dropped; verify against the original sse4.h.
2622 #define CAST_BITS_OPT(SFROM, STO, func) \
2623 template <class T> static T svec_cast_bits(svec<LANES,SFROM> val); \
2627 template <> FORCEINLINE svec<LANES,STO> svec_cast_bits<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
2628 return svec<LANES,STO>(func(val.v)); \
// CAST_BITS_OPT64(SFROM, STO, func): two-register variant of
// CAST_BITS_OPT for 64-bit-element types — applies `func` to each of the
// two __m128 halves (val.v[0], val.v[1]) independently.
// NOTE(review): extraction artifact — original line numbers are fused into
// the text and the macro's closing "}" continuation line appears to have
// been dropped; verify against the original sse4.h.
2634 #define CAST_BITS_OPT64(SFROM, STO, func) \
2635 template <class T> static T svec_cast_bits(svec<LANES,SFROM> val); \
2639 template <> FORCEINLINE svec<LANES,STO> svec_cast_bits<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
2640 return svec<LANES,STO>(func(val.v[0]), func(val.v[1])); \
// SUBSCRIPT_FUNC_IMPL_SSE(STYPE): generates both operator[] overloads for
// svec<LANES,STYPE>. The mutable overload returns a reference by pointer-
// casting into the register storage `v` (NOTE(review): this aliases the
// SIMD register through STYPE* — strict-aliasing caveat; presumably relied
// on throughout this header). The const overload goes through
// svec_extract instead of touching memory directly.
// NOTE(review): extraction artifact — original line numbers are fused into
// the text and the closing "}" continuation lines of both overloads appear
// to have been dropped; verify against the original sse4.h.
2667 #define SUBSCRIPT_FUNC_IMPL_SSE(STYPE) \
2668 FORCEINLINE STYPE& svec<LANES,STYPE>::operator[](int index) { \
2669 return ((STYPE *)&v)[index]; \
2671 const FORCEINLINE STYPE svec<LANES,STYPE>::operator[](int index) const { \
2672 return svec_extract(*this, index); \
// NOTE(review): extraction residue — orphan body statements from several
// accessor/operator methods whose signatures were dropped. From the
// surviving calls: the first two write a lane via svec_insert (the second
// explicitly converting a helper proxy to uint32_t first); the next two
// read a lane via svec_extract; the last two delegate operator==/!= to
// svec_equal/svec_not_equal. Verify the enclosing method signatures
// against the original sse4.h.
2677 svec_insert(m_self, m_index, value);
2680 svec_insert(m_self, m_index, helper.operator uint32_t());
2683 return svec_extract(*m_self, m_index);
2686 return svec_extract(*
this, index);
2768 return svec_equal(*
this, a);
2777 return svec_not_equal(*
this, a);
#define COUT_FUNC_BOOL_DECL()
Definition: gsimd_utility.h:266
#define INT_BINARY_OP_METHODS64(STYPE)
Definition: sse4.h:1830
svec()
Default constructor.
Definition: sse4.h:190
svec(int16_t a)
Constructor.
Definition: sse4.h:354
#define CAST_OPT(SFROM, STO)
cast implemented by directly reinterpreting the underlying __m128 register type
Definition: sse4.h:2340
#define GATHER_STRIDE_L4(STYPE, OSTYPE)
macros for fast impl of gather base step
Definition: gsimd_utility.h:682
svec< 4, bool > svec_select(svec< 4, bool > mask, svec< 4, bool > a, svec< 4, bool > b)
construct c by selecting elements from two input vectors according to the mask
Definition: sse4.h:1070
svec(__m128i vv)
For internal use only.
Definition: sse4.h:437
#define VEC_INT_CLASS_METHOD_DECL(STYPE, USTYPE)
macros method definition for integer vector only Note: shift's operator can only be unsigned vector ...
Definition: gsimd_utility.h:379
svec(int64_t a, int64_t b, int64_t c, int64_t d)
Constructor.
Definition: sse4.h:549
#define CMP_ALL_MASKED_OP(STYPE)
Definition: gsimd_utility.h:1099
#define SCATTER_BASE_OFFSETS_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:789
svec(void *p0, void *p1, void *p2, void *p3)
Constructor.
Definition: sse4.h:1334
Definition: gsimd_utility.h:93
data representation and operations on a vector of 4 unsigned long long.
Definition: sse4.h:584
svec()
Default constructor,.
Definition: sse4.h:536
svec(__m128i vv)
For internal use only.
Definition: sse4.h:202
#define BINARY_OP_OPT_FUNC64(STYPE, STYPE2, NAME, FUNC)
Definition: sse4.h:1730
#define BINARY_OP2_L4(STYPE, STYPE2, NAME, OP)
macros for generic slow imple of binary operation, style 2
Definition: gsimd_utility.h:893
#define GATHER_GENERAL_L4(STYPE, PSTYPE)
slow implementation of gather general Must use template to specify the return type ...
Definition: gsimd_utility.h:617
data representation and operations on a vector of 4 double.
Definition: sse4.h:685
svec(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
Constructor.
Definition: sse4.h:298
svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
Constructor.
Definition: sse4.h:495
data representation and operations on a vector of 4 unsigned short.
Definition: sse4.h:377
svec(__m128 vv)
For internal use only.
Definition: sse4.h:650
#define VEC_FLOAT_CLASS_METHOD_DECL(STYPE)
Definition: gsimd_utility.h:393
#define VEC_CLASS_METHOD_DECL(STYPE)
macros for non-mask i8 - double types's method
Definition: gsimd_utility.h:350
#define BIN_VEC_SCAL(STYPE)
Definition: sse4.h:1785
#define BINARY_OP_FUNC(STYPE, NAME, FUNC)
Definition: gsimd_utility.h:869
data representation and operations on a vector of 4 unsigned int.
Definition: sse4.h:478
svec()
Default constructor.
Definition: sse4.h:590
__m128i v[2]
Definition: sse4.h:585
#define SVEC_BOOL_CLASS_METHOD_DECL()
macros for svec<N,bool> class's class method
Definition: gsimd_utility.h:330
__m128i v
Definition: sse4.h:330
#define LOAD_CONST_SSE(STYPE)
Definition: sse4.h:1280
#define SHUFFLES_L4(STYPE)
macro for shuffle/shuffle2 methods implementation
Definition: gsimd_utility.h:537
#define INSERT_EXTRACT_SSEOPT(STYPE, FUNC)
Definition: sse4.h:754
svec(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
Constructor.
Definition: sse4.h:603
svec(float a, float b, float c, float d)
Constructor.
Definition: sse4.h:655
svec(__m128i a, __m128i b)
For internal use only. Construct svec<4,uint64_t> with two __m128i objects.
Definition: sse4.h:595
__m128 v
Definition: sse4.h:184
data representation and operations on a vector of 4 float.
Definition: sse4.h:638
svec< 4,float > svec_preduce_add(svec< 4, float > v0, svec< 4, float > v1, svec< 4, float > v2, svec< 4, float > v3)
Definition: sse4.h:2010
#define INC_STATS_NAME(stat, inc, opname)
Definition: gsimd_utility.h:156
#define COUT_FUNC_DECL(STYPE)
Definition: gsimd_utility.h:283
data representation and operations on a vector of 4 unsigned chars.
Definition: sse4.h:281
svec(uint8_t a)
Constructor.
Definition: sse4.h:307
__m128i v
Definition: sse4.h:234
svec(int64_t a)
Constructor.
Definition: sse4.h:560
#define CAST_BITS_OPT(SFROM, STO, func)
cast implemented by directly reinterpreting the underlying vector register type
Definition: sse4.h:2622
svec(__m128 vv)
For internal use only.
Definition: sse4.h:196
svec(int8_t a, int8_t b, int8_t c, int8_t d)
Constructor.
Definition: sse4.h:251
#define UNARY_OP_L4(STYPE, NAME, OP)
Definition: gsimd_utility.h:841
svec(uint16_t a, uint16_t b, uint16_t c, uint16_t d)
Constructor.
Definition: sse4.h:394
#define SUBSCRIPT_FUNC_IMPL_SSE(STYPE)
this macro uses sse specific intrinsics to do extract, insert
Definition: sse4.h:2667
svec()
Default constructor.
Definition: sse4.h:691
#define VEC_CMP_IMPL(STYPE)
Definition: gsimd_utility.h:1175
#define SUBSCRIPT_FUNC_DECL(STYPE)
macros to define a intrinsic based subscript opertor
Definition: gsimd_utility.h:247
__m128i v
Definition: sse4.h:378
#define INSERT_EXTRACT_SSEOPT64(STYPE, FUNC)
Definition: sse4.h:770
svec(uint64_t a)
Constructor.
Definition: sse4.h:614
#define MVEC_CLASS_METHOD_IMPL(STYPE)
mask class's class method impl
Definition: gsimd_utility.h:1285
data representation and operations on a vector of 4 signed int.
Definition: sse4.h:425
#define SUBSCRIPT_FUNC_BOOL_DECL(STYPE)
Definition: gsimd_utility.h:251
#define VEC_CLASS_METHOD_IMPL(STYPE)
Definition: gsimd_utility.h:1301
svec(__m128i vv)
For internal use only.
Definition: sse4.h:246
#define GATHER_BASE_OFFSETS_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:658
#define INT_BINARY_OP_METHODS(STYPE)
Definition: sse4.h:1823
svec(int a, int b, int c, int d)
Constructor.
Definition: sse4.h:442
data representation and operations on a vector of 4 signed short.
Definition: sse4.h:329
data representation and operations on a vector of 4 signed chars.
Definition: sse4.h:233
#define MASKED_LOAD_STORE_L4(STYPE)
Definition: gsimd_utility.h:797
#define VEC_FLOAT_CLASS_METHOD_IMPL(STYPE)
Definition: gsimd_utility.h:1433
#define SCATTER_STRIDE_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:715
#define INSERT_EXTRACT_SSE(STYPE)
macros for svec's insert extract method implementation The implementation is based on vector type's s...
Definition: sse4.h:746
svec(uint32_t a)
Constructor.
Definition: sse4.h:219
data representation and operations on a vector of 4 signed long long.
Definition: sse4.h:530
#define TERNERY_OPT(STYPE)
Definition: sse4.h:1937
__m128i v
Definition: sse4.h:479
__m128i v
Definition: sse4.h:282
#define CAST_BITS_OPT64(SFROM, STO, func)
cast implemented by directly reinterpreting the underlying vector register type
Definition: sse4.h:2634
svec(uint32_t a)
Constructor.
Definition: sse4.h:503
svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
Constructor.
Definition: sse4.h:209
svec()
Default constructor.
Definition: sse4.h:644
#define CAST_L4(SFROM, STO)
Definition: gsimd_utility.h:1124
#define UNARY_OP_OPT(STYPE, NAME, OP)
Definition: sse4.h:1619
#define BINARY_OP_L4(STYPE, NAME, OP)
macros for generic slow imple of binary operation
Definition: gsimd_utility.h:880
__m128i v
Definition: sse4.h:426
__m128i v[2]
Definition: sse4.h:531
svec()
Default constructor.
Definition: sse4.h:240
#define UNARY_OP_OPT64(STYPE, NAME, OP)
macros for 64bit object, i64/u64/double
Definition: sse4.h:1627
svec()
Default constructor.
Definition: sse4.h:431
svec()
Default constructor.
Definition: sse4.h:335
#define BINARY_OP_OPT_FUNC(STYPE, STYPE2, NAME, FUNC)
Definition: sse4.h:1725
svec(double a)
Constructor.
Definition: sse4.h:713
svec(int8_t a)
Constructor.
Definition: sse4.h:259
svec()
Default constructor.
Definition: sse4.h:383
svec(__m128d a, __m128d b)
For internal use only. Construct svec<4,double> with two __m128d values.
Definition: sse4.h:696
#define SCATTER_GENERAL_L4(STYPE, PSTYPE)
Definition: gsimd_utility.h:756
#define BINARY_OP_FUNC_L4(STYPE, NAME, FUNC)
Definition: gsimd_utility.h:904
#define ROTATE_L4(STYPE)
macro for rotate method implementation
Definition: gsimd_utility.h:507
svec(uint16_t a)
Constructor.
Definition: sse4.h:402
#define BINARY_OP_SCALAR_L4(STYPE, STYPE2, NAME, OP)
macros for binary: vector op scalar
Definition: gsimd_utility.h:917
#define COUT_FUNC_CHAR_DECL(STYPE)
Definition: gsimd_utility.h:275
svec(__m128i vv)
For internal use only.
Definition: sse4.h:490
svec(__m128i vv)
For internal use only.
Definition: sse4.h:341
svec(double a, double b, double c, double d)
Constructor.
Definition: sse4.h:704
svec(int16_t a, int16_t b, int16_t c, int16_t d)
Constructor.
Definition: sse4.h:346
svec(int32_t a)
Constructor.
Definition: sse4.h:450
svec(__m128i a, __m128i b)
For internal use only. Construct svec<4,int64_t> with two __m128i objects.
Definition: sse4.h:541
svec()
Default constructor.
Definition: sse4.h:287
__m128 v
Definition: sse4.h:639
svec(__m128i vv)
For internal use only.
Definition: sse4.h:389
svec()
Default constructor.
Definition: sse4.h:484
#define CMP_OP_L4(STYPE, NAME, OP)
Definition: gsimd_utility.h:1057
#define CAST_OPT64(SFROM, STO)
cast implemented by directly reinterpreting the underlying vector register type
Definition: sse4.h:2352
#define SELECT_BOOLCOND(STYPE)
macros for svec's select by bool scalar method implementation
Definition: gsimd_utility.h:459
__m128d v[2]
Definition: sse4.h:686
#define VEC_INT_CLASS_METHOD_IMPL(STYPE, STYPE2)
Definition: gsimd_utility.h:1394
#define FORCEINLINE
Definition: gsimd_utility.h:175
Data representation and operations on a vector of 4 boolean values. This is used in predicated vector...
Definition: sse4.h:182
svec(float a)
Constructor.
Definition: sse4.h:663
svec(__m128i vv)
For internal use only.
Definition: sse4.h:293
#define MAX_MIN_REDUCE_METHODS(STYPE)
Definition: sse4.h:1993