Generic SIMD Intrinsic Library API 0.6 — Doxygen source listing of sse4.h
 All Classes Namespaces Files Functions Variables Typedefs Friends Macros
sse4.h
Go to the documentation of this file.
1 
103 #ifndef SSE4_H_
104 #define SSE4_H_
105 
106 #include <stdint.h>
107 #include <math.h>
108 #include <assert.h>
109 #include <iostream>
110 
111 #include <smmintrin.h>
112 #include <nmmintrin.h>
113 #include "gsimd_utility.h"
114 
115 
116 #ifndef __SSE4_2__
117 # error "SSE 4.2 must be enabled in the C++ compiler to use this header."
118 #endif // !__SSE4_2__
119 
120 namespace sse {
121 
122 #define LANES 4
123 
125 //
126 // Constructor Section
127 //
// Primary template: inherits from invalid_template_arguments<Lanes,T>::type
// (declared in gsimd_utility.h, not visible here) so that instantiating svec
// with an unsupported Lanes/T combination fails to compile. Only the explicit
// 4-lane specializations below are usable.
129 template <int Lanes, class T>
130 struct svec : public invalid_template_arguments<Lanes,T>::type {
131  //here we need to add the static assert
132 };
133 
// Forward declarations of the twelve supported 4-lane specializations
// (mask, signed/unsigned 8/16/32/64-bit integers, float, double, pointer).
134 template <>
135 struct svec<4,bool>;
136 template <>
137  struct svec<4,int8_t>;
138 template <>
139  struct svec<4,uint8_t>;
140 template <>
141  struct svec<4,int16_t>;
142 template <>
143  struct svec<4,uint16_t>;
144 template <>
145  struct svec<4,int32_t>;
146 template <>
147  struct svec<4,uint32_t>;
148 template <>
149  struct svec<4,int64_t>;
150 template <>
151  struct svec<4,uint64_t>;
152 template <>
153  struct svec<4,float>;
154 template <>
155  struct svec<4,double>;
156 template <>
157  struct svec<4,void*>;
158 
159 //required because macros are confused by the , in the template declaration
160 //typedef svec<4,bool> _svec4_i1;
161 //typedef svec<4,int8_t> svec<4,int8_t>;
162 //typedef svec<4,uint8_t> svec<4,uint8_t>;
163 //typedef svec<4,int16_t> svec<4,int16_t>;
164 //typedef svec<4,uint16_t> svec<4,uint16_t>;
165 //typedef svec<4,int32_t> svec<4,int32_t>;
166 //typedef svec<4,uint32_t> svec<4,uint32_t>;
167 //typedef svec<4,int64_t> _svec4_i64;
168 //typedef svec<4,uint64_t> _svec4_u64;
169 //typedef svec<4,float> _svec4_f;
170 //typedef svec<4,double> _svec4_d;
171 //typedef svec<4,void*> _svec4_ptr;
172 
// 4-lane boolean mask. Each 32-bit lane is all-ones (true) or all-zeros
// (false); stored as __m128 so it feeds blend/movemask intrinsics directly.
181 template<>
182 struct svec<4,bool> {
183 
184  __m128 v; //only use 4 bits
185 
// Wrap an existing float-typed mask register.
196  FORCEINLINE svec(__m128 vv) : v(vv) { }
// Wrap an integer-typed mask register (bit-for-bit reinterpret, no conversion).
202  FORCEINLINE svec(__m128i vv) : v(_mm_castsi128_ps(vv)) { }
// Per-lane construction: any nonzero input becomes an all-ones lane.
// Note _mm_set_epi32 takes arguments high-lane-first, hence d,c,b,a.
209  FORCEINLINE svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
210  v = _mm_castsi128_ps(_mm_set_epi32(d ? -1 : 0, c ? -1 : 0,
211  b ? -1 : 0, a ? -1 : 0));
212  }
// Splat construction: all four lanes true (all-ones) or all false (zero).
219  FORCEINLINE svec(uint32_t a){
220  v = (a != 0) ? _mm_castsi128_ps(_mm_set1_epi32(-1)) : _mm_setzero_ps();
221  }
222 
226 };
227 
228 
// 4 x int8_t vector. Only the low 4 bytes of the 128-bit register are
// meaningful; the upper 12 bytes are kept zero by the constructors.
// SUBSCRIPT_FUNC_DECL / VEC_*_METHOD_DECL come from gsimd_utility.h
// (expansions not visible in this file).
232 template <>
233 struct svec<4,int8_t> {
234  __m128i v;
235 
// Wrap an existing register.
246  FORCEINLINE svec(__m128i vv) : v(vv) { }
// Per-lane construction; _mm_set_epi8 is high-byte-first, so the four
// payload bytes go last and the upper 12 bytes are zeroed explicitly.
251  FORCEINLINE svec(int8_t a, int8_t b, int8_t c, int8_t d) {
252  v = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
253  0, 0, 0, 0, d, c, b, a);
254  }
// Splat construction; a compile-time-constant zero uses the cheaper
// zeroing idiom (__builtin_constant_p folds at compile time).
259  FORCEINLINE svec( int8_t a) {
260  if(__builtin_constant_p(a) && a == 0) {
261  v = _mm_setzero_si128 ();
262  } else {
263  v = _mm_set1_epi8(a);
264  }
265  }
270  SUBSCRIPT_FUNC_DECL(int8_t);
272 
273  VEC_CLASS_METHOD_DECL(int8_t);
274  VEC_INT_CLASS_METHOD_DECL(int8_t, uint8_t);
275 };
276 
// 4 x uint8_t vector; mirrors svec<4,int8_t> (low 4 bytes used, rest zero).
280 template<>
281 struct svec<4,uint8_t> {
282  __m128i v;
// Wrap an existing register.
293  FORCEINLINE svec(__m128i vv) : v(vv) { }
// Per-lane construction; set_epi8 is high-byte-first, upper 12 bytes zeroed.
298  FORCEINLINE svec(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
299  v = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
300  0, 0, 0, 0, d, c, b, a);
301  }
// Splat; constant zero takes the cheaper zeroing path.
307  FORCEINLINE svec(uint8_t a){
308  if(__builtin_constant_p(a) && a == 0) {
309  v = _mm_setzero_si128 ();
310  } else {
311  v = _mm_set1_epi8(a);
312  }
313  }
318  SUBSCRIPT_FUNC_DECL(uint8_t);
320 
321  VEC_CLASS_METHOD_DECL(uint8_t);
322  VEC_INT_CLASS_METHOD_DECL(uint8_t, uint8_t);
323 };
324 
// 4 x int16_t vector; low 4 half-words used, upper 4 kept zero.
328 template <>
329 struct svec<4,int16_t> {
330  __m128i v;
// Wrap an existing register.
341  FORCEINLINE svec(__m128i vv) : v(vv) { }
// Per-lane construction; set_epi16 is high-element-first.
346  FORCEINLINE svec(int16_t a, int16_t b, int16_t c, int16_t d) {
347  v = _mm_set_epi16(0, 0, 0, 0, d, c, b, a);
348  }
// Splat; constant zero takes the cheaper zeroing path.
354  FORCEINLINE svec( int16_t a) {
355  if(__builtin_constant_p(a) && a == 0) {
356  v = _mm_setzero_si128 ();
357  } else {
358  v = _mm_set1_epi16(a);
359  }
360  }
365  SUBSCRIPT_FUNC_DECL(int16_t);
366  COUT_FUNC_DECL(int16_t);
367 
368  VEC_CLASS_METHOD_DECL(int16_t);
369  VEC_INT_CLASS_METHOD_DECL(int16_t, uint16_t);
370 
371 };
372 
// 4 x uint16_t vector; mirrors svec<4,int16_t>.
376 template <>
377 struct svec<4,uint16_t> {
378  __m128i v;
// Wrap an existing register.
389  FORCEINLINE svec(__m128i vv) : v(vv) { }
// Per-lane construction; set_epi16 is high-element-first.
394  FORCEINLINE svec(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
395  v = _mm_set_epi16(0, 0, 0, 0, d, c, b, a);
396  }
// Splat; constant zero takes the cheaper zeroing path.
402  FORCEINLINE svec(uint16_t a) {
403  if(__builtin_constant_p(a) && a == 0) {
404  v = _mm_setzero_si128 ();
405  } else {
406  v = _mm_set1_epi16(a);
407  }
408  }
413  SUBSCRIPT_FUNC_DECL(uint16_t);
414  COUT_FUNC_DECL(uint16_t);
415 
416  VEC_CLASS_METHOD_DECL(uint16_t);
417  VEC_INT_CLASS_METHOD_DECL(uint16_t, uint16_t);
418 
419 };
420 
// 4 x int32_t vector; all four 32-bit lanes of the register are used.
424 template <>
425 struct svec<4,int32_t> {
426  __m128i v;
// Wrap an existing register.
437  FORCEINLINE svec(__m128i vv) : v(vv) { }
// Per-lane construction; set_epi32 is high-element-first.
442  FORCEINLINE svec(int a, int b, int c, int d) {
443  v = _mm_set_epi32(d, c, b, a);
444  }
// Splat; constant zero takes the cheaper zeroing path.
450  FORCEINLINE svec(int32_t a) {
451  if(__builtin_constant_p(a) && a == 0) {
452  v = _mm_setzero_si128 ();
453  } else {
454  v = _mm_set1_epi32(a);
455  }
456  }
// Implicit bit-cast to the float register type (used by mask-style blends).
461  FORCEINLINE operator __m128() const { return _mm_castsi128_ps(v); }
466  SUBSCRIPT_FUNC_DECL(int32_t);
467  COUT_FUNC_DECL(int32_t);
468 
469  VEC_CLASS_METHOD_DECL(int32_t);
470  VEC_INT_CLASS_METHOD_DECL(int32_t, uint32_t);
471 
472 };
473 
// 4 x uint32_t vector; mirrors svec<4,int32_t>.
477 template <>
478 struct svec<4,uint32_t> {
479  __m128i v;
// Wrap an existing register.
490  FORCEINLINE svec(__m128i vv) : v(vv) { }
// Per-lane construction; set_epi32 is high-element-first.
495  FORCEINLINE svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
496  v = _mm_set_epi32(d, c, b, a);
497  }
// Splat; constant zero takes the cheaper zeroing path.
503  FORCEINLINE svec(uint32_t a) {
504  if(__builtin_constant_p(a) && a == 0) {
505  v = _mm_setzero_si128 ();
506  } else {
507  v = _mm_set1_epi32(a);
508  }
509  }
// Implicit bit-cast to the float register type (used by mask-style blends).
514  FORCEINLINE operator __m128() const { return _mm_castsi128_ps(v); }
519  SUBSCRIPT_FUNC_DECL(uint32_t);
520  COUT_FUNC_DECL(uint32_t);
521 
522  VEC_CLASS_METHOD_DECL(uint32_t);
523  VEC_INT_CLASS_METHOD_DECL(uint32_t, uint32_t);
524 };
525 
// 4 x int64_t vector: two 128-bit registers, two 64-bit lanes each.
// v[0] holds lanes 0..1, v[1] holds lanes 2..3.
529 template <>
530 struct svec<4,int64_t> {
531  __m128i v[2];
// Wrap two existing registers (a = lanes 0..1, b = lanes 2..3).
541  FORCEINLINE svec(__m128i a, __m128i b){
542  v[0] = a;
543  v[1] = b;
544  }
// Per-lane construction: each 64-bit value is split into 32-bit halves
// because there is no 64-bit _mm_set with this argument shape here.
549  FORCEINLINE svec(int64_t a, int64_t b, int64_t c, int64_t d) {
550  v[0] = _mm_set_epi32((b >> 32) & 0xffffffff, b & 0xffffffff,
551  (a >> 32) & 0xffffffff, a & 0xffffffff);
552  v[1] = _mm_set_epi32((d >> 32) & 0xffffffff, d & 0xffffffff,
553  (c >> 32) & 0xffffffff, c & 0xffffffff);
554  }
// Splat; constant zero takes the cheaper zeroing path.
560  FORCEINLINE svec( int64_t a) {
561  if(__builtin_constant_p(a) && a == 0) {
562  v[0] = v[1] = _mm_setzero_si128 ();
563  } else {
564  int a1 = (a >> 32) & 0xffffffff;
565  int a0 = a & 0xffffffff;
566  v[0] = v[1] = _mm_set_epi32(a1, a0, a1, a0);
567  }
568  }
573  SUBSCRIPT_FUNC_DECL(int64_t);
574  COUT_FUNC_DECL(int64_t);
575 
576  VEC_CLASS_METHOD_DECL(int64_t);
577  VEC_INT_CLASS_METHOD_DECL(int64_t, uint64_t);
578 };
579 
// 4 x uint64_t vector; mirrors svec<4,int64_t> (two registers, split halves).
583 template <>
584 struct svec<4,uint64_t> {
585  __m128i v[2];
// Wrap two existing registers (a = lanes 0..1, b = lanes 2..3).
595  FORCEINLINE svec(__m128i a, __m128i b){
596  v[0] = a;
597  v[1] = b;
598  }
// Per-lane construction via 32-bit halves of each 64-bit value.
603  FORCEINLINE svec(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
604  v[0] = _mm_set_epi32((b >> 32) & 0xffffffff, b & 0xffffffff,
605  (a >> 32) & 0xffffffff, a & 0xffffffff);
606  v[1] = _mm_set_epi32((d >> 32) & 0xffffffff, d & 0xffffffff,
607  (c >> 32) & 0xffffffff, c & 0xffffffff);
608  }
// Splat; constant zero takes the cheaper zeroing path.
614  FORCEINLINE svec( uint64_t a) {
615  if(__builtin_constant_p(a) && a == 0) {
616  v[0] = v[1] = _mm_setzero_si128 ();
617  } else {
618  int a1 = (a >> 32) & 0xffffffff;
619  int a0 = a & 0xffffffff;
620  v[0] = v[1] = _mm_set_epi32(a1, a0, a1, a0);
621  }
622  }
627  SUBSCRIPT_FUNC_DECL(uint64_t);
628  COUT_FUNC_DECL(uint64_t);
629 
630  VEC_CLASS_METHOD_DECL(uint64_t);
631  VEC_INT_CLASS_METHOD_DECL(uint64_t, uint64_t);
632 };
633 
// 4 x float vector in a single __m128 register.
637 template<>
638 struct svec<4,float> {
639  __m128 v;
// Wrap an existing register.
650  FORCEINLINE svec(__m128 vv) : v(vv) { }
// Per-lane construction; _mm_set_ps is high-element-first.
655  FORCEINLINE svec(float a, float b, float c, float d) {
656  v = _mm_set_ps(d, c, b, a);
657  }
// Splat; constant 0.0f takes the cheaper zeroing path.
663  FORCEINLINE svec( float a) {
664  if(__builtin_constant_p(a) && a == 0) {
665  v = _mm_setzero_ps();
666  } else {
667  v = _mm_set1_ps(a);
668  }
669  }
674  SUBSCRIPT_FUNC_DECL(float);
676 
677  VEC_CLASS_METHOD_DECL(float);
679 };
680 
// 4 x double vector: two __m128d registers, two lanes each
// (v[0] = lanes 0..1, v[1] = lanes 2..3).
684 template<>
685 struct svec<4,double> {
686  __m128d v[2];
// Wrap two existing registers.
696  FORCEINLINE svec(__m128d a, __m128d b){
697  v[0] = a;
698  v[1] = b;
699  }
// Per-lane construction; _mm_set_pd is high-element-first within each half.
704  FORCEINLINE svec(double a, double b, double c, double d) {
705  v[0] = _mm_set_pd(b, a);
706  v[1] = _mm_set_pd(d, c);
707  }
// Splat; constant 0.0 takes the cheaper zeroing path.
713  FORCEINLINE svec( double a) {
714  if (__builtin_constant_p(a) && a == 0) {
715  v[0] = v[1] = _mm_setzero_pd();
716  } else {
717  v[0] = v[1] = _mm_set1_pd(a);
718  }
719  }
724  SUBSCRIPT_FUNC_DECL(double);
725  COUT_FUNC_DECL(double);
726 
727  VEC_CLASS_METHOD_DECL(double);
729 };
730 
731 
733 //
734 // Data operation interfaces
735 //
737 
738 //
740 //
741 
// Generic lane extract/insert: spills the vector through a pointer cast and
// reads/writes the lane in memory. Used when no suitable intrinsic exists
// (float/double) or on targets without the 64-bit intrinsic forms.
746 #define INSERT_EXTRACT_SSE(STYPE) \
747  static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
748  return ((STYPE*)&v)[index]; \
749  } \
750  static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
751  ((STYPE*)v)[index] = val; \
752  }
753 
// Optimized lane extract/insert for single-register element types: when the
// index is a compile-time constant in [0,4) (the intrinsics require an
// immediate), use _mm_extract_/_mm_insert_<FUNC>; otherwise fall back to the
// memory path as in INSERT_EXTRACT_SSE.
754 #define INSERT_EXTRACT_SSEOPT(STYPE, FUNC) \
755  static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
756  if(__builtin_constant_p(index) && index >=0 && index < 4) { \
757  return (STYPE)_mm_extract_##FUNC(v.v, index); \
758  } else { \
759  return ((STYPE*)&v)[index]; \
760  } \
761  } \
762  static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
763  if(__builtin_constant_p(index) && index >=0 && index < 4) { \
764  v->v = _mm_insert_##FUNC(v->v, val, index); \
765  } else {\
766  ((STYPE*)v)[index] = val; \
767  } \
768  }
769 
// Optimized extract/insert for the two-register 64-bit types: lane index is
// split into register selector (index>>1) and lane-within-register (index%2).
// Constant-index path only; otherwise the memory fallback is used.
770 #define INSERT_EXTRACT_SSEOPT64(STYPE, FUNC) \
771  static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
772  if(__builtin_constant_p(index) && index >=0 && index < 4) { \
773  return (STYPE)_mm_extract_##FUNC(v.v[index>>1], index%2); \
774  } else { \
775  return ((STYPE*)&v)[index]; \
776  } \
777  } \
778  static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
779  if(__builtin_constant_p(index) && index >=0 && index < 4) { \
780  v->v[index>>1] = _mm_insert_##FUNC(v->v[index>>1], val, index%2); \
781  } else { \
782  ((STYPE*)v)[index] = val; \
783  } \
784  }
785 
786 //i1 use different approach
787 static FORCEINLINE uint32_t svec_extract(svec<4,bool> v, int index) {
788  if(__builtin_constant_p(index) && index >=0 && index < 4) {
789  return _mm_extract_epi32(_mm_castps_si128(v.v), index);
790  } else
791  {
792  return ((uint32_t*)&v)[index];
793  }
794 
795 }
796 static FORCEINLINE void svec_insert(svec<4,bool> *v, int index, uint32_t val) {
797  if(__builtin_constant_p(index) && index >=0 && index < 4) {
798  v->v = _mm_castsi128_ps(_mm_insert_epi32(_mm_castps_si128(v->v), val ? -1 : 0, index));
799  } else {
800  ((uint32_t *)v)[index] = val ? -1 : 0;
801  }
802 
803 }
// Instantiate svec_extract/svec_insert overloads for every element type.
// 64-bit element forms need the epi64 intrinsics, which are only available
// on x86-64; 32-bit builds fall back to the memory-based generic macro.
804 INSERT_EXTRACT_SSEOPT(int8_t, epi8);
805 INSERT_EXTRACT_SSEOPT(uint8_t, epi8);
806 INSERT_EXTRACT_SSEOPT(int16_t, epi16);
807 INSERT_EXTRACT_SSEOPT(uint16_t, epi16);
808 INSERT_EXTRACT_SSEOPT(int32_t, epi32);
809 INSERT_EXTRACT_SSEOPT(uint32_t, epi32);
810 #ifdef __x86_64__
811 INSERT_EXTRACT_SSEOPT64(int64_t, epi64);
812 INSERT_EXTRACT_SSEOPT64(uint64_t, epi64);
813 #else
814 INSERT_EXTRACT_SSE(int64_t);
815 INSERT_EXTRACT_SSE(uint64_t);
816 #endif
817 INSERT_EXTRACT_SSE(float); //no intrinsics to insert/extract
818 INSERT_EXTRACT_SSE(double); //no intrinsics to insert/extract
819 
820 // 1. Load / Store
827 static FORCEINLINE svec<4,bool> svec_load(const svec<4,bool> *p) {
828  return svec<4,bool>(_mm_loadu_ps((float *)(&p->v)));
829 
830 }
831 
838 static FORCEINLINE void svec_store(svec<4,bool> *p, svec<4,bool> v) {
839  _mm_storeu_ps((float *)(&p->v), v.v);
840 }
// Load/store for the 8- and 16-bit element vectors. Only the low four
// elements of the register are meaningful, so loads rebuild the vector from
// four scalar reads (the constructor zeroes the upper lanes) and stores
// write exactly four elements back using constant-index extracts.
847 static FORCEINLINE svec<4,int8_t> svec_load(const svec<4,int8_t> *p) {
848  int8_t *ptr = (int8_t *)(&p->v);
849  return svec<4,int8_t>(ptr[0], ptr[1], ptr[2], ptr[3]);
850 }
851 
858 static FORCEINLINE void svec_store(svec<4,int8_t> *p, svec<4,int8_t> v) {
859  int8_t *ptr = (int8_t *)(&p->v);
860  ptr[0] = _mm_extract_epi8(v.v, 0);
861  ptr[1] = _mm_extract_epi8(v.v, 1);
862  ptr[2] = _mm_extract_epi8(v.v, 2);
863  ptr[3] = _mm_extract_epi8(v.v, 3);
864 }
871 static FORCEINLINE svec<4,uint8_t> svec_load(const svec<4,uint8_t> *p) {
872  uint8_t *ptr = (uint8_t *)(&p->v);
873  return svec<4,uint8_t>(ptr[0], ptr[1], ptr[2], ptr[3]);
874 }
875 
882 static FORCEINLINE void svec_store(svec<4,uint8_t> *p, svec<4,uint8_t> v) {
883  uint8_t *ptr = (uint8_t *)(&p->v);
884  ptr[0] = _mm_extract_epi8(v.v, 0);
885  ptr[1] = _mm_extract_epi8(v.v, 1);
886  ptr[2] = _mm_extract_epi8(v.v, 2);
887  ptr[3] = _mm_extract_epi8(v.v, 3);
888 }
895 static FORCEINLINE svec<4,int16_t> svec_load(const svec<4,int16_t> *p) {
896  int16_t *ptr = (int16_t *)(&p->v);
897  return svec<4,int16_t>(ptr[0], ptr[1], ptr[2], ptr[3]);
898 }
899 
906 static FORCEINLINE void svec_store(svec<4,int16_t> *p, svec<4,int16_t> v) {
907  int16_t *ptr = (int16_t *)(&p->v);
908  ptr[0] = _mm_extract_epi16(v.v, 0);
909  ptr[1] = _mm_extract_epi16(v.v, 1);
910  ptr[2] = _mm_extract_epi16(v.v, 2);
911  ptr[3] = _mm_extract_epi16(v.v, 3);
912 }
919 static FORCEINLINE svec<4,uint16_t> svec_load(const svec<4,uint16_t> *p) {
920  uint16_t *ptr = (uint16_t *)(&p->v);
921  return svec<4,uint16_t>(ptr[0], ptr[1], ptr[2], ptr[3]);
922 }
923 
930 static FORCEINLINE void svec_store(svec<4,uint16_t> *p, svec<4,uint16_t> v) {
931  uint16_t *ptr = (uint16_t *)(&p->v);
932  ptr[0] = _mm_extract_epi16(v.v, 0);
933  ptr[1] = _mm_extract_epi16(v.v, 1);
934  ptr[2] = _mm_extract_epi16(v.v, 2);
935  ptr[3] = _mm_extract_epi16(v.v, 3);
936 }
// Load/store for the full-width element types. 32-bit and float vectors use
// one unaligned 128-bit load/store; 64-bit and double vectors move both of
// their backing registers (v[0] = lanes 0..1, v[1] = lanes 2..3).
943 static FORCEINLINE svec<4,int32_t> svec_load(const svec<4,int32_t> *p) {
944  return svec<4,int32_t>(_mm_loadu_si128((__m128i *)(&p->v)));
945 }
946 
953 static FORCEINLINE void svec_store(svec<4,int32_t> *p, svec<4,int32_t> v) {
954  _mm_storeu_si128((__m128i *)(&p->v), v.v);
955 }
956 
963 static FORCEINLINE svec<4,uint32_t> svec_load(const svec<4,uint32_t> *p) {
964  return svec<4,uint32_t>(_mm_loadu_si128((__m128i *)(&p->v)));
965 }
966 
973 static FORCEINLINE void svec_store(svec<4,uint32_t> *p, svec<4,uint32_t> v) {
974  _mm_storeu_si128((__m128i *)(&p->v), v.v);
975 }
982 static FORCEINLINE svec<4,int64_t> svec_load(const svec<4,int64_t> *p) {
983  return svec<4,int64_t>(_mm_loadu_si128((__m128i *)(&p->v[0])),
984  _mm_loadu_si128((__m128i *)(&p->v[1])));
985 }
986 
993 static FORCEINLINE void svec_store(svec<4,int64_t> *p, svec<4,int64_t> v) {
994  _mm_storeu_si128((__m128i *)(&p->v[0]), v.v[0]);
995  _mm_storeu_si128((__m128i *)(&p->v[1]), v.v[1]);
996 }
997 
1004 static FORCEINLINE svec<4,uint64_t> svec_load(const svec<4,uint64_t> *p) {
1005  return svec<4,uint64_t>(_mm_loadu_si128((__m128i *)(&p->v[0])),
1006  _mm_loadu_si128((__m128i *)(&p->v[1])));
1007 }
1014 static FORCEINLINE void svec_store(svec<4,uint64_t> *p, svec<4,uint64_t> v) {
1015  _mm_storeu_si128((__m128i *)(&p->v[0]), v.v[0]);
1016  _mm_storeu_si128((__m128i *)(&p->v[1]), v.v[1]);
1017 }
1024 static FORCEINLINE svec<4,float> svec_load(const svec<4,float> *p) {
1025  return svec<4,float>(_mm_loadu_ps((float *)(&p->v)));
1026 }
1027 
1034 static FORCEINLINE void svec_store(svec<4,float> *p, svec<4,float> v) {
1035  _mm_storeu_ps((float *)(&p->v), v.v);
1036 }
1037 
1044 static FORCEINLINE svec<4,double> svec_load(const svec<4,double> *p) {
1045  return svec<4,double>(_mm_loadu_pd((double *)(&p->v[0])),
1046  _mm_loadu_pd((double *)(&p->v[1])));
1047 }
1048 
1055 static FORCEINLINE void svec_store(svec<4,double> *p, svec<4,double> v) {
1056  _mm_storeu_pd((double *)(&p->v[0]), v.v[0]);
1057  _mm_storeu_pd((double *)(&p->v[1]), v.v[1]);
1058 }
1059 
1060 // 3. Select
1061 
1062 
1071  return _mm_blendv_ps(b.v, a.v, mask.v);
1072 }
1073 
1079  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "select i8");
1080  return svec<4,int8_t>((_mm_extract_ps(mask.v, 0) != 0) ? _mm_extract_epi8(a.v, 0) :
1081  _mm_extract_epi8(b.v, 0),
1082  (_mm_extract_ps(mask.v, 1) != 0) ? _mm_extract_epi8(a.v, 1) :
1083  _mm_extract_epi8(b.v, 1),
1084  (_mm_extract_ps(mask.v, 2) != 0) ? _mm_extract_epi8(a.v, 2) :
1085  _mm_extract_epi8(b.v, 2),
1086  (_mm_extract_ps(mask.v, 3) != 0) ? _mm_extract_epi8(a.v, 3) :
1087  _mm_extract_epi8(b.v, 3));
1088 }
1089 
1095  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "select u8");
1096  return svec<4,uint8_t>((_mm_extract_ps(mask.v, 0) != 0) ? _mm_extract_epi8(a.v, 0) :
1097  _mm_extract_epi8(b.v, 0),
1098  (_mm_extract_ps(mask.v, 1) != 0) ? _mm_extract_epi8(a.v, 1) :
1099  _mm_extract_epi8(b.v, 1),
1100  (_mm_extract_ps(mask.v, 2) != 0) ? _mm_extract_epi8(a.v, 2) :
1101  _mm_extract_epi8(b.v, 2),
1102  (_mm_extract_ps(mask.v, 3) != 0) ? _mm_extract_epi8(a.v, 3) :
1103  _mm_extract_epi8(b.v, 3));
1104 }
1105 
1111  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "select i16");
1112  return svec<4,int16_t>((_mm_extract_ps(mask.v, 0) != 0) ? _mm_extract_epi16(a.v, 0) :
1113  _mm_extract_epi16(b.v, 0),
1114  (_mm_extract_ps(mask.v, 1) != 0) ? _mm_extract_epi16(a.v, 1) :
1115  _mm_extract_epi16(b.v, 1),
1116  (_mm_extract_ps(mask.v, 2) != 0) ? _mm_extract_epi16(a.v, 2) :
1117  _mm_extract_epi16(b.v, 2),
1118  (_mm_extract_ps(mask.v, 3) != 0) ? _mm_extract_epi16(a.v, 3) :
1119  _mm_extract_epi16(b.v, 3));
1120 }
1121 
1127  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "select u16");
1128  return svec<4,uint16_t>((_mm_extract_ps(mask.v, 0) != 0) ? _mm_extract_epi16(a.v, 0) :
1129  _mm_extract_epi16(b.v, 0),
1130  (_mm_extract_ps(mask.v, 1) != 0) ? _mm_extract_epi16(a.v, 1) :
1131  _mm_extract_epi16(b.v, 1),
1132  (_mm_extract_ps(mask.v, 2) != 0) ? _mm_extract_epi16(a.v, 2) :
1133  _mm_extract_epi16(b.v, 2),
1134  (_mm_extract_ps(mask.v, 3) != 0) ? _mm_extract_epi16(a.v, 3) :
1135  _mm_extract_epi16(b.v, 3));
1136 }
1137 
1143  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b.v),
1144  _mm_castsi128_ps(a.v), mask.v));
1145 }
1146 
1152  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b.v),
1153  _mm_castsi128_ps(a.v), mask.v));
1154 }
1155 
1161  __m128 m0 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(1, 1, 0, 0));
1162  __m128 m1 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(3, 3, 2, 2));
1163  __m128d m0d = _mm_castps_pd(m0);
1164  __m128d m1d = _mm_castps_pd(m1);
1165  __m128d r0 = _mm_blendv_pd(_mm_castsi128_pd(b.v[0]), _mm_castsi128_pd(a.v[0]), m0d);
1166  __m128d r1 = _mm_blendv_pd(_mm_castsi128_pd(b.v[1]), _mm_castsi128_pd(a.v[1]), m1d);
1167  return svec<4,int64_t>(_mm_castpd_si128(r0), _mm_castpd_si128(r1));
1168 }
1169 
1175  __m128 m0 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(1, 1, 0, 0));
1176  __m128 m1 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(3, 3, 2, 2));
1177  __m128d m0d = _mm_castps_pd(m0);
1178  __m128d m1d = _mm_castps_pd(m1);
1179  __m128d r0 = _mm_blendv_pd(_mm_castsi128_pd(b.v[0]), _mm_castsi128_pd(a.v[0]), m0d);
1180  __m128d r1 = _mm_blendv_pd(_mm_castsi128_pd(b.v[1]), _mm_castsi128_pd(a.v[1]), m1d);
1181  return svec<4,uint64_t>(_mm_castpd_si128(r0), _mm_castpd_si128(r1));
1182 }
1183 
1189  return _mm_blendv_ps(b.v, a.v, mask.v);
1190 }
1191 
1197  __m128 m0 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(1, 1, 0, 0));
1198  __m128 m1 = _mm_shuffle_ps(mask.v, mask.v, _MM_SHUFFLE(3, 3, 2, 2));
1199  __m128d m0d = _mm_castps_pd(m0);
1200  __m128d m1d = _mm_castps_pd(m1);
1201  __m128d r0 = _mm_blendv_pd(b.v[0], a.v[0], m0d);
1202  __m128d r1 = _mm_blendv_pd(b.v[1], a.v[1], m1d);
1203  return svec<4,double>(r0, r1);
1204 }
1205 
1217 
1218 // 4. broadcast/rotate/shuffle/smear/setzero
// Broadcast: replicate one lane (selected by index) across all four lanes.
// Single-register types extract the scalar with operator[] (from
// SUBSCRIPT_FUNC_DECL) and re-splat it; two-register 64-bit/double types go
// through their scalar splat constructor instead.
1219 static FORCEINLINE svec<4,int8_t> svec_broadcast(svec<4,int8_t> v, int index) {
1220  return _mm_set1_epi8(v[index]);
1221 }
1222 static FORCEINLINE svec<4,uint8_t> svec_broadcast(svec<4,uint8_t> v, int index) {
1223  return _mm_set1_epi8(v[index]);
1224 }
1225 static FORCEINLINE svec<4,int16_t> svec_broadcast(svec<4,int16_t> v, int index) {
1226  return _mm_set1_epi16(v[index]);
1227 }
1228 static FORCEINLINE svec<4,uint16_t> svec_broadcast(svec<4,uint16_t> v, int index) {
1229  return _mm_set1_epi16(v[index]);
1230 }
1231 static FORCEINLINE svec<4,int32_t> svec_broadcast(svec<4,int32_t> v, int index) {
1232  return _mm_set1_epi32(v[index]);
1233 }
1234 static FORCEINLINE svec<4,uint32_t> svec_broadcast(svec<4,uint32_t> v, int index) {
1235  return _mm_set1_epi32(v[index]);
1236 }
1237 
1238 static FORCEINLINE svec<4,int64_t> svec_broadcast(svec<4,int64_t> v, int index) {
1239  int64_t val = v[index];
1240  return svec<4,int64_t>(val);
1241 }
1242 static FORCEINLINE svec<4,uint64_t> svec_broadcast(svec<4,uint64_t> v, int index) {
1243  uint64_t val = v[index];
1244  return svec<4,uint64_t>(val);
1245 }
1246 
1247 static FORCEINLINE svec<4,float> svec_broadcast(svec<4,float> v, int index) {
1248  return _mm_set1_ps(v[index]);
1249 }
1250 static FORCEINLINE svec<4,double> svec_broadcast(svec<4,double> v, int index) {
1251  return svec<4,double>(_mm_set1_pd(v[index]),
1252  _mm_set1_pd(v[index]));
1253 }
1254 
1255 
// Instantiate the generic rotate and shuffle operations (macros from
// gsimd_utility.h; expansions not visible here) for every element type.
1256 ROTATE_L4(int8_t);
1257 ROTATE_L4(uint8_t);
1258 ROTATE_L4(int16_t);
1259 ROTATE_L4(uint16_t);
1260 ROTATE_L4(int32_t);
1261 ROTATE_L4(uint32_t);
1262 ROTATE_L4(int64_t);
1263 ROTATE_L4(uint64_t);
1264 ROTATE_L4(float);
1265 ROTATE_L4(double);
1266 
1267 SHUFFLES_L4(int8_t);
1268 SHUFFLES_L4(uint8_t);
1269 SHUFFLES_L4(int16_t);
1270 SHUFFLES_L4(uint16_t);
1271 SHUFFLES_L4(int32_t);
1272 SHUFFLES_L4(uint32_t);
1273 SHUFFLES_L4(int64_t);
1274 SHUFFLES_L4(uint64_t);
1275 SHUFFLES_L4(float);
1276 SHUFFLES_L4(double);
1277 
1278 
1279 //load const
// Defines two scalar-to-vector splat loaders per type:
//   svec_load_const     - reads one const element and splats it to all lanes;
//   svec_load_and_splat - same, but takes a non-const pointer.
// NOTE(review): svec_load_and_splat taking non-const STYPE* looks like an
// oversight (it never writes) — confirm before changing the signature.
1280 #define LOAD_CONST_SSE(STYPE) \
1281 template <class RetVecType> static RetVecType svec_load_const(const STYPE* p); \
1282 template<> \
1283  FORCEINLINE svec<LANES,STYPE> svec_load_const<svec<LANES,STYPE> >(const STYPE* p) { \
1284  return svec<LANES,STYPE>(*p); \
1285 } \
1286 template <class RetVecType> static RetVecType svec_load_and_splat(STYPE* p); \
1287 template<> \
1288 FORCEINLINE svec<LANES,STYPE> svec_load_and_splat<svec<LANES,STYPE> >(STYPE* p) { \
1289  return svec<LANES,STYPE>(*p);\
1290 }
1291 
// NOTE(review): the original listing's line numbers jump (1295/1297/1299),
// so instantiations for other element types were likely omitted from this
// capture — verify against the full header.
1295 LOAD_CONST_SSE(uint16_t);
1297 LOAD_CONST_SSE(uint32_t);
1299 LOAD_CONST_SSE(uint64_t);
1302 
1302 
1303 
1304 // 5. Gather / Scatter
// Pointer vector: four addresses stored as unsigned integers of the native
// pointer width — uint64_t lanes on 64-bit targets, uint32_t lanes on 32-bit.
1317 #if defined(__x86_64__) || defined(__PPC64__)
1318 template <>
1319  struct svec<4,void*> : public svec<4,uint64_t> {
// Build from four raw pointers, widening each to a 64-bit lane.
1324  FORCEINLINE svec(void* p0, void* p1, void* p2, void* p3):
1325  svec<4,uint64_t>((uint64_t)(p0),(uint64_t)(p1),(uint64_t)(p2),(uint64_t)(p3)){}
1326 };
1327 #else // 32-bit
1328 template <>
1329  struct svec<4,void*>: public svec<4,uint32_t>{
// Build from four raw pointers, storing each as a 32-bit lane.
1334  FORCEINLINE svec(void* p0, void* p1, void* p2, void* p3):
1335  svec<4,uint32_t>((uint32_t)(p0),(uint32_t)(p1),(uint32_t)(p2),(uint32_t)(p3)){}
1336 };
1337 #endif // __PPC64__
1338 
// Masked gathers addressed by 32-bit pointer lanes. Most element types come
// from GATHER_GENERAL_L4 (gsimd_utility.h); the 32-bit and float cases are
// hand-written: lanes whose mask bit is clear are left untouched in the
// (uninitialized) result vector, so only active lanes dereference memory.
1339 #ifndef DOXYGEN_SHOULD_SKIP_THIS //not want generate svec_gather*/svec_scatter methods
1340 
1341 template <class RetVecType> static RetVecType svec_gather(svec<4,uint32_t> ptrs, svec<4,bool> mask);
1342 
1343 GATHER_GENERAL_L4(int8_t, uint32_t);
1344 GATHER_GENERAL_L4(uint8_t, uint32_t);
1345 GATHER_GENERAL_L4(int16_t, uint32_t);
1346 GATHER_GENERAL_L4(uint16_t, uint32_t);
1347 //GATHER_GENERAL_L4(int32_t, uint32_t);
1348 template<>
1349 FORCEINLINE svec<4,int32_t> svec_gather<svec<4,int32_t> >(svec<4,uint32_t> ptrs, svec<4,bool> mask) {
1350  svec<4,int32_t> ret;
// Each lane: if the mask lane is set, load from that lane's address.
// Casting a uint32_t lane to a pointer is only valid on 32-bit targets.
1351  if(svec_extract(mask,0)) { svec_insert(&ret, 0, *((int32_t*)svec_extract(ptrs,0)));}
1352  if(svec_extract(mask,1)) { svec_insert(&ret, 1, *((int32_t*)svec_extract(ptrs,1)));}
1353  if(svec_extract(mask,2)) { svec_insert(&ret, 2, *((int32_t*)svec_extract(ptrs,2)));}
1354  if(svec_extract(mask,3)) { svec_insert(&ret, 3, *((int32_t*)svec_extract(ptrs,3)));}
1355  INC_STATS_NAME(STATS_GATHER_SLOW,1, "Gather general");
1356  return ret;
1357 }
1358 //GATHER_GENERAL_L4(uint32_t, uint32_t);
// uint32 gather reuses the int32 implementation and reinterprets the bits.
1359 template<>
1360 FORCEINLINE svec<4,uint32_t> svec_gather<svec<4,uint32_t> >(svec<4,uint32_t> ptrs, svec<4,bool> mask) {
1361  return svec<4,uint32_t>(svec_gather<svec<4,int32_t> >(ptrs, mask).v);
1362 }
1363 
1364 GATHER_GENERAL_L4(int64_t, uint32_t);
1365 GATHER_GENERAL_L4(uint64_t, uint32_t);
1366 //GATHER_GENERAL_L4(float, uint32_t);
// float gather reuses the int32 implementation with a bit-cast of the result.
1367 template<>
1368 FORCEINLINE svec<4,float> svec_gather<svec<4,float> >(svec<4,uint32_t> ptrs, svec<4,bool> mask) {
1369  return svec<4,float>(_mm_castsi128_ps(svec_gather<svec<4,int32_t> >(ptrs, mask).v));
1370 }
1371 GATHER_GENERAL_L4(double, uint32_t);
1372 
// Masked gathers addressed by 64-bit pointer lanes; same structure as the
// 32-bit-address variants above (int32/uint32/float hand-written, the rest
// from GATHER_GENERAL_L4). Inactive lanes of the result are left untouched.
1373 template <class RetVecType> static RetVecType svec_gather(svec<4,uint64_t> ptrs, svec<4,bool> mask);
1374 GATHER_GENERAL_L4(int8_t, uint64_t);
1375 GATHER_GENERAL_L4(int16_t, uint64_t);
1376 GATHER_GENERAL_L4(uint16_t, uint64_t);
1377 GATHER_GENERAL_L4(uint8_t, uint64_t);
1378 //GATHER_GENERAL_L4(int32_t, uint64_t);
1379 template<>
1380 FORCEINLINE svec<4,int32_t> svec_gather<svec<4,int32_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
1381  svec<4,int32_t> ret;
// Each uint64_t lane holds a full pointer value on 64-bit targets.
1382  if(svec_extract(mask,0)) { svec_insert(&ret, 0, *((int32_t*)svec_extract(ptrs,0)));}
1383  if(svec_extract(mask,1)) { svec_insert(&ret, 1, *((int32_t*)svec_extract(ptrs,1)));}
1384  if(svec_extract(mask,2)) { svec_insert(&ret, 2, *((int32_t*)svec_extract(ptrs,2)));}
1385  if(svec_extract(mask,3)) { svec_insert(&ret, 3, *((int32_t*)svec_extract(ptrs,3)));}
1386  INC_STATS_NAME(STATS_GATHER_SLOW,1, "Gather general");
1387  return ret;
1388 }
1389 //GATHER_GENERAL_L4(svec<4,uint32_t>, uint32_t, svec<4,uint64_t>, svec<4,bool>);
1390 template<>
1391 FORCEINLINE svec<4,uint32_t> svec_gather<svec<4,uint32_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
1392  return svec<4,uint32_t>(svec_gather<svec<4,int32_t> >(ptrs, mask).v);
1393 }
1394 GATHER_GENERAL_L4(int64_t, uint64_t);
1395 GATHER_GENERAL_L4(uint64_t, uint64_t);
1396 //GATHER_GENERAL_L4(float, uint64_t);
1397 template<>
1398 FORCEINLINE svec<4,float> svec_gather<svec<4,float> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
1399  return svec<4,float>(_mm_castsi128_ps(svec_gather<svec<4,int32_t> >(ptrs, mask).v));
1400 }
1401 GATHER_GENERAL_L4(double, uint64_t);
1402 
1403 
1404 
// Instantiate base+offset and strided gathers (macros from gsimd_utility.h)
// for every element type, with both 32- and 64-bit offset/stride widths.
1405 GATHER_BASE_OFFSETS_L4(int8_t, int32_t);
1406 GATHER_BASE_OFFSETS_L4(uint8_t, int32_t);
1407 GATHER_BASE_OFFSETS_L4(int16_t, int32_t);
1408 GATHER_BASE_OFFSETS_L4(uint16_t, int32_t);
1409 GATHER_BASE_OFFSETS_L4(int32_t, int32_t);
1410 GATHER_BASE_OFFSETS_L4(uint32_t, int32_t);
1411 GATHER_BASE_OFFSETS_L4(int64_t, int32_t);
1412 GATHER_BASE_OFFSETS_L4(uint64_t, int32_t);
1413 GATHER_BASE_OFFSETS_L4(float, int32_t);
1414 GATHER_BASE_OFFSETS_L4(double, int32_t);
1415 
1416 
1417 GATHER_BASE_OFFSETS_L4(int8_t, int64_t);
1418 GATHER_BASE_OFFSETS_L4(uint8_t, int64_t);
1419 GATHER_BASE_OFFSETS_L4(int16_t, int64_t);
1420 GATHER_BASE_OFFSETS_L4(uint16_t, int64_t);
1421 GATHER_BASE_OFFSETS_L4(int32_t, int64_t);
1422 GATHER_BASE_OFFSETS_L4(uint32_t, int64_t);
1423 GATHER_BASE_OFFSETS_L4(int64_t, int64_t);
1424 GATHER_BASE_OFFSETS_L4(uint64_t, int64_t);
1425 GATHER_BASE_OFFSETS_L4(float, int64_t);
1426 GATHER_BASE_OFFSETS_L4(double, int64_t);
1427 
1428 GATHER_STRIDE_L4(int8_t, int32_t);
1429 GATHER_STRIDE_L4(int8_t, int64_t);
1430 GATHER_STRIDE_L4(uint8_t, int32_t);
1431 GATHER_STRIDE_L4(uint8_t, int64_t);
1432 GATHER_STRIDE_L4(int16_t, int32_t);
1433 GATHER_STRIDE_L4(int16_t, int64_t);
1434 GATHER_STRIDE_L4(uint16_t, int32_t);
1435 GATHER_STRIDE_L4(uint16_t, int64_t);
1436 GATHER_STRIDE_L4(int32_t, int32_t);
1437 GATHER_STRIDE_L4(int32_t, int64_t);
1438 GATHER_STRIDE_L4(uint32_t, int32_t);
1439 GATHER_STRIDE_L4(uint32_t, int64_t);
1440 GATHER_STRIDE_L4(int64_t, int32_t);
1441 GATHER_STRIDE_L4(int64_t, int64_t);
1442 GATHER_STRIDE_L4(uint64_t, int32_t);
1443 GATHER_STRIDE_L4(uint64_t, int64_t);
1444 GATHER_STRIDE_L4(float, int32_t);
1445 GATHER_STRIDE_L4(float, int64_t);
1446 GATHER_STRIDE_L4(double, int32_t);
1447 GATHER_STRIDE_L4(double, int64_t);
1448 
1449 
// Instantiate general (pointer-vector), base+offset, and strided scatters
// (macros from gsimd_utility.h) for every element type and address width.
1450 SCATTER_GENERAL_L4(int8_t, uint32_t);
1451 SCATTER_GENERAL_L4(int8_t, uint64_t);
1452 SCATTER_GENERAL_L4(uint8_t, uint32_t);
1453 SCATTER_GENERAL_L4(uint8_t, uint64_t);
1454 SCATTER_GENERAL_L4(int16_t, uint32_t);
1455 SCATTER_GENERAL_L4(int16_t, uint64_t);
1456 SCATTER_GENERAL_L4(uint16_t, uint32_t);
1457 SCATTER_GENERAL_L4(uint16_t, uint64_t);
1458 SCATTER_GENERAL_L4(int32_t, uint32_t);
1459 SCATTER_GENERAL_L4(int32_t, uint64_t);
1460 SCATTER_GENERAL_L4(uint32_t, uint32_t);
1461 SCATTER_GENERAL_L4(uint32_t, uint64_t);
1462 SCATTER_GENERAL_L4(int64_t, uint32_t);
1463 SCATTER_GENERAL_L4(int64_t, uint64_t);
1464 SCATTER_GENERAL_L4(uint64_t, uint32_t);
1465 SCATTER_GENERAL_L4(uint64_t, uint64_t);
1466 SCATTER_GENERAL_L4(float, uint32_t);
1467 SCATTER_GENERAL_L4(float, uint64_t);
1468 SCATTER_GENERAL_L4(double, uint32_t);
1469 SCATTER_GENERAL_L4(double, uint64_t);
1470 
1471 
1472 SCATTER_BASE_OFFSETS_L4(int8_t, int32_t);
1473 SCATTER_BASE_OFFSETS_L4(int8_t, int64_t);
1474 SCATTER_BASE_OFFSETS_L4(uint8_t, int32_t);
1475 SCATTER_BASE_OFFSETS_L4(uint8_t, int64_t);
1476 SCATTER_BASE_OFFSETS_L4(int16_t, int32_t);
1477 SCATTER_BASE_OFFSETS_L4(int16_t, int64_t);
1478 SCATTER_BASE_OFFSETS_L4(uint16_t, int32_t);
1479 SCATTER_BASE_OFFSETS_L4(uint16_t, int64_t);
1480 SCATTER_BASE_OFFSETS_L4(int32_t, int32_t);
1481 SCATTER_BASE_OFFSETS_L4(int32_t, int64_t);
1482 SCATTER_BASE_OFFSETS_L4(uint32_t, int32_t);
1483 SCATTER_BASE_OFFSETS_L4(uint32_t, int64_t);
1484 SCATTER_BASE_OFFSETS_L4(int64_t, int32_t);
1485 SCATTER_BASE_OFFSETS_L4(int64_t, int64_t);
1486 SCATTER_BASE_OFFSETS_L4(uint64_t, int32_t);
1487 SCATTER_BASE_OFFSETS_L4(uint64_t, int64_t);
1488 SCATTER_BASE_OFFSETS_L4(float, int32_t);
1489 SCATTER_BASE_OFFSETS_L4(float, int64_t);
1490 SCATTER_BASE_OFFSETS_L4(double, int32_t);
1491 SCATTER_BASE_OFFSETS_L4(double, int64_t);
1492 
1493 SCATTER_STRIDE_L4(int8_t, int32_t);
1494 SCATTER_STRIDE_L4(int8_t, int64_t);
1495 SCATTER_STRIDE_L4(uint8_t, int32_t);
1496 SCATTER_STRIDE_L4(uint8_t, int64_t);
1497 SCATTER_STRIDE_L4(int16_t, int32_t);
1498 SCATTER_STRIDE_L4(int16_t, int64_t);
1499 SCATTER_STRIDE_L4(uint16_t, int32_t);
1500 SCATTER_STRIDE_L4(uint16_t, int64_t);
1501 SCATTER_STRIDE_L4(int32_t, int32_t);
1502 SCATTER_STRIDE_L4(int32_t, int64_t);
1503 SCATTER_STRIDE_L4(uint32_t, int32_t);
1504 SCATTER_STRIDE_L4(uint32_t, int64_t);
1505 SCATTER_STRIDE_L4(int64_t, int32_t);
1506 SCATTER_STRIDE_L4(int64_t, int64_t);
1507 SCATTER_STRIDE_L4(uint64_t, int32_t);
1508 SCATTER_STRIDE_L4(uint64_t, int64_t);
1509 SCATTER_STRIDE_L4(float, int32_t);
1510 SCATTER_STRIDE_L4(float, int64_t);
1511 SCATTER_STRIDE_L4(double, int32_t);
1512 SCATTER_STRIDE_L4(double, int64_t);
1513 
1514 
1515 #endif //DOXYGEN_SHOULD_SKIP_THIS
1516 
1517 
1518 // 5. masked load/masked store
1519 
1520 //Masked load/store is implemented based on gather_base_offsets/scatter_base_offsets
1521 //Here we only use offsets with 32bit
1522 
// Instantiate masked load/store for every element type; per the comments
// above, these are built on the 32-bit-offset gather/scatter primitives.
1523 MASKED_LOAD_STORE_L4(int8_t);
1524 MASKED_LOAD_STORE_L4(uint8_t);
1525 MASKED_LOAD_STORE_L4(int16_t);
1526 MASKED_LOAD_STORE_L4(uint16_t);
1527 MASKED_LOAD_STORE_L4(int32_t);
1528 MASKED_LOAD_STORE_L4(uint32_t);
1529 MASKED_LOAD_STORE_L4(int64_t);
1530 MASKED_LOAD_STORE_L4(uint64_t);
1531 MASKED_LOAD_STORE_L4(float);
1532 MASKED_LOAD_STORE_L4(double);
1533 
1535 //
1536 // Mask type (i1) interfaces
1537 //
1539 
1540 // 1. mask construction
1546 static FORCEINLINE bool svec_any_true(const svec<4,bool>& mask) {
1547  return (_mm_movemask_ps(mask.v)!=0);
1548 }
1549 
1555 static FORCEINLINE bool svec_all_true(const svec<4,bool>& mask) {
1556  return (_mm_movemask_ps(mask.v)==0xF);
1557 }
1558 
1559 
1565 static FORCEINLINE bool svec_none_true(const svec<4,bool>& mask) {
1566  return (_mm_movemask_ps(mask.v)==0);
1567 }
1568 
1569 // 2. bit operations
1570 
1574 static FORCEINLINE svec<4,bool> svec_and(svec<4,bool> a, svec<4,bool> b) {
1575  return _mm_and_ps(a.v, b.v);
1576 }
1577 
1578 
1582 static FORCEINLINE svec<4,bool> svec_or(svec<4,bool> a, svec<4,bool> b) {
1583  return _mm_or_ps(a.v, b.v);
1584 }
1585 
1589 static FORCEINLINE svec<4,bool> svec_xor(svec<4,bool> a, svec<4,bool> b) {
1590  return _mm_xor_ps(a.v, b.v);
1591 }
1592 
1596 static FORCEINLINE svec<4,bool> svec_not(svec<4,bool> a) {
1597  __m128 allon = _mm_castsi128_ps(_mm_set1_epi32(-1));
1598  return _mm_xor_ps(a.v, allon);
1599 }
1600 
1607 static FORCEINLINE uint64_t svec_movmsk(svec<4,bool> mask) {
1608  return (uint64_t)_mm_movemask_ps(mask.v);
1609 }
1610 
1611 
1613 //
1614 // General data operation interfaces
1615 //
1617 // 1. Unary
1618 
// Whole-register unary op: applies OP (an intrinsic or operator) directly to
// the single __m128-family payload.
#define UNARY_OP_OPT(STYPE, NAME, OP)\
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a) { \
 return OP(a.v); \
}

// Variant for element types stored as two half-registers (the 64-bit and
// double types): applies OP to each half independently.
#define UNARY_OP_OPT64(STYPE, NAME, OP)\
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a) { \
 return svec<LANES,STYPE>(OP(a.v[0]), OP(a.v[1])); \
}

// neg operation: 0 - a computes lane-wise negation (two's complement, so the
// same subtraction works for both the signed and unsigned variants).
static FORCEINLINE svec<4,int8_t> svec_neg(svec<4,int8_t> a) {
 return _mm_sub_epi8(_mm_setzero_si128(), (a.v));
}
static FORCEINLINE svec<4,uint8_t> svec_neg(svec<4,uint8_t> a) {
 return _mm_sub_epi8(_mm_setzero_si128(), (a.v));
}
static FORCEINLINE svec<4,int16_t> svec_neg(svec<4,int16_t> a) {
 return _mm_sub_epi16(_mm_setzero_si128(), (a.v));
}
static FORCEINLINE svec<4,uint16_t> svec_neg(svec<4,uint16_t> a) {
 return _mm_sub_epi16(_mm_setzero_si128(), (a.v));
}
static FORCEINLINE svec<4,int32_t> svec_neg(svec<4,int32_t> a) {
 return _mm_sub_epi32(_mm_setzero_si128(), (a.v));
}
static FORCEINLINE svec<4,uint32_t> svec_neg(svec<4,uint32_t> a) {
 return _mm_sub_epi32(_mm_setzero_si128(), (a.v));
}
// i64/f/d rely on the compiler's unary "-" for the vector payload.
UNARY_OP_OPT64(int64_t, svec_neg, -);
UNARY_OP_OPT64(uint64_t, svec_neg, -);
UNARY_OP_OPT(float, svec_neg, -);
UNARY_OP_OPT64(double, svec_neg, -);

// 2. Math unary
// round: to nearest integer, exceptions suppressed.
static FORCEINLINE svec<4,float> svec_round(svec<4,float> a) {
 return _mm_round_ps(a.v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
static FORCEINLINE svec<4,double> svec_round(svec<4,double> a) {
 return svec<4,double>(
 _mm_round_pd(a.v[0], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
 _mm_round_pd(a.v[1], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
}
//floor
UNARY_OP_OPT(float, svec_floor, _mm_floor_ps);
UNARY_OP_OPT64(double, svec_floor, _mm_floor_pd);
//ceil
UNARY_OP_OPT(float, svec_ceil, _mm_ceil_ps);
UNARY_OP_OPT64(double, svec_ceil, _mm_ceil_pd);
1673 //reverse 1/
1674 static FORCEINLINE svec<4,float> svec_rcp(svec<4,float> v) {
1675  __m128 rcp = _mm_rcp_ps(v.v);
1676  // N-R iteration:
1677  __m128 m = _mm_mul_ps(v.v, rcp);
1678  __m128 twominus = _mm_sub_ps(_mm_set1_ps(2.f), m);
1679  __m128 r = _mm_mul_ps(rcp, twominus);
1680  return r;
1681 }
// Double precision has no rcp intrinsic; fall back to a per-lane scalar 1.0/x.
UNARY_OP_L4(double, svec_rcp, 1.0/);
//reverse sqrt
1684 static FORCEINLINE svec<4,float> svec_rsqrt(svec<4,float> v) {
1685  __m128 rsqrt = _mm_rsqrt_ps(v.v);
1686  // Newton-Raphson iteration to improve precision
1687  // return 0.5 * rsqrt * (3. - (v * rsqrt) * rsqrt);
1688  __m128 v_rsqrt = _mm_mul_ps(rsqrt, v.v);
1689  __m128 v_r_r = _mm_mul_ps(v_rsqrt, rsqrt);
1690  __m128 three_sub = _mm_sub_ps(_mm_set1_ps(3.f), v_r_r);
1691  __m128 rs_mul = _mm_mul_ps(rsqrt, three_sub);
1692  __m128 half_scale = _mm_mul_ps(_mm_set1_ps(0.5), rs_mul);
1693  return half_scale;
1694 }
// Double precision has no rsqrt intrinsic; per-lane scalar 1.0/sqrt(x).
UNARY_OP_L4(double, svec_rsqrt, 1.0/sqrt);
//sqrt
UNARY_OP_OPT(float, svec_sqrt, _mm_sqrt_ps);
UNARY_OP_OPT64(double, svec_sqrt, _mm_sqrt_pd);
// exp/log: _mm_exp_* / _mm_log_* exist only in ICC's SVML, not in gcc, so
// fall back to per-lane libm calls.
UNARY_OP_L4(float, svec_exp, expf);
UNARY_OP_L4(double, svec_exp, exp);
UNARY_OP_L4(float, svec_log, logf);
UNARY_OP_L4(double, svec_log, log);
// abs: per-lane template abs for signed integers; the identity for unsigned.
UNARY_OP_L4(int8_t, svec_abs, abs<int8_t>);
static FORCEINLINE svec<4,uint8_t> svec_abs(svec<4,uint8_t> v) { return v;}
UNARY_OP_L4(int16_t, svec_abs, abs<int16_t>);
static FORCEINLINE svec<4,uint16_t> svec_abs(svec<4,uint16_t> v) { return v;}
UNARY_OP_L4(int32_t, svec_abs, abs<int32_t>);
static FORCEINLINE svec<4,uint32_t> svec_abs(svec<4,uint32_t> v) { return v;}
UNARY_OP_L4(int64_t, svec_abs, abs<int64_t>);
static FORCEINLINE svec<4,uint64_t> svec_abs(svec<4,uint64_t> v) { return v;}
//UNARY_OP(float, svec_abs, abs);
1715 static FORCEINLINE svec<4,float> svec_abs(svec<4,float> v) {
1716  unsigned int x = 0x7fffffff;
1717  float &f = * (float *)( &x );
1718  __m128 tmp = _mm_set1_ps(f);
1719  return _mm_and_ps(v.v, tmp);
1720 }
// Per-lane scalar abs for doubles (resolves to the library's template abs).
UNARY_OP_L4(double, svec_abs, abs);

// 3. Binary

// Whole-register binary op via intrinsic FUNC.  STYPE2 lets the second
// operand be a different element type (used by the shift operators).
#define BINARY_OP_OPT_FUNC(STYPE, STYPE2, NAME, FUNC) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE2> b) { \
 return svec<LANES,STYPE>(FUNC(a.v, b.v)); \
}

// Two-half-register variant.  NOTE(review): unlike the macro above, STYPE2 is
// accepted but not used in the signature — both operands are svec<LANES,STYPE>.
#define BINARY_OP_OPT_FUNC64(STYPE, STYPE2, NAME, FUNC) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
 return svec<LANES,STYPE>(FUNC(a.v[0], b.v[0]), FUNC(a.v[1], b.v[1])); \
}
// add, sub, mul, div instantiations.  8/16/32-bit and float types use a
// single-register intrinsic; 64-bit and double types use the two-half macro;
// types with no suitable intrinsic fall back to the per-lane scalar macro.

//add
BINARY_OP_OPT_FUNC(int8_t, int8_t, svec_add, _mm_add_epi8);
BINARY_OP_OPT_FUNC(uint8_t, uint8_t, svec_add, _mm_add_epi8);
BINARY_OP_OPT_FUNC(int16_t, int16_t, svec_add, _mm_add_epi16);
BINARY_OP_OPT_FUNC(uint16_t, uint16_t, svec_add, _mm_add_epi16);
BINARY_OP_OPT_FUNC(int32_t, int32_t, svec_add, _mm_add_epi32);
BINARY_OP_OPT_FUNC(uint32_t, uint32_t, svec_add, _mm_add_epi32);
BINARY_OP_OPT_FUNC64(int64_t, int64_t, svec_add, _mm_add_epi64);
BINARY_OP_OPT_FUNC64(uint64_t, uint64_t, svec_add, _mm_add_epi64);
BINARY_OP_OPT_FUNC(float, float, svec_add, _mm_add_ps);
BINARY_OP_OPT_FUNC64(double, double, svec_add, _mm_add_pd);

//sub
BINARY_OP_OPT_FUNC(int8_t, int8_t, svec_sub, _mm_sub_epi8);
BINARY_OP_OPT_FUNC(uint8_t, uint8_t, svec_sub, _mm_sub_epi8);
BINARY_OP_OPT_FUNC(int16_t, int16_t, svec_sub, _mm_sub_epi16);
BINARY_OP_OPT_FUNC(uint16_t, uint16_t, svec_sub, _mm_sub_epi16);
BINARY_OP_OPT_FUNC(int32_t, int32_t, svec_sub, _mm_sub_epi32);
BINARY_OP_OPT_FUNC(uint32_t, uint32_t, svec_sub, _mm_sub_epi32);
BINARY_OP_OPT_FUNC64(int64_t, int64_t, svec_sub, _mm_sub_epi64);
BINARY_OP_OPT_FUNC64(uint64_t, uint64_t, svec_sub, _mm_sub_epi64);
BINARY_OP_OPT_FUNC(float, float, svec_sub, _mm_sub_ps);
BINARY_OP_OPT_FUNC64(double, double, svec_sub, _mm_sub_pd);

// mul: only 32-bit lanes have a low-half multiply intrinsic (_mm_mullo_epi32,
// SSE4.1); the rest go through the scalar macro.
BINARY_OP_L4(int8_t, svec_mul, *);
BINARY_OP_L4(uint8_t, svec_mul, *);
BINARY_OP_L4(int16_t, svec_mul, *);
BINARY_OP_L4(uint16_t, svec_mul, *);
BINARY_OP_OPT_FUNC(int32_t, int32_t, svec_mul, _mm_mullo_epi32);
BINARY_OP_OPT_FUNC(uint32_t, uint32_t, svec_mul, _mm_mullo_epi32);
BINARY_OP_L4(int64_t, svec_mul, *);
BINARY_OP_L4(uint64_t, svec_mul, *);
BINARY_OP_OPT_FUNC(float, float, svec_mul, _mm_mul_ps);
BINARY_OP_OPT_FUNC64(double, double, svec_mul, _mm_mul_pd);

// div: no _mm_idiv_epi32 / _mm_udiv_epi32 exist, so all integer divides are
// per-lane scalar.
BINARY_OP_L4(int8_t, svec_div, /);
BINARY_OP_L4(uint8_t, svec_div, /);
BINARY_OP_L4(int16_t, svec_div, /);
BINARY_OP_L4(uint16_t, svec_div, /);
BINARY_OP_L4(int32_t, svec_div, /);
BINARY_OP_L4(uint32_t, svec_div, /);
BINARY_OP_L4(int64_t, svec_div, /);
BINARY_OP_L4(uint64_t, svec_div, /);
BINARY_OP_OPT_FUNC(float, float, svec_div, _mm_div_ps);
BINARY_OP_OPT_FUNC64(double, double, svec_div, _mm_div_pd);
1784 
// Vector-with-scalar arithmetic: each op broadcasts the scalar into a full
// vector and forwards to the vector-vector overload above.
#define BIN_VEC_SCAL(STYPE) \
static FORCEINLINE svec<LANES,STYPE> svec_add_scalar(svec<LANES,STYPE> a, STYPE s) { \
 return svec_add(a, svec<LANES,STYPE>(s)); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_scalar_add(STYPE s, svec<LANES,STYPE> a) { \
 return svec_add(svec<LANES,STYPE>(s), a); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_sub_scalar(svec<LANES,STYPE> a, STYPE s) { \
 return svec_sub(a, svec<LANES,STYPE>(s)); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_scalar_sub(STYPE s, svec<LANES,STYPE> a) { \
 return svec_sub(svec<LANES,STYPE>(s), a); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_mul_scalar(svec<LANES,STYPE> a, STYPE s) { \
 return svec_mul(a, svec<LANES,STYPE>(s)); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_scalar_mul(STYPE s, svec<LANES,STYPE> a) { \
 return svec_mul(svec<LANES,STYPE>(s), a); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_div_scalar(svec<LANES,STYPE> a, STYPE s) { \
 return svec_div(a, svec<LANES,STYPE>(s)); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_scalar_div(STYPE s, svec<LANES,STYPE> a) { \
 return svec_div(svec<LANES,STYPE>(s), a); \
} \

BIN_VEC_SCAL(int8_t);
BIN_VEC_SCAL(uint8_t);
BIN_VEC_SCAL(int16_t);
BIN_VEC_SCAL(uint16_t);
BIN_VEC_SCAL(int32_t);
BIN_VEC_SCAL(uint32_t);
BIN_VEC_SCAL(int64_t);
BIN_VEC_SCAL(uint64_t);
BIN_VEC_SCAL(float);
BIN_VEC_SCAL(double);
1821 
1822 
// Integer-only bitwise ops (whole-register) plus remainder (per-lane scalar —
// there is no SIMD integer modulo).
#define INT_BINARY_OP_METHODS(STYPE) \
BINARY_OP_OPT_FUNC(STYPE, STYPE, svec_or, _mm_or_si128); \
BINARY_OP_OPT_FUNC(STYPE, STYPE, svec_and, _mm_and_si128); \
BINARY_OP_OPT_FUNC(STYPE, STYPE, svec_xor, _mm_xor_si128); \
BINARY_OP_L4(STYPE, svec_rem, %); \
BINARY_OP_SCALAR_L4(STYPE, STYPE, svec_rem, %);

// Same set for the two-half-register 64-bit types.
#define INT_BINARY_OP_METHODS64(STYPE) \
BINARY_OP_OPT_FUNC64(STYPE, STYPE, svec_or, _mm_or_si128); \
BINARY_OP_OPT_FUNC64(STYPE, STYPE, svec_and, _mm_and_si128); \
BINARY_OP_OPT_FUNC64(STYPE, STYPE, svec_xor, _mm_xor_si128); \
BINARY_OP_L4(STYPE, svec_rem, %); \
BINARY_OP_SCALAR_L4(STYPE, STYPE, svec_rem, %);




INT_BINARY_OP_METHODS(int8_t);
INT_BINARY_OP_METHODS(uint8_t);
INT_BINARY_OP_METHODS(int16_t);
INT_BINARY_OP_METHODS(uint16_t);
INT_BINARY_OP_METHODS(int32_t);
INT_BINARY_OP_METHODS(uint32_t);
INT_BINARY_OP_METHODS64(int64_t);
INT_BINARY_OP_METHODS64(uint64_t);
1848 
1849 
// pow: no _mm_pow_ps/pd in gcc, so per-lane libm calls.
BINARY_OP_FUNC(float, svec_pow, powf);
BINARY_OP_FUNC(double, svec_pow, pow);

// Vector-by-vector shifts: each lane shifted by the corresponding lane of the
// (unsigned) count vector, via the per-lane scalar macro.
BINARY_OP2_L4(int8_t, uint8_t, svec_shl, <<);
BINARY_OP2_L4(uint8_t, uint8_t, svec_shl, <<);
BINARY_OP2_L4(int16_t, uint16_t, svec_shl, <<);
BINARY_OP2_L4(uint16_t, uint16_t, svec_shl, <<);
BINARY_OP2_L4(int32_t, uint32_t, svec_shl, <<);
BINARY_OP2_L4(uint32_t, uint32_t, svec_shl, <<);
BINARY_OP2_L4(int64_t, uint64_t, svec_shl, <<);
BINARY_OP2_L4(uint64_t, uint64_t, svec_shl, <<);

//shift right
BINARY_OP2_L4(int8_t, uint8_t, svec_shr, >>);
BINARY_OP2_L4(uint8_t, uint8_t, svec_shr, >>);
BINARY_OP2_L4(int16_t, uint16_t, svec_shr, >>);
BINARY_OP2_L4(uint16_t, uint16_t, svec_shr, >>);
BINARY_OP2_L4(int32_t, uint32_t, svec_shr, >>);
BINARY_OP2_L4(uint32_t, uint32_t, svec_shr, >>);
BINARY_OP2_L4(int64_t, uint64_t, svec_shr, >>);
BINARY_OP2_L4(uint64_t, uint64_t, svec_shr, >>);
1873 
1874 // shift scalar left
1875 BINARY_OP_SCALAR_L4(int8_t, int32_t, svec_shl, <<);
1876 BINARY_OP_SCALAR_L4(uint8_t, int32_t, svec_shl, <<);
1877 //BINARY_OP_SCALAR_L4(int16_t, int32_t, svec_shl, <<);
1878 static FORCEINLINE svec<4,int16_t> svec_shl(svec<4,int16_t> a, int32_t s) {
1879  return svec<4,int16_t>(_mm_sll_epi16(a.v, _mm_set_epi32(0, 0, 0, s))); \
1880 }
1881 
1882 //BINARY_OP_SCALAR_L4(uint16_t, int32_t, svec_shl, <<);
1883 static FORCEINLINE svec<4,uint16_t> svec_shl(svec<4,uint16_t> a, int32_t s) {
1884  return svec<4,uint16_t>(_mm_sll_epi16(a.v, _mm_set_epi32(0, 0, 0, s))); \
1885 }
1886 //BINARY_OP_SCALAR_L4(int32_t, int32_t, svec_shl, <<);
1887 static FORCEINLINE svec<4,int32_t> svec_shl(svec<4,int32_t> a, int32_t s) {
1888  return svec<4,int32_t>(_mm_sll_epi32(a.v, _mm_set_epi32(0, 0, 0, s))); \
1889 }
1890 //BINARY_OP_SCALAR_L4(uint32_t, int32_t, svec_shl, <<);
1891 static FORCEINLINE svec<4,uint32_t> svec_shl(svec<4,uint32_t> a, int32_t s) {
1892  return svec<4,uint32_t>(_mm_sll_epi32(a.v, _mm_set_epi32(0, 0, 0, s))); \
1893 }
1894 //BINARY_OP_SCALAR_L4(int64_t, int32_t, svec_shl, <<);
1895 static FORCEINLINE svec<4,int64_t> svec_shl(svec<4,int64_t> a, int32_t s) {
1896  __m128i amt = _mm_set_epi32(0, 0, 0, s);
1897  return svec<4,int64_t>(_mm_sll_epi64(a.v[0], amt),
1898  _mm_sll_epi64(a.v[1], amt));
1899 }
1900 //BINARY_OP_SCALAR_L4(uint64_t, int32_t, svec_shl, <<);
1901 static FORCEINLINE svec<4,uint64_t> svec_shl(svec<4,uint64_t> a, int32_t s) {
1902  __m128i amt = _mm_set_epi32(0, 0, 0, s);
1903  return svec<4,uint64_t>(_mm_sll_epi64(a.v[0], amt),
1904  _mm_sll_epi64(a.v[1], amt));
1905 }
1906 
1907 //shift sclar right
1908 BINARY_OP_SCALAR_L4(int8_t, int32_t, svec_shr, >>);
1909 BINARY_OP_SCALAR_L4(uint8_t, int32_t, svec_shr, >>);
1910 //BINARY_OP_SCALAR_L4(int16_t, int32_t, svec_shr, >>);
1911 static FORCEINLINE svec<4,int16_t> svec_shr(svec<4,int16_t> a, int32_t s) {
1912  return svec<4,int16_t>(_mm_sra_epi16(a.v, _mm_set_epi32(0, 0, 0, s))); \
1913 }
1914 //BINARY_OP_SCALAR_L4(uint16_t, int32_t, svec_shr, >>);
1915 static FORCEINLINE svec<4,uint16_t> svec_shr(svec<4,uint16_t> a, int32_t s) {
1916  return svec<4,uint16_t>(_mm_srl_epi16(a.v, _mm_set_epi32(0, 0, 0, s))); \
1917 }
1918 //BINARY_OP_SCALAR_L4(int32_t, int32_t, svec_shr, >>);
1919 static FORCEINLINE svec<4,int32_t> svec_shr(svec<4,int32_t> a, int32_t s) {
1920  return svec<4,int32_t>(_mm_sra_epi32(a.v, _mm_set_epi32(0, 0, 0, s))); \
1921 }
1922 //BINARY_OP_SCALAR_L4(uint32_t, int32_t, svec_shr, >>);
1923 static FORCEINLINE svec<4,uint32_t> svec_shr(svec<4,uint32_t> a, int32_t s) {
1924  return svec<4,uint32_t>(_mm_srl_epi32(a.v, _mm_set_epi32(0, 0, 0, s))); \
1925 }
1926 BINARY_OP_SCALAR_L4(int64_t, int32_t, svec_shr, >>);
1927 //BINARY_OP_SCALAR_L4(uint64_t, int32_t, svec_shr, >>);
1928 static FORCEINLINE svec<4,uint64_t> svec_shr(svec<4,uint64_t> a, int32_t s) {
1929  __m128i amt = _mm_set_epi32(0, 0, 0, s);
1930  return svec<4,uint64_t>(_mm_srl_epi64(a.v[0], amt),
1931  _mm_srl_epi64(a.v[1], amt));
1932 }
1933 
// 4. Ternary

// Fused-style multiply-add helpers built from the overloaded operators:
// madd = a*b + c, msub = a*b - c, nmsub = c - a*b.  (Macro name "TERNERY" is
// a historical typo kept for source compatibility.)
#define TERNERY_OPT(STYPE) \
 \
FORCEINLINE svec<LANES,STYPE> svec_madd(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
 return a * b + c;\
} \
 \
FORCEINLINE svec<LANES,STYPE> svec_msub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
 return a * b - c;\
} \
 \
FORCEINLINE svec<LANES,STYPE> svec_nmsub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
 return c - a * b ;\
}


TERNERY_OPT(int32_t);
TERNERY_OPT(uint32_t);
TERNERY_OPT(int64_t);
TERNERY_OPT(uint64_t);
// NOTE(review): a gap in the generated listing suggests a float instantiation
// may have been elided between uint64_t and double — confirm in the full source.
TERNERY_OPT(double);
1964 
1965 
// 5. Max/Min: 32-bit and float lanes have direct intrinsics (SSE4.1 for the
// epi32/epu32 forms); other widths use the per-lane scalar template.
BINARY_OP_FUNC_L4(int8_t, svec_max, max<int8_t>);
BINARY_OP_FUNC_L4(uint8_t, svec_max, max<uint8_t>);
BINARY_OP_FUNC_L4(int16_t, svec_max, max<int16_t>);
BINARY_OP_FUNC_L4(uint16_t, svec_max, max<uint16_t>);
BINARY_OP_OPT_FUNC(int32_t, int32_t, svec_max, _mm_max_epi32);
BINARY_OP_OPT_FUNC(uint32_t, uint32_t, svec_max, _mm_max_epu32);
BINARY_OP_FUNC_L4(int64_t, svec_max, max<int64_t>);
BINARY_OP_FUNC_L4(uint64_t, svec_max, max<uint64_t>);
BINARY_OP_OPT_FUNC(float, float, svec_max, _mm_max_ps);
BINARY_OP_OPT_FUNC64(double, double, svec_max, _mm_max_pd);

BINARY_OP_FUNC_L4(int8_t, svec_min, min<int8_t>);
BINARY_OP_FUNC_L4(uint8_t, svec_min, min<uint8_t>);
BINARY_OP_FUNC_L4(int16_t, svec_min, min<int16_t>);
BINARY_OP_FUNC_L4(uint16_t, svec_min, min<uint16_t>);
BINARY_OP_OPT_FUNC(int32_t, int32_t, svec_min, _mm_min_epi32);
BINARY_OP_OPT_FUNC(uint32_t, uint32_t, svec_min, _mm_min_epu32);
BINARY_OP_FUNC_L4(int64_t, svec_min, min<int64_t>);
BINARY_OP_FUNC_L4(uint64_t, svec_min, min<uint64_t>);
BINARY_OP_OPT_FUNC(float, float, svec_min, _mm_min_ps);
BINARY_OP_OPT_FUNC64(double, double, svec_min, _mm_min_pd);



// 6. Reduce: horizontal add/max/min across the four lanes, built on the
// per-lane reduction macro with the scalar add/max/min templates.
#define MAX_MIN_REDUCE_METHODS(STYPE) \
BINARY_OP_REDUCE_FUNC(STYPE, svec_reduce_add, add<STYPE>); \
BINARY_OP_REDUCE_FUNC(STYPE, svec_reduce_max, max<STYPE>); \
BINARY_OP_REDUCE_FUNC(STYPE, svec_reduce_min, min<STYPE>); \


MAX_MIN_REDUCE_METHODS(int8_t);
MAX_MIN_REDUCE_METHODS(uint8_t);
MAX_MIN_REDUCE_METHODS(int16_t);
MAX_MIN_REDUCE_METHODS(uint16_t);
MAX_MIN_REDUCE_METHODS(int32_t);
MAX_MIN_REDUCE_METHODS(uint32_t);
MAX_MIN_REDUCE_METHODS(int64_t);
MAX_MIN_REDUCE_METHODS(uint64_t);
MAX_MIN_REDUCE_METHODS(float);
MAX_MIN_REDUCE_METHODS(double);
2009 
2011  __m128 s0 = _mm_hadd_ps(v0.v,v1.v);
2012  __m128 s1 = _mm_hadd_ps(v2.v,v3.v);
2013  __m128 s = _mm_hadd_ps(s0, s1);
2014  return svec<LANES,float>(s);
2015 }
2016 
2018  __m128d s00 = _mm_add_pd(v0.v[0], v0.v[1]);
2019  __m128d s01 = _mm_add_pd(v1.v[0], v1.v[1]);
2020  __m128d s02 = _mm_add_pd(v2.v[0], v2.v[1]);
2021  __m128d s03 = _mm_add_pd(v3.v[0], v3.v[1]);
2022 
2023  __m128d s0 = _mm_hadd_pd(s00, s01);
2024  __m128d s1 = _mm_hadd_pd(s02, s03);
2025 
2026  return svec<4,double>(s0, s1);
2027 }
2028 
2029 // 7. Compare
2036 static FORCEINLINE svec<4,bool> svec_equal(svec<4,bool> a, svec<4,bool> b) {
2037  return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(a.v), _mm_castps_si128(b.v)));
2038 }
2039 
2040 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,bool> a, svec<4,bool> b) {
2041  return svec_not(svec_equal(a, b));
2042 }
2043 
2044 static FORCEINLINE svec<4,bool> svec_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
2045  __m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
2046  return svec<4,bool>(_mm_extract_epi8(cmp, 0),
2047  _mm_extract_epi8(cmp, 1),
2048  _mm_extract_epi8(cmp, 2),
2049  _mm_extract_epi8(cmp, 3));
2050 }
2051 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
2052  return ~(a == b);
2053 }
2054 
2055 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,int8_t> a, svec<4,int8_t> b) {
2056  __m128i cmp = _mm_cmplt_epi8(a.v, b.v);
2057  return svec<4,bool>(_mm_extract_epi8(cmp, 0),
2058  _mm_extract_epi8(cmp, 1),
2059  _mm_extract_epi8(cmp, 2),
2060  _mm_extract_epi8(cmp, 3));
2061 }
2062 
2063 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
2064  return (a < b) | (a == b);
2065 }
2066 
2067 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,int8_t> a, svec<4,int8_t> b) {
2068  __m128i cmp = _mm_cmpgt_epi8(a.v, b.v);
2069  return svec<4,bool>(_mm_extract_epi8(cmp, 0),
2070  _mm_extract_epi8(cmp, 1),
2071  _mm_extract_epi8(cmp, 2),
2072  _mm_extract_epi8(cmp, 3));
2073 }
2074 
2075 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
2076  return (a > b) | (a == b);
2077 }
2079 
2080 static FORCEINLINE svec<4,bool> svec_equal(svec<4,uint8_t> a, svec<4,uint8_t> b) {
2081  __m128i cmp = _mm_cmpeq_epi8(a.v, b.v);
2082  return svec<4,bool>(_mm_extract_epi8(cmp, 0),
2083  _mm_extract_epi8(cmp, 1),
2084  _mm_extract_epi8(cmp, 2),
2085  _mm_extract_epi8(cmp, 3));
2086 }
2087 
2088 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint8_t> a, svec<4,uint8_t> b) {
2089  return ~(a == b);
2090 }
2091 
2092 CMP_OP_L4(uint8_t, less_than, <);
2093 CMP_OP_L4(uint8_t, less_equal, <=);
2094 CMP_OP_L4(uint8_t, greater_than, >);
2095 CMP_OP_L4(uint8_t, greater_equal, >=);
2097 
2098 
2099 static FORCEINLINE svec<4,bool> svec_equal(svec<4,int16_t> a, svec<4,int16_t> b) {
2100  __m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
2101  return svec<4,bool>(_mm_extract_epi16(cmp, 0),
2102  _mm_extract_epi16(cmp, 1),
2103  _mm_extract_epi16(cmp, 2),
2104  _mm_extract_epi16(cmp, 3));
2105 }
2106 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int16_t> a, svec<4,int16_t> b) {
2107  return ~(a == b);
2108 }
2109 
2110 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,int16_t> a, svec<4,int16_t> b) {
2111  __m128i cmp = _mm_cmplt_epi16(a.v, b.v);
2112  return svec<4,bool>(_mm_extract_epi16(cmp, 0),
2113  _mm_extract_epi16(cmp, 1),
2114  _mm_extract_epi16(cmp, 2),
2115  _mm_extract_epi16(cmp, 3));
2116 }
2117 
2118 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,int16_t> a, svec<4,int16_t> b) {
2119  return (a < b) | (a == b);
2120 }
2121 
2122 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,int16_t> a, svec<4,int16_t> b) {
2123  __m128i cmp = _mm_cmpgt_epi16(a.v, b.v);
2124  return svec<4,bool>(_mm_extract_epi16(cmp, 0),
2125  _mm_extract_epi16(cmp, 1),
2126  _mm_extract_epi16(cmp, 2),
2127  _mm_extract_epi16(cmp, 3));
2128 }
2129 
2130 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,int16_t> a, svec<4,int16_t> b) {
2131  return (a > b) | (a == b);
2132 }
2134 
2136  __m128i cmp = _mm_cmpeq_epi16(a.v, b.v);
2137  return svec<4,bool>(_mm_extract_epi16(cmp, 0),
2138  _mm_extract_epi16(cmp, 1),
2139  _mm_extract_epi16(cmp, 2),
2140  _mm_extract_epi16(cmp, 3));
2141 }
2142 
2143 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint16_t> a, svec<4,uint16_t> b) {
2144  return ~(a == b);
2145 }
2146 
2147 CMP_OP_L4(uint16_t, less_than, <);
2148 CMP_OP_L4(uint16_t, less_equal, <=);
2149 CMP_OP_L4(uint16_t, greater_than, >);
2150 CMP_OP_L4(uint16_t, greater_equal, >=);
2152 
2153 
2154 static FORCEINLINE svec<4,bool> svec_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2155  __m128i cmp = _mm_cmpeq_epi32(a.v, b.v);
2156  return svec<4,bool>(_mm_extract_epi32(cmp, 0),
2157  _mm_extract_epi32(cmp, 1),
2158  _mm_extract_epi32(cmp, 2),
2159  _mm_extract_epi32(cmp, 3));
2160 }
2161 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2162  return ~(a == b);
2163 }
2164 
2165 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,int32_t> a, svec<4,int32_t> b) {
2166  __m128i cmp = _mm_cmplt_epi32(a.v, b.v);
2167  return svec<4,bool>(_mm_extract_epi32(cmp, 0),
2168  _mm_extract_epi32(cmp, 1),
2169  _mm_extract_epi32(cmp, 2),
2170  _mm_extract_epi32(cmp, 3));
2171 }
2172 
2173 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2174  return (a < b) | (a == b);
2175 }
2176 
2177 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,int32_t> a, svec<4,int32_t> b) {
2178  __m128i cmp = _mm_cmpgt_epi32(a.v, b.v);
2179  return svec<4,bool>(_mm_extract_epi32(cmp, 0),
2180  _mm_extract_epi32(cmp, 1),
2181  _mm_extract_epi32(cmp, 2),
2182  _mm_extract_epi32(cmp, 3));
2183 }
2184 
2185 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2186  return (a > b) | (a == b);
2187 }
2189 
2191  __m128i cmp = _mm_cmpeq_epi32(a.v, b.v);
2192  return svec<4,bool>(_mm_extract_epi32(cmp, 0),
2193  _mm_extract_epi32(cmp, 1),
2194  _mm_extract_epi32(cmp, 2),
2195  _mm_extract_epi32(cmp, 3));
2196 }
2197 
2198 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint32_t> a, svec<4,uint32_t> b) {
2199  return ~(a == b);
2200 }
2201 
2202 CMP_OP_L4(uint32_t, less_than, <);
2203 CMP_OP_L4(uint32_t, less_equal, <=);
2204 CMP_OP_L4(uint32_t, greater_than, >);
2205 CMP_OP_L4(uint32_t, greater_equal, >=);
2207 
2208 
2209 static FORCEINLINE svec<4,bool> svec_equal(svec<4,int64_t> a, svec<4,int64_t> b) {
2210  __m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]);
2211  __m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]);
2212  return svec<4,bool>(_mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1),
2213  _MM_SHUFFLE(2, 0, 2, 0)));
2214 }
2215 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int64_t> a, svec<4,int64_t> b) {
2216  return ~(a == b);
2217 }
2218 
2219 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,int64_t> a, svec<4,int64_t> b) {
2220  return ~(a >= b);
2221 }
2222 
2223 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,int64_t> a, svec<4,int64_t> b) {
2224  return (a < b) | (a == b);
2225 }
2226 
2227 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,int64_t> a, svec<4,int64_t> b) {
2228  __m128i cmp0 = _mm_cmpgt_epi64(a.v[0], b.v[0]);
2229  __m128i cmp1 = _mm_cmpgt_epi64(a.v[1], b.v[1]);
2230  return svec<4,bool>(_mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1),
2231  _MM_SHUFFLE(2, 0, 2, 0)));
2232 }
2233 
2234 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,int64_t> a, svec<4,int64_t> b) {
2235  return (a > b) | (a == b);
2236 }
2238 
2240  __m128i cmp0 = _mm_cmpeq_epi64(a.v[0], b.v[0]);
2241  __m128i cmp1 = _mm_cmpeq_epi64(a.v[1], b.v[1]);
2242  return svec<4,bool>(_mm_shuffle_ps(_mm_castsi128_ps(cmp0), _mm_castsi128_ps(cmp1),
2243  _MM_SHUFFLE(2, 0, 2, 0)));
2244 }
2245 
2246 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint64_t> a, svec<4,uint64_t> b) {
2247  return ~(a == b);
2248 }
2249 
2250 CMP_OP_L4(uint64_t, less_than, <);
2251 CMP_OP_L4(uint64_t, less_equal, <=);
2252 CMP_OP_L4(uint64_t, greater_than, >);
2253 CMP_OP_L4(uint64_t, greater_equal, >=);
2255 
2256 
2257 static FORCEINLINE svec<4,bool> svec_equal(svec<4,float> a, svec<4,float> b) {
2258  return _mm_cmpeq_ps(a.v, b.v);
2259 }
2260 
2261 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,float> a, svec<4,float> b) {
2262  return _mm_cmpneq_ps(a.v, b.v);
2263 }
2264 
2265 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,float> a, svec<4,float> b) {
2266  return _mm_cmplt_ps(a.v, b.v);
2267 }
2268 
2269 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,float> a, svec<4,float> b) {
2270  return _mm_cmple_ps(a.v, b.v);
2271 }
2272 
2273 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,float> a, svec<4,float> b) {
2274  return _mm_cmpgt_ps(a.v, b.v);
2275 }
2276 
2277 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,float> a, svec<4,float> b) {
2278  return _mm_cmpge_ps(a.v, b.v);
2279 }
2280 
2282 
2283 static FORCEINLINE svec<4,bool> svec_equal(svec<4,double> a, svec<4,double> b) {
2284  __m128d cmp0 = _mm_cmpeq_pd(a.v[0], b.v[0]);
2285  __m128d cmp1 = _mm_cmpeq_pd(a.v[1], b.v[1]);
2286  return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2287  _MM_SHUFFLE(2, 0, 2, 0));
2288 }
2289 
2290 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,double> a, svec<4,double> b) {
2291  __m128d cmp0 = _mm_cmpneq_pd(a.v[0], b.v[0]);
2292  __m128d cmp1 = _mm_cmpneq_pd(a.v[1], b.v[1]);
2293  return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2294  _MM_SHUFFLE(2, 0, 2, 0));
2295 }
2296 
2297 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,double> a, svec<4,double> b) {
2298  __m128d cmp0 = _mm_cmplt_pd(a.v[0], b.v[0]);
2299  __m128d cmp1 = _mm_cmplt_pd(a.v[1], b.v[1]);
2300  return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2301  _MM_SHUFFLE(2, 0, 2, 0));
2302 }
2303 
2304 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,double> a, svec<4,double> b) {
2305  __m128d cmp0 = _mm_cmple_pd(a.v[0], b.v[0]);
2306  __m128d cmp1 = _mm_cmple_pd(a.v[1], b.v[1]);
2307  return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2308  _MM_SHUFFLE(2, 0, 2, 0));
2309 }
2310 
2311 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,double> a, svec<4,double> b) {
2312  __m128d cmp0 = _mm_cmpgt_pd(a.v[0], b.v[0]);
2313  __m128d cmp1 = _mm_cmpgt_pd(a.v[1], b.v[1]);
2314  return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2315  _MM_SHUFFLE(2, 0, 2 ,0));
2316 }
2317 
2318 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,double> a, svec<4,double> b) {
2319  __m128d cmp0 = _mm_cmpge_pd(a.v[0], b.v[0]);
2320  __m128d cmp1 = _mm_cmpge_pd(a.v[1], b.v[1]);
2321  return _mm_shuffle_ps(_mm_castpd_ps(cmp0), _mm_castpd_ps(cmp1),
2322  _MM_SHUFFLE(2, 0, 2, 0));
2323 }
2324 
2326 
2327 
2328 
2329 // 8. Cast
2330 
// Cast between element types whose register layout is identical (e.g. signed
// <-> unsigned of the same width): declares the generic svec_cast template and
// specializes it to simply rewrap the payload.
#define CAST_OPT(SFROM, STO) \
template <class T> static T svec_cast(svec<LANES,SFROM> val); \
 \
template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
 return svec<LANES,STO>((val.v)); \
}

// Same, for the two-half-register 64-bit types.
#define CAST_OPT64(SFROM, STO) \
template <class T> static T svec_cast(svec<LANES,SFROM> val); \
 \
template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
 return svec<LANES,STO>((val.v[0]),(val.v[1])); \
}
2360 
2361 
// i1 -> all: a true lane becomes an all-ones value (or its float rendering)
// via svec_select; integer widths use the register reinterpretation directly.
// NOTE(review): the documentation generator elided several
// "template <> FORCEINLINE ... svec_cast<...>" specialization headers before
// the bodies below — confirm against the full source before editing here.
//CAST_L4(svec<4,bool>, svec<4,bool>, bool);
//CAST_L4(svec<4,bool>, svec<4,int8_t>, int8_t); //better way: packing
template <class T> static T svec_cast(svec<4,bool> val);
 return svec_select(val, svec<4,int8_t>(0xff), svec<4,int8_t>(0));
}
//CAST_L4(svec<4,bool>, svec<4,uint8_t>, uint8_t); //better way: packing
template <class T> static T svec_cast(svec<4,bool> val);
 return svec_select(val, svec<4,uint8_t>(0xff), svec<4,uint8_t>(0));
}
//CAST_L4(svec<4,bool>, svec<4,int16_t>, int16_t); //better way: packing
template <class T> static T svec_cast(svec<4,bool> val);
 return svec_select(val, svec<4,int16_t>(0xffff), svec<4,int16_t>(0));
}
//CAST_L4(svec<4,bool>, svec<4,uint16_t>, uint16_t); //better way: packing
template <class T> static T svec_cast(svec<4,bool> val);
 return svec_select(val, svec<4,uint16_t>(0xffff), svec<4,uint16_t>(0));
}
//CAST_L4(svec<4,bool>, svec<4,int32_t>, int32_t);
template <class T> static T svec_cast(svec<4,bool> val);
 return _mm_castps_si128(val.v);
}
//CAST_L4(svec<4,bool>, svec<4,uint32_t>, uint32_t);
template <class T> static T svec_cast(svec<4,bool> val);
 return _mm_and_si128(_mm_castps_si128(val.v), _mm_set1_epi32(-1));
}
CAST_L4(bool, int64_t); //better way: unpack, signed ext
CAST_L4(bool, uint64_t);//better way: unpack, signed ext
//CAST_L4(bool, float); //si to fp call
template <class T> static T svec_cast(svec<4,bool> val);
// A true lane becomes 2^32-1 rendered as a float.
template <> FORCEINLINE svec<4,float> svec_cast<svec<4,float> >(svec<4,bool> val) {
 return svec_select(val, svec<4,float>(4294967295.), svec<4,float>(0));
}
//CAST_L4(svec<4,bool>, svec<4,double>, double);
template <class T> static T svec_cast(svec<4,bool> val);
 return svec_select(val, svec<4,double>(4294967295.), svec<4,double>(0));
}
2430 
2431 //i8 -> all
2432 CAST_L4(int8_t, bool);
2433 //CAST_L4(int8_t, int8_t);
2434 CAST_OPT(int8_t, uint8_t);
2435 CAST_L4(int8_t, int16_t); //better way, use vec_unpackh
2436 CAST_L4(int8_t, uint16_t); //better way, sext + zero mask and
2437 CAST_L4(int8_t, int32_t); //better way, use twice vec_unpack
2438 CAST_L4(int8_t, uint32_t); //better way, use unpack + zero mask
2439 CAST_L4(int8_t, int64_t);
2440 CAST_L4(int8_t, uint64_t);
2441 CAST_L4(int8_t, float);
2442 CAST_L4(int8_t, double);
2443 
2444 //u8 -> all
2445 CAST_L4(uint8_t, bool);
2446 CAST_OPT(uint8_t, int8_t);
2447 //CAST_L4(uint8_t, uint8_t);
2448 CAST_L4(uint8_t, int16_t); //better way, use unpack + zero mask
2449 CAST_L4(uint8_t, uint16_t); //better way use unpack + zero mask
2450 CAST_L4(uint8_t, int32_t);
2451 CAST_L4(uint8_t, uint32_t);
2452 CAST_L4(uint8_t, int64_t);
2453 CAST_L4(uint8_t, uint64_t);
2454 CAST_L4(uint8_t, float);
2455 CAST_L4(uint8_t, double);
2456 
2457 //i16 -> all
// Conversions from svec<4,int16_t> / svec<4,uint16_t> to every other 4-lane type.
// Identity casts are commented out; same-width signed<->unsigned uses CAST_OPT
// (bit reinterpretation); everything else falls back to the generic CAST_L4.
2458 CAST_L4(int16_t, bool);
2459 CAST_L4(int16_t, int8_t); //could use pack
2460 CAST_L4(int16_t, uint8_t); //could use pack
2461 //CAST_L4(int16_t, int16_t);
2462 CAST_OPT(int16_t, uint16_t);
2463 CAST_L4(int16_t, int32_t); //use unpack
2464 CAST_L4(int16_t, uint32_t); //use unpack and zeromaskout
2465 CAST_L4(int16_t, int64_t);
2466 CAST_L4(int16_t, uint64_t);
2467 CAST_L4(int16_t, float);
2468 CAST_L4(int16_t, double);
2469 
2470 //u16 -> all
2471 CAST_L4(uint16_t, bool);
2472 CAST_L4(uint16_t, int8_t);
2473 CAST_L4(uint16_t, uint8_t);
2474 CAST_OPT(uint16_t, int16_t);
2475 //CAST_L4(uint16_t, uint16_t);
2476 CAST_L4(uint16_t, int32_t); //use unpack +mask
2477 CAST_L4(uint16_t, uint32_t); //use unpack + mask
2478 CAST_L4(uint16_t, int64_t);
2479 CAST_L4(uint16_t, uint64_t);
2480 CAST_L4(uint16_t, float);
2481 CAST_L4(uint16_t, double);
2482 
2483 //i32 -> all
// Conversions from svec<4,int32_t>. The float and double targets get hand-written
// SSE specializations instead of the generic CAST_L4 loop.
2484 CAST_L4(int32_t, bool);
2485 CAST_L4(int32_t, int8_t);
2486 CAST_L4(int32_t, uint8_t);
2487 CAST_L4(int32_t, int16_t);
2488 CAST_L4(int32_t, uint16_t);
2489 //CAST_L4(int32_t, svec<4,int32_t>, int32_t);
2490 CAST_OPT(int32_t, uint32_t);
2491 CAST_L4(int32_t, int64_t); //use p8 unpack
2492 CAST_L4(int32_t, uint64_t); //use p8 unpack
2493 //CAST_L4(svec<4,int32_t>, svec<4,float>, float); //use ctf
2494 template <class T> static T svec_cast(svec<4,int32_t> val);
// NOTE(review): the template<> header for the i32 -> float specialization (listing
// lines 2495-2498) is missing from this rendering; the body is a single
// _mm_cvtepi32_ps (signed int -> float, all four lanes at once).
2499  return _mm_cvtepi32_ps(val.v);
2500 }
2501 //CAST_L4(svec<4,int32_t>, svec<4,double>, double);
2502 template <class T> static T svec_cast(svec<4,int32_t> val);
// NOTE(review): template<> header for i32 -> double (listing 2503-2506) is also
// missing here. Body: converts the low int pair with _mm_cvtepi32_pd, shuffles the
// high pair (lanes 2,3) down, converts those, and packs both __m128d halves into
// the two-register svec<4,double>.
2507  __m128d r0 = _mm_cvtepi32_pd(val.v);
2508  __m128 shuf = _mm_shuffle_ps(_mm_castsi128_ps(val.v),
2509  _mm_castsi128_ps(val.v),
2510  _MM_SHUFFLE(3, 2, 3, 2));
2511  __m128d r1 = _mm_cvtepi32_pd(_mm_castps_si128(shuf));
2512  return svec<4,double>(r0, r1);
2513 }
2514 
2515 //u32 -> all
// Conversions from svec<4,uint32_t>. Note: unlike the signed i32 case there is no
// SSE specialization for the float/double targets — the generic CAST_L4 loop is
// used (SSE has no unsigned 32-bit convert instruction).
2516 CAST_L4(uint32_t, bool);
2517 CAST_L4(uint32_t, int8_t);
2518 CAST_L4(uint32_t, uint8_t);
2519 CAST_L4(uint32_t, int16_t);
2520 CAST_L4(uint32_t, uint16_t);
2521 CAST_OPT(uint32_t, int32_t);
2522 //CAST_L4(uint32_t, uint32_t);
2523 CAST_L4(uint32_t, int64_t); //use p8 unpack
2524 CAST_L4(uint32_t, uint64_t); //use p8 unpack
2525 CAST_L4(uint32_t, float);
2526 CAST_L4(uint32_t, double);
2527 
2528 //i64-> all
// Conversions from the 64-bit integer vectors. These are stored as two __m128i
// registers (v[0], v[1]), so the signed<->unsigned reinterpretation uses the
// 64-bit variant CAST_OPT64; all other targets use the generic CAST_L4 loop.
2529 CAST_L4(int64_t, bool);
2530 CAST_L4(int64_t, int8_t);
2531 CAST_L4(int64_t, uint8_t);
2532 CAST_L4(int64_t, int16_t);
2533 CAST_L4(int64_t, uint16_t);
2534 CAST_L4(int64_t, int32_t); //use p8 trunk
2535 CAST_L4(int64_t, uint32_t); //use p8 trunk
2536 //CAST_L4(int64_t, int64_t);
2537 CAST_OPT64(int64_t, uint64_t);
2538 CAST_L4(int64_t, float);
2539 CAST_L4(int64_t, double);
2540 
2541 //u64 -> all
2542 CAST_L4(uint64_t, bool);
2543 CAST_L4(uint64_t, int8_t);
2544 CAST_L4(uint64_t, uint8_t);
2545 CAST_L4(uint64_t, int16_t);
2546 CAST_L4(uint64_t, uint16_t);
2547 CAST_L4(uint64_t, int32_t); //use p8 pack
2548 CAST_L4(uint64_t, uint32_t); //use p8 pack
2549 CAST_OPT64(uint64_t, int64_t);
2550 //CAST_L4(uint64_t, uint64_t);
2551 CAST_L4(uint64_t, float);
2552 CAST_L4(uint64_t, double);
2553 
2554 //float -> all
// Conversions from svec<4,float>. The int32 and double targets get SSE
// specializations; other targets fall back to the generic CAST_L4 loop.
2555 CAST_L4(float, bool);
2556 CAST_L4(float, int8_t); //use cts + pack+pack
2557 CAST_L4(float, uint8_t); //use ctu + pack + pack
2558 CAST_L4(float, int16_t); //use cts + pack
2559 CAST_L4(float, uint16_t); //use ctu + pack
2560 //CAST_L4(svec<4,float>, int32_t);//use cts
2561 template <class T> static T svec_cast(svec<4,float> val);
// NOTE(review): the template<> header for the float -> int32 specialization
// (listing lines 2562-2565) is missing from this rendering. Body:
// _mm_cvttps_epi32 — the 'tt' variant truncates toward zero (C cast semantics)
// rather than using the current rounding mode.
2566  return _mm_cvttps_epi32(val.v);
2567 }
2568 CAST_L4(float, uint32_t); //use ctu
2569 CAST_L4(float, int64_t);
2570 CAST_L4(float, uint64_t);
2571 //CAST_L4(float, float);
2572 //CAST_L4(float, double);
2573 template <class T> static T svec_cast(svec<4,float> val);
// NOTE(review): template<> header for float -> double (listing 2574-2577) missing
// here. Body: widens the low float pair with _mm_cvtps_pd, shuffles lanes 2,3 down
// and widens those, producing the two __m128d halves of svec<4,double>.
2578  return svec<4,double>(_mm_cvtps_pd(val.v),
2579  _mm_cvtps_pd(_mm_shuffle_ps(val.v, val.v,
2580  _MM_SHUFFLE(3, 2, 3, 2))));
2581 }
2582 
2583 //double -> all
// Conversions from svec<4,double> (stored as two __m128d halves v[0], v[1]).
// The int32 and float targets get SSE specializations that convert each half and
// then merge the two low pairs with a shuffle.
2584 CAST_L4(double, bool);
2585 CAST_L4(double, int8_t);
2586 CAST_L4(double, uint8_t);
2587 CAST_L4(double, int16_t);
2588 CAST_L4(double, uint16_t);
2589 //CAST_L4(double, int32_t);
2590 template <class T> static T svec_cast(svec<4,double> val);
// NOTE(review): the template<> header for the double -> int32 specialization
// (listing lines 2591-2594) is missing from this rendering. _mm_cvtpd_epi32 yields
// two ints in the low lanes of each result; _MM_SHUFFLE(1,0,1,0) packs both low
// pairs into one __m128i.
2595  __m128i r0 = _mm_cvtpd_epi32(val.v[0]);
2596  __m128i r1 = _mm_cvtpd_epi32(val.v[1]);
2597  return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(r0), _mm_castsi128_ps(r1),
2598  _MM_SHUFFLE(1, 0, 1, 0)));
2599 }
2600 
2601 CAST_L4(double, uint32_t);
2602 CAST_L4(double, int64_t);
2603 CAST_L4(double, uint64_t);
2604 //CAST_L4(double, float);
2605 template <class T> static T svec_cast(svec<4,double> val);
// NOTE(review): template<> header for double -> float (listing 2606-2609) missing
// here. Same merge pattern: narrow each __m128d half to two floats, then pack the
// two low pairs into one __m128.
2610  __m128 r0 = _mm_cvtpd_ps(val.v[0]);
2611  __m128 r1 = _mm_cvtpd_ps(val.v[1]);
2612  return _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 0, 1, 0));
2613 }
2614 //CAST_L4(svec<4,double>, double);
2614 //CAST_L4(svec<4,double>, double);
2615 
2617 
2618 
// Defines svec_cast_bits<svec<LANES,STO>>(svec<LANES,SFROM>): a bit-level
// reinterpretation (no value conversion) between same-width single-register
// vector types, implemented with the zero-cost SSE cast intrinsic 'func'
// (e.g. _mm_castsi128_ps). Comments cannot be placed inside the
// backslash-continued body, hence this header note.
2622 #define CAST_BITS_OPT(SFROM, STO, func) \
2623 template <class T> static T svec_cast_bits(svec<LANES,SFROM> val); \
2624  \
2627 template <> FORCEINLINE svec<LANES,STO> svec_cast_bits<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
2628  return svec<LANES,STO>(func(val.v)); \
2629 }
2630 
// 64-bit-element variant of CAST_BITS_OPT: these vector types hold two __m128
// registers (v[0], v[1]), so the reinterpreting cast intrinsic 'func' is applied
// to each half and the two results construct the destination vector.
2634 #define CAST_BITS_OPT64(SFROM, STO, func) \
2635 template <class T> static T svec_cast_bits(svec<LANES,SFROM> val); \
2636  \
2639 template <> FORCEINLINE svec<LANES,STO> svec_cast_bits<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
2640  return svec<LANES,STO>(func(val.v[0]), func(val.v[1])); \
2641 }
2642 
2643 
// Instantiate bit-reinterpretation casts for the int<->float pairs that share a
// register width: 32-bit lanes use the single-register macro, 64-bit lanes the
// two-register variant.
2647 CAST_BITS_OPT(int32_t, float, _mm_castsi128_ps);
2648 CAST_BITS_OPT(uint32_t, float, _mm_castsi128_ps);
2649 CAST_BITS_OPT(float, int32_t, _mm_castps_si128);
2650 CAST_BITS_OPT(float, uint32_t, _mm_castps_si128);
2651 
2652 CAST_BITS_OPT64(int64_t, double, _mm_castsi128_pd);
2653 CAST_BITS_OPT64(uint64_t, double, _mm_castsi128_pd);
2654 CAST_BITS_OPT64(double, int64_t, _mm_castpd_si128);
2655 CAST_BITS_OPT64(double, uint64_t, _mm_castpd_si128);
2656 
2657 
2659 //
2660 // Class operations based on the above interfaces
2661 //
2663 
// Defines both subscript operators for a single-register svec type. The
// non-const form returns a mutable lane reference by reinterpreting the vector
// register's storage as a scalar array (NOTE(review): C-style pointer cast —
// relies on the vector type being safely addressable as STYPE[]; confirm
// aliasing assumptions hold for the supported compilers). The const form
// delegates to svec_extract and returns the lane by value.
2667 #define SUBSCRIPT_FUNC_IMPL_SSE(STYPE) \
2668 FORCEINLINE STYPE& svec<LANES,STYPE>::operator[](int index) { \
2669  return ((STYPE *)&v)[index]; \
2670 } \
2671 const FORCEINLINE STYPE svec<LANES,STYPE>::operator[](int index) const { \
2672  return svec_extract(*this, index); \
2673 }
2674 
2675 //add the impl of i1's
// NOTE(review): the signature lines for the next three bodies (listing lines
// 2676, 2679, 2682) are missing from this rendering. From the bodies they appear
// to be the bool vector's lane-assignment helpers (writing through a proxy that
// holds m_self/m_index and forwarding to svec_insert) and the proxy's read path
// (svec_extract) — confirm against the original sse4.h.
2677  svec_insert(m_self, m_index, value);
2678 }
2680  svec_insert(m_self, m_index, helper.operator uint32_t());
2681 }
2683  return svec_extract(*m_self, m_index);
2684 }
// Const subscript on the bool vector: reads lane 'index' by value via svec_extract.
2685 const FORCEINLINE uint32_t svec<4,bool>::operator[](int index) const {
2686  return svec_extract(*this, index);
2687 }
2698 
// Method wrappers for svec<4,bool>: each forwards to the corresponding
// free-function reduction/logical op defined earlier in the file.
2704 FORCEINLINE bool svec<4,bool>::any_true() { return svec_any_true(*this); }
2705 
2711 FORCEINLINE bool svec<4,bool>::all_true() { return svec_all_true(*this); }
2712 
2718 FORCEINLINE bool svec<4,bool>::none_true() { return svec_none_true(*this); }
2719 
2724 FORCEINLINE svec<4,bool> svec<4,bool>::operator~() { return svec_not(*this); }
2725 
2748 FORCEINLINE svec<4,bool> svec<4,bool>::operator!() { return svec_not(*this); }
2749 
// NOTE(review): the signature lines for the two comparison bodies below (around
// listing lines 2767 and 2776) are missing from this rendering; from the bodies
// they are svec<4,bool>::operator== and operator!= forwarding to
// svec_equal / svec_not_equal — confirm against the original sse4.h.
2768  return svec_equal(*this, a);
2769 }
2770 
2777  return svec_not_equal(*this, a);
2778 }
2779 
// Instantiate comparison-operator implementations for the integer element types.
// NOTE(review): listing line 2780 is absent from this rendering — by the pattern
// it should be VEC_CMP_IMPL(int8_t); confirm against the original sse4.h.
2781 VEC_CMP_IMPL(uint8_t);
2782 VEC_CMP_IMPL(int16_t);
2783 VEC_CMP_IMPL(uint16_t);
2784 VEC_CMP_IMPL(int32_t);
2785 VEC_CMP_IMPL(uint32_t);
2786 VEC_CMP_IMPL(int64_t);
2787 VEC_CMP_IMPL(uint64_t);
2790 
2802 
// Instantiate integer-only method implementations (shift operators take the
// unsigned counterpart type as their second argument, per the macro's contract).
2803 VEC_INT_CLASS_METHOD_IMPL(int8_t, uint8_t);
2804 VEC_INT_CLASS_METHOD_IMPL(uint8_t, uint8_t);
2805 VEC_INT_CLASS_METHOD_IMPL(int16_t, uint16_t);
2806 VEC_INT_CLASS_METHOD_IMPL(uint16_t, uint16_t);
2807 VEC_INT_CLASS_METHOD_IMPL(int32_t, uint32_t);
2808 VEC_INT_CLASS_METHOD_IMPL(uint32_t, uint32_t);
2809 VEC_INT_CLASS_METHOD_IMPL(int64_t, uint64_t);
2810 VEC_INT_CLASS_METHOD_IMPL(uint64_t, uint64_t);
2811 
2814 
2815 #undef LANES
2816 } //end of namespace sse
2817 #endif /* SSE4_H_ */
2818 
#define COUT_FUNC_BOOL_DECL()
Definition: gsimd_utility.h:266
#define INT_BINARY_OP_METHODS64(STYPE)
Definition: sse4.h:1830
svec()
Default constructor.
Definition: sse4.h:190
svec(int16_t a)
Constructor.
Definition: sse4.h:354
#define CAST_OPT(SFROM, STO)
cast implemented by directly reinterpreting the __m128 object type
Definition: sse4.h:2340
#define GATHER_STRIDE_L4(STYPE, OSTYPE)
macros for fast impl of gather base step
Definition: gsimd_utility.h:682
svec< 4, bool > svec_select(svec< 4, bool > mask, svec< 4, bool > a, svec< 4, bool > b)
construct c by selecting elements from two input vectors according to the mask
Definition: sse4.h:1070
svec(__m128i vv)
For internal use only.
Definition: sse4.h:437
#define VEC_INT_CLASS_METHOD_DECL(STYPE, USTYPE)
macros method definition for integer vector only Note: shift&#39;s operator can only be unsigned vector ...
Definition: gsimd_utility.h:379
svec(int64_t a, int64_t b, int64_t c, int64_t d)
Constructor.
Definition: sse4.h:549
#define CMP_ALL_MASKED_OP(STYPE)
Definition: gsimd_utility.h:1099
#define SCATTER_BASE_OFFSETS_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:789
svec(void *p0, void *p1, void *p2, void *p3)
Constructor.
Definition: sse4.h:1334
Definition: gsimd_utility.h:93
data representation and operations on a vector of 4 unsigned long long.
Definition: sse4.h:584
svec()
Default constructor,.
Definition: sse4.h:536
svec(__m128i vv)
For internal use only.
Definition: sse4.h:202
#define BINARY_OP_OPT_FUNC64(STYPE, STYPE2, NAME, FUNC)
Definition: sse4.h:1730
#define BINARY_OP2_L4(STYPE, STYPE2, NAME, OP)
macros for generic slow imple of binary operation, style 2
Definition: gsimd_utility.h:893
#define GATHER_GENERAL_L4(STYPE, PSTYPE)
slow implementation of gather general Must use template to specify the return type ...
Definition: gsimd_utility.h:617
data representation and operations on a vector of 4 double.
Definition: sse4.h:685
svec(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
Constructor.
Definition: sse4.h:298
svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
Constructor.
Definition: sse4.h:495
data representation and operations on a vector of 4 unsigned short.
Definition: sse4.h:377
svec(__m128 vv)
For internal use only.
Definition: sse4.h:650
#define VEC_FLOAT_CLASS_METHOD_DECL(STYPE)
Definition: gsimd_utility.h:393
#define VEC_CLASS_METHOD_DECL(STYPE)
macros for non-mask i8 - double types&#39;s method
Definition: gsimd_utility.h:350
#define BIN_VEC_SCAL(STYPE)
Definition: sse4.h:1785
#define BINARY_OP_FUNC(STYPE, NAME, FUNC)
Definition: gsimd_utility.h:869
data representation and operations on a vector of 4 unsigned int.
Definition: sse4.h:478
svec()
Default constructor.
Definition: sse4.h:590
__m128i v[2]
Definition: sse4.h:585
#define SVEC_BOOL_CLASS_METHOD_DECL()
macros for svec&lt;N,bool&gt; class&#39;s class method
Definition: gsimd_utility.h:330
__m128i v
Definition: sse4.h:330
#define LOAD_CONST_SSE(STYPE)
Definition: sse4.h:1280
#define SHUFFLES_L4(STYPE)
macro for shuffle/shuffle2 methods implementation
Definition: gsimd_utility.h:537
#define INSERT_EXTRACT_SSEOPT(STYPE, FUNC)
Definition: sse4.h:754
svec(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
Constructor.
Definition: sse4.h:603
svec(float a, float b, float c, float d)
Constructor.
Definition: sse4.h:655
svec(__m128i a, __m128i b)
For internal use only. Construct svec&lt;4,int64_t&gt; with two _m128i objects.
Definition: sse4.h:595
__m128 v
Definition: sse4.h:184
data representation and operations on a vector of 4 float.
Definition: sse4.h:638
svec< 4,float > svec_preduce_add(svec< 4, float > v0, svec< 4, float > v1, svec< 4, float > v2, svec< 4, float > v3)
Definition: sse4.h:2010
#define INC_STATS_NAME(stat, inc, opname)
Definition: gsimd_utility.h:156
#define COUT_FUNC_DECL(STYPE)
Definition: gsimd_utility.h:283
data representation and operations on a vector of 4 unsigned chars.
Definition: sse4.h:281
svec(uint8_t a)
Constructor.
Definition: sse4.h:307
__m128i v
Definition: sse4.h:234
svec(int64_t a)
Constructor.
Definition: sse4.h:560
#define CAST_BITS_OPT(SFROM, STO, func)
cast based on directly change the __vector type
Definition: sse4.h:2622
svec(__m128 vv)
For internal use only.
Definition: sse4.h:196
svec(int8_t a, int8_t b, int8_t c, int8_t d)
Constructor.
Definition: sse4.h:251
#define UNARY_OP_L4(STYPE, NAME, OP)
Definition: gsimd_utility.h:841
svec(uint16_t a, uint16_t b, uint16_t c, uint16_t d)
Constructor.
Definition: sse4.h:394
#define SUBSCRIPT_FUNC_IMPL_SSE(STYPE)
this macro uses sse specific intrinsics to do extract, insert
Definition: sse4.h:2667
svec()
Default constructor.
Definition: sse4.h:691
#define VEC_CMP_IMPL(STYPE)
Definition: gsimd_utility.h:1175
#define SUBSCRIPT_FUNC_DECL(STYPE)
macros to define an intrinsic-based subscript operator
Definition: gsimd_utility.h:247
__m128i v
Definition: sse4.h:378
#define INSERT_EXTRACT_SSEOPT64(STYPE, FUNC)
Definition: sse4.h:770
svec(uint64_t a)
Constructor.
Definition: sse4.h:614
#define MVEC_CLASS_METHOD_IMPL(STYPE)
mask class&#39;s class method impl
Definition: gsimd_utility.h:1285
data representation and operations on a vector of 4 signed int.
Definition: sse4.h:425
#define SUBSCRIPT_FUNC_BOOL_DECL(STYPE)
Definition: gsimd_utility.h:251
#define VEC_CLASS_METHOD_IMPL(STYPE)
Definition: gsimd_utility.h:1301
svec(__m128i vv)
For internal use only.
Definition: sse4.h:246
#define GATHER_BASE_OFFSETS_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:658
#define INT_BINARY_OP_METHODS(STYPE)
Definition: sse4.h:1823
svec(int a, int b, int c, int d)
Constructor.
Definition: sse4.h:442
data representation and operations on a vector of 4 signed short.
Definition: sse4.h:329
data representation and operations on a vector of 4 signed chars.
Definition: sse4.h:233
#define MASKED_LOAD_STORE_L4(STYPE)
Definition: gsimd_utility.h:797
#define VEC_FLOAT_CLASS_METHOD_IMPL(STYPE)
Definition: gsimd_utility.h:1433
#define SCATTER_STRIDE_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:715
#define INSERT_EXTRACT_SSE(STYPE)
macros for svec&#39;s insert extract method implementation The implementation is based on vector type&#39;s s...
Definition: sse4.h:746
svec(uint32_t a)
Constructor.
Definition: sse4.h:219
data representation and operations on a vector of 4 signed long long.
Definition: sse4.h:530
#define TERNERY_OPT(STYPE)
Definition: sse4.h:1937
__m128i v
Definition: sse4.h:479
__m128i v
Definition: sse4.h:282
#define CAST_BITS_OPT64(SFROM, STO, func)
cast based on directly change the __vector type
Definition: sse4.h:2634
svec(uint32_t a)
Constructor.
Definition: sse4.h:503
svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
Constructor.
Definition: sse4.h:209
svec()
Default constructor.
Definition: sse4.h:644
#define CAST_L4(SFROM, STO)
Definition: gsimd_utility.h:1124
#define UNARY_OP_OPT(STYPE, NAME, OP)
Definition: sse4.h:1619
#define BINARY_OP_L4(STYPE, NAME, OP)
macros for generic slow imple of binary operation
Definition: gsimd_utility.h:880
__m128i v
Definition: sse4.h:426
__m128i v[2]
Definition: sse4.h:531
svec()
Default constructor.
Definition: sse4.h:240
#define UNARY_OP_OPT64(STYPE, NAME, OP)
macros for 64bit object, i64/u64/double
Definition: sse4.h:1627
svec()
Default constructor.
Definition: sse4.h:431
svec()
Default constructor.
Definition: sse4.h:335
#define BINARY_OP_OPT_FUNC(STYPE, STYPE2, NAME, FUNC)
Definition: sse4.h:1725
svec(double a)
Constructor.
Definition: sse4.h:713
svec(int8_t a)
Constructor.
Definition: sse4.h:259
svec()
Default constructor.
Definition: sse4.h:383
svec(__m128d a, __m128d b)
For internal use only. Construct svec&lt;4,double&gt; with two __vector double values.
Definition: sse4.h:696
#define SCATTER_GENERAL_L4(STYPE, PSTYPE)
Definition: gsimd_utility.h:756
#define BINARY_OP_FUNC_L4(STYPE, NAME, FUNC)
Definition: gsimd_utility.h:904
#define ROTATE_L4(STYPE)
macro for rotate method implementation
Definition: gsimd_utility.h:507
Definition: sse4.h:130
svec(uint16_t a)
Constructor.
Definition: sse4.h:402
#define BINARY_OP_SCALAR_L4(STYPE, STYPE2, NAME, OP)
macros for binary: vector op scalar
Definition: gsimd_utility.h:917
#define COUT_FUNC_CHAR_DECL(STYPE)
Definition: gsimd_utility.h:275
svec(__m128i vv)
For internal use only.
Definition: sse4.h:490
svec(__m128i vv)
For internal use only.
Definition: sse4.h:341
svec(double a, double b, double c, double d)
Constructor.
Definition: sse4.h:704
svec(int16_t a, int16_t b, int16_t c, int16_t d)
Constructor.
Definition: sse4.h:346
svec(int32_t a)
Constructor.
Definition: sse4.h:450
svec(__m128i a, __m128i b)
For internal use only. Construct svec&lt;4,int64_t&gt; with two _m128i objects.
Definition: sse4.h:541
svec()
Default constructor.
Definition: sse4.h:287
__m128 v
Definition: sse4.h:639
svec(__m128i vv)
For internal use only.
Definition: sse4.h:389
svec()
Default constructor.
Definition: sse4.h:484
#define CMP_OP_L4(STYPE, NAME, OP)
Definition: gsimd_utility.h:1057
#define CAST_OPT64(SFROM, STO)
cast based on directly change the __vector type
Definition: sse4.h:2352
#define SELECT_BOOLCOND(STYPE)
macros for svec&#39;s select by bool scalar method implementation
Definition: gsimd_utility.h:459
__m128d v[2]
Definition: sse4.h:686
#define VEC_INT_CLASS_METHOD_IMPL(STYPE, STYPE2)
Definition: gsimd_utility.h:1394
#define FORCEINLINE
Definition: gsimd_utility.h:175
Data representation and operations on a vector of 4 boolean values. This is used in predicated vector...
Definition: sse4.h:182
svec(float a)
Constructor.
Definition: sse4.h:663
svec(__m128i vv)
For internal use only.
Definition: sse4.h:293
#define MAX_MIN_REDUCE_METHODS(STYPE)
Definition: sse4.h:1993