Generic SIMD Intrinsic Library API  0.6
 All Classes Namespaces Files Functions Variables Typedefs Friends Macros
power_vsx4.h
Go to the documentation of this file.
1 
105 #ifndef POWER_VSX4_H_
106 #define POWER_VSX4_H_
107 
108 #include <stdint.h>
109 #include <math.h>
110 #include <altivec.h>
111 #include <assert.h>
112 #include <iostream>
113 
114 #include "gsimd_utility.h"
115 #include "platform_intrinsics.h"
116 
117 namespace vsx {
118 
119 #define LANES 4
120 
122 //
123 // Constructor Section
124 //
126 
/**
 * Primary template for the generic SIMD vector type.
 * Only the explicit 4-lane specializations below are usable; any other
 * <Lanes, T> combination inherits from invalid_template_arguments<Lanes,T>
 * and fails to compile (pre-C++11 stand-in for a static_assert).
 */
template <int Lanes, class T>
struct svec : public invalid_template_arguments<Lanes,T>::type {
  //here we need to add the static assert
};
131 
// TODO (penguin): move common definition to gsimd_utility.h
// Forward declarations of every supported 4-lane specialization, so the
// specializations below can reference each other (e.g. select/gather masks).
template <>
struct svec<4,bool>;
template <>
struct svec<4,int8_t>;
template <>
struct svec<4,uint8_t>;
template <>
struct svec<4,int16_t>;
template <>
struct svec<4,uint16_t>;
template <>
struct svec<4,int32_t>;
template <>
struct svec<4,uint32_t>;
template <>
struct svec<4,int64_t>;
template <>
struct svec<4,uint64_t>;
template <>
struct svec<4,float>;
template <>
struct svec<4,double>;
template <>
struct svec<4,void*>;
157 
158 //required because macros are confused by the , in the template declaration
159 //typedef svec<4,bool> _svec4_i1;
160 //typedef svec<4,int8_t> _svec4_i8;
161 //typedef svec<4,uint8_t> _svec4_u8;
162 //typedef svec<4,int16_t> _svec4_i16;
163 //typedef svec<4,uint16_t> _svec4_u16;
164 //typedef svec<4,int32_t> _svec4_i32;
165 //typedef svec<4,uint32_t> _svec4_u32;
166 //typedef svec<4,int64_t> _svec4_i64;
167 //typedef svec<4,uint64_t> _svec4_u64;
168 //typedef svec<4,float> _svec4_f;
169 //typedef svec<4,double> _svec4_d;
170 //typedef svec<4,void*> _svec4_ptr;
171 
/**
 * 4-lane boolean (mask) vector. Each lane is a full 32-bit mask word:
 * all-ones for TRUE, all-zeros for FALSE — the encoding produced by the
 * VSX compare instructions, so masks compose directly with vec_sel.
 */
template<>
struct svec<4,bool> {

  __vector unsigned int v;  // lane masks: 0xFFFFFFFF = true, 0 = false

  /** Wrap an existing vector register value (assumed already mask-encoded). */
  FORCEINLINE svec(__vector unsigned int vv) : v(vv) { }
  /** Build from four scalars; any non-zero input becomes an all-ones lane. */
  FORCEINLINE svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
    __vector unsigned int t = { a ? -1 : 0, b ? -1 : 0, c ? -1 : 0, d ? -1 : 0 };
    v = t;
  }
  /** Broadcast one scalar truth value to all four lanes. */
  FORCEINLINE svec( uint32_t a) {
    if(__builtin_constant_p(a)){
      // Compile-time constant: a single vspltisw materializes 0 or -1.
      v = (a!=0) ? vec_splat_s32(-1) : vec_splat_s32(0);
    } else {
      INC_STATS_NAME(STATS_SMEAR_SLOW,1, "smear i1");
      __vector unsigned int t = { a ? -1 : 0, a ? -1 : 0, a ? -1 : 0, a ? -1 : 0 };
      v = t;
    }
  }

};
228 
/**
 * 4-lane vector of int8_t. Only the low 4 byte lanes of the 16-byte
 * register are meaningful; the remaining 12 lanes are kept zero.
 * (Specialized on `signed char`, which int8_t aliases on this platform —
 * TODO confirm against the forward declaration on svec<4,int8_t>.)
 */
template <>
struct svec<4,signed char> {
  __vector signed char v;  // lanes 0-3 valid, lanes 4-15 zero

  /** Wrap an existing vector register value. */
  FORCEINLINE svec(__vector signed char vv) : v(vv) { }
  /** Build from four scalars (lane order a,b,c,d); upper lanes zeroed. */
  FORCEINLINE svec(int8_t a, int8_t b, int8_t c, int8_t d) {
    __vector signed char t = {a,b,c,d,0,0,0,0,
                              0,0,0,0,0,0,0,0};
    v = t;
  }
  /** Broadcast one scalar to the four active lanes. */
  FORCEINLINE svec( int8_t a) {
    // vspltisb only encodes immediates in [-16,15]; note the fast path
    // splats `a` into ALL 16 byte lanes, while the slow path zeroes 4-15.
    if(__builtin_constant_p(a) && (a <= 15) && (a >= -16)){
      v = vec_splat_s8(a); //will gen one instr. vspltisb
    } else {
      INC_STATS_NAME(STATS_SMEAR_SLOW,1, "smear i8");
      __vector signed char t = {a,a,a,a,0,0,0,0,
                                0,0,0,0,0,0,0,0};
      v = t;
    }
  }
  SUBSCRIPT_FUNC_DECL(int8_t);       // operator[] accessors
  COUT_FUNC_CHAR_DECL(signed char);  // stream output (prints as number, not char)

  VEC_CLASS_METHOD_DECL(int8_t);
  VEC_INT_CLASS_METHOD_DECL(int8_t, uint8_t);

};
281 
/**
 * 4-lane vector of uint8_t. Only the low 4 byte lanes of the 16-byte
 * register are meaningful; the remaining 12 lanes are kept zero.
 */
template<>
struct svec<4,unsigned char> {
  __vector unsigned char v;  // lanes 0-3 valid, lanes 4-15 zero
  /** Wrap an existing vector register value. */
  FORCEINLINE svec(__vector unsigned char vv) : v(vv) { }
  /** Build from four scalars (lane order a,b,c,d); upper lanes zeroed. */
  FORCEINLINE svec(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
    __vector unsigned char t = {a,b,c,d,0,0,0,0,
                                0,0,0,0,0,0,0,0};
    v = t;
  }
  /** Broadcast one scalar to the four active lanes. */
  FORCEINLINE svec(uint8_t a) {
    // vspltisb immediate range is [-16,15]; for unsigned only 0..15 qualify.
    if(__builtin_constant_p(a) && (a <= 15)){
      v = vec_splat_u8(a); //will gen one instr. vspltisb
    } else {
      INC_STATS_NAME(STATS_SMEAR_SLOW,1, "smear u8");
      __vector unsigned char t = {a,a,a,a,0,0,0,0,
                                  0,0,0,0,0,0,0,0};
      v = t;
    }
  }
  SUBSCRIPT_FUNC_DECL(uint8_t);
  COUT_FUNC_CHAR_DECL(unsigned char);

  VEC_CLASS_METHOD_DECL(uint8_t);
  VEC_INT_CLASS_METHOD_DECL(uint8_t, uint8_t);
};
333 
/**
 * 4-lane vector of int16_t. Only the low 4 halfword lanes of the
 * 8-halfword register are meaningful; the upper 4 are kept zero.
 */
template <>
struct svec<4,int16_t> {
  __vector signed short v;  // lanes 0-3 valid, lanes 4-7 zero
  /** Wrap an existing vector register value. */
  FORCEINLINE svec(__vector signed short vv) : v(vv) { }
  /** Build from four scalars (lane order a,b,c,d); upper lanes zeroed. */
  FORCEINLINE svec(int16_t a, int16_t b, int16_t c, int16_t d) {
    __vector signed short t = {a,b,c,d, 0,0,0,0};
    v = t;
  }
  /** Broadcast one scalar to the four active lanes. */
  FORCEINLINE svec( int16_t a) {
    // vspltish immediate range is [-16,15]; fast path splats all 8 lanes.
    if(__builtin_constant_p(a) && (a <= 15) && (a >= -16)){
      v = vec_splat_s16(a); //will gen one instr. vspltish
    } else {
      INC_STATS_NAME(STATS_SMEAR_SLOW,1, "smear i16");
      __vector signed short t = {a,a,a,a, 0,0,0,0};
      v = t;
    }
  }
  SUBSCRIPT_FUNC_DECL(int16_t);
  COUT_FUNC_DECL(int16_t);

  VEC_CLASS_METHOD_DECL(int16_t);
  VEC_INT_CLASS_METHOD_DECL(int16_t, uint16_t);

};
384 
/**
 * 4-lane vector of uint16_t. Only the low 4 halfword lanes of the
 * 8-halfword register are meaningful; the upper 4 are kept zero.
 */
template <>
struct svec<4,uint16_t> {
  __vector unsigned short v;  // lanes 0-3 valid, lanes 4-7 zero
  /** Wrap an existing vector register value. */
  FORCEINLINE svec(__vector unsigned short vv) : v(vv) { }
  /** Build from four scalars (lane order a,b,c,d); upper lanes zeroed. */
  FORCEINLINE svec(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
    __vector unsigned short t = {a,b,c,d, 0,0,0,0};
    v = t;
  }
  /** Broadcast one scalar to the four active lanes. */
  FORCEINLINE svec( uint16_t a) {
    // vspltish immediate range is [-16,15]; for unsigned only 0..15 qualify.
    if(__builtin_constant_p(a) && (a <= 15)){
      v = vec_splat_u16(a); //will gen one instr. vspltish
    } else {
      INC_STATS_NAME(STATS_SMEAR_SLOW,1, "smear u16");
      __vector unsigned short t = {a,a,a,a, 0,0,0,0};
      v = t;
    }
  }
  SUBSCRIPT_FUNC_DECL(uint16_t);
  COUT_FUNC_DECL(uint16_t);

  VEC_CLASS_METHOD_DECL(uint16_t);
  VEC_INT_CLASS_METHOD_DECL(uint16_t, uint16_t);

};
435 
/**
 * 4-lane vector of int32_t; all four word lanes of the register are used.
 */
template <>
struct svec<4,int32_t> {
  __vector signed int v;
  /** Wrap an existing vector register value. */
  FORCEINLINE svec(__vector signed int vv) : v(vv) { }
  /** Build from four scalars (lane order a,b,c,d). */
  FORCEINLINE svec(int a, int b, int c, int d) {
    __vector signed int t = {a,b,c,d};
    v = t;
  }
  /** Broadcast one scalar to all four lanes. */
  FORCEINLINE svec(int32_t a) {
    if(__builtin_constant_p(a)){
      if((a <= 15) && (a >= -16)) {
        v = vec_splat_s32(a); //will gen one instr. vspltisw
      } else {
        INC_STATS_NAME(STATS_SMEAR_SLOW,1, "smear i32");
        __vector signed int t = {a,a,a,a};
        v = t;
      }
    } else { //non-const
#ifdef __POWER8
      // POWER8 has a direct GPR->VSR move+splat helper.
      v = vec_smear_p8(a);
#else
      // Pre-P8: bounce the scalar through memory, load, then splat lane 0.
      int32_t* p = &a;
      __vector signed int register x = vec_vsx_ld(0, p);
      v = vec_splat_p7(x, 0);
#endif
    }
  }
  SUBSCRIPT_FUNC_DECL(int32_t);
  COUT_FUNC_DECL(int32_t);

  VEC_CLASS_METHOD_DECL(int32_t);
  VEC_INT_CLASS_METHOD_DECL(int32_t, uint32_t);
};
495 
/**
 * 4-lane vector of uint32_t; all four word lanes of the register are used.
 */
template <>
struct svec<4,uint32_t> {
  __vector unsigned int v;
  /** Wrap an existing vector register value. */
  FORCEINLINE svec(__vector unsigned int vv) : v(vv) { }
  /** Build from four scalars (lane order a,b,c,d). */
  FORCEINLINE svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
    __vector unsigned int t = {a,b,c,d};
    v = t;
  }
  /** Broadcast one scalar to all four lanes. */
  FORCEINLINE svec( uint32_t a) {
    if(__builtin_constant_p(a)){
      if((a <= 15)) {
        v = vec_splat_u32(a); //will gen one instr. vspltisw
      } else {
        INC_STATS_NAME(STATS_SMEAR_SLOW,1, "smear u32");
        __vector unsigned int t = {a,a,a,a};
        v = t;
      }
    } else { //non-const
#ifdef __POWER8
      v = vec_smear_p8(a);
#else
      // Pre-P8: bounce through memory; splat goes via the signed overload
      // of vec_splat_p7 (bit pattern is what matters, not signedness).
      uint32_t* p = &a;
      __vector unsigned int register x = vec_vsx_ld(0, p);
      v = vec_splat_p7((__vector signed)x, 0);
#endif
    }
  }
  SUBSCRIPT_FUNC_DECL(uint32_t);
  COUT_FUNC_DECL(uint32_t);

  VEC_CLASS_METHOD_DECL(uint32_t);
  VEC_INT_CLASS_METHOD_DECL(uint32_t, uint32_t);
};
555 
559 template <>
560 struct svec<4,int64_t> {
561  __vector signed long long v[2];
571  FORCEINLINE svec(__vector signed long long a, __vector signed long long b){
572  v[0] = a;
573  v[1] = b;
574  }
579  FORCEINLINE svec(int64_t a, int64_t b, int64_t c, int64_t d) {
580  __vector signed long long t1 = {a,b};
581  __vector signed long long t2 = {c,d};
582  v[0] = t1;
583  v[1] = t2;
584  }
590  FORCEINLINE svec( int64_t a) {
591  if(__builtin_constant_p(a)){
592 #ifdef __POWER8
593  if ((a >= -16l) && (a <= 15l)) {
594  const int iv = (int)a;
595  __vector signed int x = {iv,iv,iv,iv};
596  __vector signed long long t = vec_unpackh_p8(x);
597  v[0] = v[1] = t;
598  } else
599 #endif
600  if(a == 0) {
601  __vector signed long long r1 = (__vector signed long long)vec_splat_s32(0);
602  v[0] = v[1] = r1;
603  } else {
604  __vector long long x = {a,a};
605  v[0] = v[1] = x;
606  }
607  } else {
608 #ifdef __POWER8
609  __vector unsigned long long r = vec_smear_i64_p8(a);
610  v[0] = v[1] = r;
611 #else
612  int64_t* p = &a;
613  __vector signed long long r = vec_smear_i64_p7((long long*)p);
614  v[0] = v[1] = r;
615 #endif // __POWER8
616  } //non const
617  }
622  SUBSCRIPT_FUNC_DECL(int64_t);
623  COUT_FUNC_DECL(int64_t);
624 
625  VEC_CLASS_METHOD_DECL(int64_t);
626  VEC_INT_CLASS_METHOD_DECL(int64_t, uint64_t);
627 };
628 
632 template <>
633 struct svec<4,uint64_t> {
634  __vector unsigned long long v[2];
644  FORCEINLINE svec(__vector unsigned long long a, __vector unsigned long long b){
645  v[0] = a;
646  v[1] = b;
647  }
652  FORCEINLINE svec(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
653  __vector unsigned long long t1 = {a,b};
654  __vector unsigned long long t2 = {c,d};
655  v[0] = t1;
656  v[1] = t2;
657  }
663  FORCEINLINE svec( uint64_t a) {
664  if(__builtin_constant_p(a)){
665 #ifdef __POWER8
666  if ((a >= 0ul) && (a <= 31ul)) {
667  const int iv = (int)v;
668  __vector signed int x = {iv,iv,iv,iv};
669  __vector unsigned long long t = vec_unpackh_p8(x);
670  v[0] = v[1] = t;
671  } else
672 #endif
673  if(a == 0) {
674  __vector unsigned long long r1 = (__vector unsigned long long)vec_splat_u32(0);
675  v[0] = v[1] = r1, r1;
676  } else {
677  __vector unsigned long long x = {a,a};
678  v[0] = v[1] = x;
679  }
680  } else {
681 #ifdef __POWER8
682  __vector unsigned long long r = vec_smear_i64_p8(a);
683  v[0] = v[1] = r;
684 #else
685  uint64_t* p = &a;
686  __vector unsigned long long r = vec_smear_i64_p7((long long*)p);
687  v[0] = v[1] = r;
688 #endif // __POWER8
689  }
690  }
695  SUBSCRIPT_FUNC_DECL(uint64_t);
696  COUT_FUNC_DECL(uint64_t);
697 
698  VEC_CLASS_METHOD_DECL(uint64_t);
699  VEC_INT_CLASS_METHOD_DECL(uint64_t, uint64_t);
700 };
701 
/**
 * 4-lane vector of float; all four word lanes of the register are used.
 */
template<>
struct svec<4,float> {
  __vector float v;
  /** Wrap an existing vector register value. */
  FORCEINLINE svec(__vector float vv) : v(vv) { }
  /** Build from four scalars (lane order a,b,c,d). */
  FORCEINLINE svec(float a, float b, float c, float d) {
    __vector float t = {a,b,c,d};
    v = t;
  }
  /** Broadcast one scalar to all four lanes. */
  FORCEINLINE svec( float a) {
    if(__builtin_constant_p(a)){
      if(a == 0) {
        v = (__vector float) vec_splat_s32(0);  // all-zero bits == +0.0f in each lane
      } else {
        // Try to express the constant as iv / 2^k with iv in the vspltisw
        // immediate range [-16,15] and k in {0,1,2}; if it fits, a
        // vspltisw + vcfsx (vec_ctf with scale k) pair materializes it
        // without a memory load.
        float p; int iv;
        p = 1.0; iv = (int)(p*a);
        if (( (((float)iv)/p) == a ) && (iv >= -16) && (iv <= 15)) {
          v = vec_ctf(vec_splat_s32(iv),0);
        } else {
          p = 2.0; iv = (int)(p*a);
          if (( (((float)iv)/p) == a ) && (iv >= -16) && (iv <= 15)) {
            v = vec_ctf(vec_splat_s32(iv),1);
          } else {
            p = 4.0; iv = (int)(p*a);
            if (( (((float)iv)/p) == a ) && (iv >= -16) && (iv <= 15)) {
              v = vec_ctf(vec_splat_s32(iv),2);
            } else {
              //no one instr solution.
              __vector float t = {a,a,a,a};
              v = t;
            }
          }
        } //non zero const
      }
    } else { //none const
#ifdef __POWER8
      v = vec_smear_p8(a);
#else
      // Pre-P8: bounce the scalar through memory, load, then splat lane 0.
      float* p = &a;
      __vector float register x = vec_vsx_ld(0, p);
      v = vec_splat_p7(x, 0);
#endif
    }
  }
  SUBSCRIPT_FUNC_DECL(float);

  VEC_CLASS_METHOD_DECL(float);
};
777 
/**
 * 4-lane vector of double, stored as two 2-lane VSX registers.
 */
template<>
struct svec<4,double> {
  __vector double v[2];  // v[0] = lanes 0-1, v[1] = lanes 2-3
  /** Wrap two existing vector register values. */
  FORCEINLINE svec(__vector double a, __vector double b){
    v[0] = a;
    v[1] = b;
  }
  /** Build from four scalars (lane order a,b,c,d). */
  FORCEINLINE svec(double a, double b, double c, double d) {
    __vector double t1 = {a,b};
    __vector double t2 = {c,d};
    v[0] = t1;
    v[1] = t2;
  }
  /** Broadcast one scalar to all four lanes. */
  FORCEINLINE svec( double a) {
    if(__builtin_constant_p(a)){
      if(a == 0) {
        // All-zero bits == +0.0 in each lane.
        __vector double r1 = (__vector double)vec_splat_s32(0);
        v[0] = v[1] = r1;
      } else {
        __vector double t = vec_smear_p7(a);
        v[0] = v[1] = t;
      }
    } else {
      __vector double t = vec_smear_p7(a);
      v[0] = v[1] = t;
    }
  }
  SUBSCRIPT_FUNC_DECL(double);
  COUT_FUNC_DECL(double);

  VEC_CLASS_METHOD_DECL(double);
};
836 
838 //
839 // Templated data types
840 //
843 //
844 // Data operation interfaces
845 //
847 
848 //
850 //
/*
 * svec_extract/svec_insert for element types whose 4 lanes live in a single
 * register: one vec_extract / vec_insert on the lane index.
 */
#define INSERT_EXTRACT_OPT(STYPE) \
  static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
    return vec_extract(v.v, index); \
  } \
  static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
    (*v).v = vec_insert(val, v->v, index); \
  }

/*
 * Same for 64-bit element types stored as two 2-lane registers:
 * index>>1 selects the register, index%2 the lane within it.
 */
#define INSERT_EXTRACT_OPT64(STYPE) \
  static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
    return vec_extract(v.v[index >> 1], index%2); \
  } \
  static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
    (*v).v[index >> 1] = vec_insert(val, v->v[index>>1], index%2); \
  }
866 
867 static FORCEINLINE uint32_t svec_extract(svec<4,bool> v, int index) {
868  return vec_extract(v.v, index);
869 }
870 static FORCEINLINE void svec_insert(svec<4,bool> *v, int index, uint32_t val) {
871  (*v).v = vec_insert(val ? -1 : 0, (*v).v, index); //special handle i1 type, use -1 to represent TRUE
872 }
// Instantiate lane insert/extract for every element type; 64-bit element
// types use the two-register variant.
INSERT_EXTRACT_OPT(int8_t);
INSERT_EXTRACT_OPT(uint8_t);
INSERT_EXTRACT_OPT(int16_t);
INSERT_EXTRACT_OPT(uint16_t);
INSERT_EXTRACT_OPT(int32_t);
INSERT_EXTRACT_OPT(uint32_t);
INSERT_EXTRACT_OPT64(int64_t);
INSERT_EXTRACT_OPT64(uint64_t);
INSERT_EXTRACT_OPT(float);
INSERT_EXTRACT_OPT64(double);
883 
884 
885 //
887 // * @brief macros for fixed index (0,1,2,3) insert extract method implementation
888 // */
889 //#define INSERT_EXTRACT_INDEX(VTYPE, STYPE) \
890 // static FORCEINLINE STYPE svec_extract_element0(VTYPE v) { \
891 // INC_STATS_NAME(STATS_EXTRACT, 1, "extract0"); \
892 // return ((STYPE *)&v)[0]; \
893 // } \
894 // static FORCEINLINE STYPE svec_extract_element1(VTYPE v) { \
895 // INC_STATS_NAME(STATS_EXTRACT, 1, "extract1"); \
896 // return ((STYPE *)&v)[1]; \
897 // } \
898 // static FORCEINLINE STYPE svec_extract_element2(VTYPE v) { \
899 // INC_STATS_NAME(STATS_EXTRACT, 1, "extract2"); \
900 // return ((STYPE *)&v)[2]; \
901 // } \
902 // static FORCEINLINE STYPE svec_extract_element3(VTYPE v) { \
903 // INC_STATS_NAME(STATS_EXTRACT, 1, "extract3"); \
904 // return ((STYPE *)&v)[3]; \
905 // }
906 //
907 
908 
909 
910 // 1. Load / Store
/** Load a 4-lane mask vector from *p.
 *  NOTE(review): plain vector dereference — presumably assumes natural
 *  (16-byte) alignment; confirm against callers. */
static FORCEINLINE svec<4,bool> svec_load(const svec<4,bool> *p) {
  return *((__vector unsigned int *)p);
}

/** Store a 4-lane mask vector to *p (same alignment assumption as load). */
static FORCEINLINE void svec_store(svec<4,bool> *p, svec<4,bool> v) {
  *((__vector unsigned int*)p) = v.v;
}
930 
/** Load a 4-lane int8 vector (full 16-byte register) from *p.
 *  NOTE(review): loads through (signed int*), so vec_vsx_ld yields a word
 *  vector converted to the byte-vector member — appears to rely on lax
 *  vector conversions; verify this is intentional. */
static FORCEINLINE svec<4,int8_t> svec_load(const svec<4,int8_t> *p) {
  return vec_vsx_ld(0, (signed int*)p);
}

/** Store the full 16-byte register (active lanes 0-3 plus zero padding) to *p. */
static FORCEINLINE void svec_store(svec<4,int8_t> *p, svec<4,int8_t> v) {
  vec_vsx_st(v.v, 0, (signed char*)p);
}
950 
/** Load a 4-lane uint8 vector (full 16-byte register) from *p.
 *  NOTE(review): same (signed int*) load-type mismatch as the int8 load —
 *  verify intent. */
static FORCEINLINE svec<4,uint8_t> svec_load(const svec<4,uint8_t> *p) {
  return vec_vsx_ld(0, (signed int*)p);
}

/** Store the full 16-byte register (active lanes 0-3 plus zero padding) to *p. */
static FORCEINLINE void svec_store(svec<4,uint8_t> *p, svec<4,uint8_t> v) {
  vec_vsx_st(v.v, 0, (unsigned char*)p);
}
970 
// 16-bit element types use the generic LOAD_STORE macro from gsimd_utility.h.
LOAD_STORE(int16_t);

LOAD_STORE(uint16_t);
978 
/** Load a 4-lane int32 vector from *p (direct vector dereference —
 *  presumably assumes natural alignment; confirm). */
static FORCEINLINE svec<4,int32_t> svec_load(const svec<4,int32_t> *p) {
  return *((__vector signed int *)p);
}

/** Store a 4-lane int32 vector to *p. */
static FORCEINLINE void svec_store(svec<4,int32_t> *p, svec<4,int32_t> v) {
  *((__vector signed int*)p) = v.v;
}
998 
/** Load a 4-lane uint32 vector from *p (direct vector dereference). */
static FORCEINLINE svec<4,uint32_t> svec_load(const svec<4,uint32_t> *p) {
  return *((__vector unsigned int *)p);
}

/** Store a 4-lane uint32 vector to *p. */
static FORCEINLINE void svec_store(svec<4,uint32_t> *p, svec<4,uint32_t> v) {
  *((__vector unsigned int*)p) = v.v;
}
1018 
/** Load a 4-lane int64 vector: two consecutive 16-byte registers from *p. */
static FORCEINLINE svec<4,int64_t> svec_load(const svec<4,int64_t> *p) {
  __vector signed long long v0 = *(((__vector signed long long *)p)+0);
  __vector signed long long v1 = *(((__vector signed long long *)p)+1);
  return svec<4,int64_t>(v0,v1);
}

/** Store a 4-lane int64 vector: two consecutive 16-byte registers to *p. */
static FORCEINLINE void svec_store(svec<4,int64_t> *p, svec<4,int64_t> v) {
  *(((__vector signed long long *)p)+0) = v.v[0];
  *(((__vector signed long long *)p)+1) = v.v[1];
}
1041 
/** Load a 4-lane uint64 vector: two consecutive 16-byte registers from *p. */
static FORCEINLINE svec<4,uint64_t> svec_load(const svec<4,uint64_t> *p) {
  __vector unsigned long long v0 = *(((__vector unsigned long long *)p)+0);
  __vector unsigned long long v1 = *(((__vector unsigned long long *)p)+1);
  return svec<4,uint64_t>(v0,v1);
}
/** Store a 4-lane uint64 vector: two consecutive 16-byte registers to *p. */
static FORCEINLINE void svec_store(svec<4,uint64_t> *p, svec<4,uint64_t> v) {
  *(((__vector unsigned long long *)p)+0) = v.v[0];
  *(((__vector unsigned long long *)p)+1) = v.v[1];
}
1063 
/** Load a 4-lane float vector from *p (direct vector dereference). */
static FORCEINLINE svec<4,float> svec_load(const svec<4,float> *p) {
  return *((__vector float *)p);
// return vec_ld(0, (__vector float*)p);
}

/** Store a 4-lane float vector to *p. */
static FORCEINLINE void svec_store(svec<4,float> *p, svec<4,float> v) {
  *((__vector float*)p) = v.v;
// vec_st(v.v, 0, (__vector float*)p);
}
1085 
/** Load a 4-lane double vector: two VSX loads (alignment-tolerant on VSX). */
static FORCEINLINE svec<4,double> svec_load(const svec<4,double> *p) {
// __vector double v0 = *(((__vector double *)p)+0);
// __vector double v1 = *(((__vector double *)p)+1);
  __vector double v0 = vec_vsx_ld(0, ((__vector double *)p));
  __vector double v1 = vec_vsx_ld(0, ((__vector double *)p)+1);
// __vector double v0 = vec_ld(0, ((__vector double *)p));
// __vector double v1 = vec_ld(0, ((__vector double *)p)+1);
  return svec<4,double>(v0,v1);
}

/** Store a 4-lane double vector: two VSX stores. */
static FORCEINLINE void svec_store(svec<4,double> *p, svec<4,double> v) {
// *(((__vector double *)p)+0) = v.v[0];
// *(((__vector double *)p)+1) = v.v[1];
  vec_vsx_st(v.v[0], 0, (__vector double *)p);
  vec_vsx_st(v.v[1], 0, (__vector double *)p + 1);
// vec_st(v.v[0], 0, (__vector double *)p);
// vec_st(v.v[1], 0, (__vector double *)p + 1);
}
1116 
1117 // 3. Select
1118 
1127  return vec_sel(b.v, a.v, mask.v);
1128 }
1129 
1135  __vector unsigned int tsi=vec_splat_s32(0);//{0,0,0,0};
1136  __vector unsigned char t = vec_pack(vec_pack(mask.v,tsi),(vector unsigned short)tsi);
1137  return vec_sel(b.v, a.v, t);
1138 }
1139 
1145  __vector unsigned int tsi=vec_splat_u32(0);//{0,0,0,0};
1146  __vector unsigned char t = vec_pack(vec_pack(mask.v,tsi),(vector unsigned short)tsi);
1147  return vec_sel(b.v, a.v, t);
1148 }
1149 
1155  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "select i16");
1156  int16_t v0 = mask[0] ? a[0] : b[0];
1157  int16_t v1 = mask[1] ? a[1] : b[1];
1158  int16_t v2 = mask[2] ? a[2] : b[2];
1159  int16_t v3 = mask[3] ? a[3] : b[3];
1160  return svec<4,int16_t>(v0, v1, v2, v3);
1161 }
1162 
1168  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "select u16");
1169  uint16_t v0 = mask[0] ? a[0] : b[0];
1170  uint16_t v1 = mask[1] ? a[1] : b[1];
1171  uint16_t v2 = mask[2] ? a[2] : b[2];
1172  uint16_t v3 = mask[3] ? a[3] : b[3];
1173  return svec<4,uint16_t>(v0, v1, v2, v3);
1174 }
1175 
1181  return vec_sel(b.v, a.v, mask.v);
1182 }
1183 
1189  return vec_sel(b.v, a.v, mask.v);
1190 }
1191 
1197 
1198 #ifdef __POWER8
1199  __vector signed long long t1 = vec_sel(b.v[0],a.v[0],vec_unpackh_p8(mask.v));
1200  __vector signed long long t2 = vec_sel(b.v[1],a.v[1],vec_unpackl_p8(mask.v));
1201  svec<4,int64_t> res2 = svec<4,int64_t>(t1,t2);
1202  return res2;
1203 #else
1204  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "select i64");
1205  int64_t v0 = mask[0] ? a[0] : b[0];
1206  int64_t v1 = mask[1] ? a[1] : b[1];
1207  int64_t v2 = mask[2] ? a[2] : b[2];
1208  int64_t v3 = mask[3] ? a[3] : b[3];
1209  return svec<4,int64_t>(v0,v1,v2,v3);
1210 #endif
1211 }
1212 
1218 
1219 #ifdef __POWER8
1220  __vector unsigned long long t1 = vec_sel(b.v[0],a.v[0],vec_unpackh_p8(mask.v));
1221  __vector unsigned long long t2 = vec_sel(b.v[1],a.v[1],vec_unpackl_p8(mask.v));
1222  svec<4,uint64_t> res2 = svec<4,uint64_t>(t1,t2);
1223  return res2;
1224 #else
1225  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "select u64");
1226  uint64_t v0 = mask[0] ? a[0] : b[0];
1227  uint64_t v1 = mask[1] ? a[1] : b[1];
1228  uint64_t v2 = mask[2] ? a[2] : b[2];
1229  uint64_t v3 = mask[3] ? a[3] : b[3];
1230  return svec<4,uint64_t>(v0,v1,v2,v3);
1231 #endif
1232 }
1233 
1239  return vec_sel(b.v, a.v, mask.v);
1240 }
1241 
1247 #ifdef __POWER8
1248  __vector double t1 = vec_sel(b.v[0],a.v[0],vec_unpackh_p8(mask.v));
1249  __vector double t2 = vec_sel(b.v[1],a.v[1],vec_unpackl_p8(mask.v));
1250  svec<4,double> res2 = svec<4,double>(t1,t2);
1251  return res2;
1252 #else
1253  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "select_double");
1254  double v0 = mask[0] ? a[0] : b[0];
1255  double v1 = mask[1] ? a[1] : b[1];
1256  double v2 = mask[2] ? a[2] : b[2];
1257  double v3 = mask[3] ? a[3] : b[3];
1258  return svec<4,double>(v0,v1,v2,v3);
1259 #endif
1260 }
1261 
1273 
1274 
1275 // 4. broadcast/rotate/shuffle/smear/setzero
1276 
1277 
1278 
1279 
/*
 * svec_broadcast for single-register types: when the lane index is a
 * compile-time constant in [0,3], a single splat instruction copies that
 * lane everywhere; otherwise fall back to scalar extract + 4-way construct.
 */
#define BROADCAST_OPT32(STYPE) \
  static FORCEINLINE svec<LANES,STYPE> svec_broadcast(svec<LANES,STYPE> v, const int index) { \
    if(__builtin_constant_p(index) && index >=0 && index < 4){ return svec<LANES,STYPE>(vec_splat_p7(v.v, index)); } \
    else { STYPE bval = v[index]; return svec<LANES,STYPE>(bval, bval, bval, bval); } \
  }

/*
 * Same for two-register 64-bit types: index>>1 picks the register,
 * index%2 the lane; the splatted register fills both halves.
 */
#define BROADCAST_OPT64(STYPE) \
  static FORCEINLINE svec<LANES,STYPE> svec_broadcast(svec<LANES,STYPE> v, const int index) { \
    if(__builtin_constant_p(index) && index >=0 && index < 4){ \
      __vector STYPE r = vec_splat_p7(v.v[index >> 1], index %2); \
      return svec<LANES,STYPE>(r, r); } \
    else { STYPE bval = v[index]; return svec<LANES,STYPE>(bval, bval, bval, bval); } \
  }
1293 
1294 
// Per-type instantiations. 8/16-bit element types use the generic L4
// fallbacks from gsimd_utility.h; 32/64-bit types get the splat-optimized
// broadcast variants defined above.
BROADCAST_L4(int8_t);
BROADCAST_L4(uint8_t);
BROADCAST_L4(int16_t);
BROADCAST_L4(uint16_t);
BROADCAST_OPT32(int32_t);
BROADCAST_OPT32(uint32_t);
BROADCAST_OPT64(int64_t);
BROADCAST_OPT64(uint64_t);
BROADCAST_OPT32(float);
BROADCAST_OPT64(double);

// Lane-rotate helpers (generic for all element types).
ROTATE_L4(int8_t);
ROTATE_L4(uint8_t);
ROTATE_L4(int16_t);
ROTATE_L4(uint16_t);
ROTATE_L4(int32_t);
ROTATE_L4(uint32_t);
ROTATE_L4(int64_t);
ROTATE_L4(uint64_t);
ROTATE_L4(float);
ROTATE_L4(double);

// Lane-shuffle helpers (generic for all element types).
SHUFFLES_L4(int8_t);
SHUFFLES_L4(uint8_t);
SHUFFLES_L4(int16_t);
SHUFFLES_L4(uint16_t);
SHUFFLES_L4(int32_t);
SHUFFLES_L4(uint32_t);
SHUFFLES_L4(int64_t);
SHUFFLES_L4(uint64_t);
SHUFFLES_L4(float);
SHUFFLES_L4(double);
1328 
1329 
1330 
1331 //load const and load and splats, need a template, other wise we cannot distinguish the LANES diff
1332 
// svec_load_const<RetVec>(p): load the single scalar *p and splat it to all
// 4 lanes. Templated on the return vector type because overload resolution
// alone cannot distinguish lane counts.

template <class RetVecType> static RetVecType svec_load_const(const int8_t* p);
template<>
FORCEINLINE svec<4,int8_t> svec_load_const<svec<4,int8_t> >(const int8_t* p) {
  return svec<4,int8_t>(p[0], p[0], p[0], p[0]);
}

template <class RetVecType> static RetVecType svec_load_const(const uint8_t* p);
template<>
FORCEINLINE svec<4,uint8_t> svec_load_const<svec<4,uint8_t> >(const uint8_t* p) {
  return svec<4,uint8_t>(p[0], p[0], p[0], p[0]);
}

template <class RetVecType> static RetVecType svec_load_const(const int16_t* p);
template<>
FORCEINLINE svec<4,int16_t> svec_load_const<svec<4,int16_t> >(const int16_t* p) {
  return svec<4,int16_t>(p[0], p[0], p[0], p[0]);
}

template <class RetVecType> static RetVecType svec_load_const(const uint16_t* p);
template<>
FORCEINLINE svec<4,uint16_t> svec_load_const<svec<4,uint16_t> >(const uint16_t* p) {
  return svec<4,uint16_t>(p[0], p[0], p[0], p[0]);
}

template <class RetVecType> static RetVecType svec_load_const(const int32_t* p);
template<>
FORCEINLINE svec<4,int32_t> svec_load_const<svec<4,int32_t> >(const int32_t* p) {
  return svec<4,int32_t>(p[0], p[0], p[0], p[0]);
}

template <class RetVecType> static RetVecType svec_load_const(const uint32_t* p);
template<>
FORCEINLINE svec<4,uint32_t> svec_load_const<svec<4,uint32_t> >(const uint32_t* p) {
  return svec<4,uint32_t>(p[0], p[0], p[0], p[0]);
}

template <class RetVecType> static RetVecType svec_load_const(const int64_t* p);
template<>
FORCEINLINE svec<4,int64_t> svec_load_const<svec<4,int64_t> >(const int64_t* p) {
  // Single load+splat of the doubleword into both halves.
  __vector signed long long t= vec_smear_const_i64_p7((const long long *)p);
  return svec<4,int64_t>(t,t);
}

template <class RetVecType> static RetVecType svec_load_const(const uint64_t* p);
template<>
FORCEINLINE svec<4,uint64_t> svec_load_const<svec<4,uint64_t> >(const uint64_t* p) {
  __vector unsigned long long t= vec_smear_const_i64_p7((const long long *)p);
  return svec<4,uint64_t>(t,t);
}

template <class RetVecType> static RetVecType svec_load_const(const float* p);
template<>
FORCEINLINE svec<4,float> svec_load_const<svec<4,float> >(const float* p) {
  //return vec_smear_const_float_p7((const __vector float *)p);
  // NOTE(review): dereferencing p as a 16-byte vector reads 12 bytes past
  // the scalar and presumably assumes alignment — confirm callers always
  // point into readable, aligned storage.
  return vec_splat(*(__vector float*)p, 0);
}

template <class RetVecType> static RetVecType svec_load_const(const double* p);
template<>
FORCEINLINE svec<4,double> svec_load_const<svec<4,double> >(const double* p) {
  __vector double t= vec_smear_const_double_p7(p);
  return svec<4,double>(t,t);
}
1396 
1397 //load and splat
1398 
// svec_load_and_splat<RetVec>(p): like svec_load_const but for non-constant
// addresses; 8/16-bit types fall back to scalar load + 4-way construct,
// 32/64-bit types use the platform splat helpers.

template <class RetVecType> static RetVecType svec_load_and_splat(int8_t* p);
template<>
FORCEINLINE svec<4,int8_t> svec_load_and_splat<svec<4,int8_t> >(int8_t* p) {
  INC_STATS_NAME(STATS_SMEAR_SLOW,1, "load_and_splat i8");
  int8_t v = *p;
  return svec<4,int8_t>(v,v,v,v);
}

template <class RetVecType> static RetVecType svec_load_and_splat(uint8_t* p);
template<>
FORCEINLINE svec<4,uint8_t> svec_load_and_splat<svec<4,uint8_t> >(uint8_t* p) {
  INC_STATS_NAME(STATS_SMEAR_SLOW,1,"load_and_splat u8");
  uint8_t v = *p;
  return svec<4,uint8_t>(v,v,v,v);
}

template <class RetVecType> static RetVecType svec_load_and_splat(int16_t* p);
template<>
FORCEINLINE svec<4,int16_t> svec_load_and_splat<svec<4,int16_t> >(int16_t* p) {
  INC_STATS_NAME(STATS_SMEAR_SLOW,1,"load_and_splat i16");
  int16_t v = *p;
  return svec<4,int16_t>(v,v,v,v);
}

template <class RetVecType> static RetVecType svec_load_and_splat(uint16_t* p);
template<>
FORCEINLINE svec<4,uint16_t> svec_load_and_splat<svec<4,uint16_t> >(uint16_t* p) {
  INC_STATS_NAME(STATS_SMEAR_SLOW,1,"load_and_splat u16");
  uint16_t v = *p;
  return svec<4,uint16_t>(v,v,v,v);
}

template <class RetVecType> static RetVecType svec_load_and_splat(int32_t* p);
template<>
FORCEINLINE svec<4,int32_t> svec_load_and_splat<svec<4,int32_t> >(int32_t* p) {
#ifdef __POWER8
  return vec_smear_i32_p8(p);
#else
  // Pre-P8: unaligned VSX load then splat lane 0.
  __vector signed int register x = vec_vsx_ld(0, p);
  return svec<4,int32_t>(vec_splat_p7(x,0));
#endif //__POWER8
}

template <class RetVecType> static RetVecType svec_load_and_splat(uint32_t* p);
template<>
FORCEINLINE svec<4,uint32_t> svec_load_and_splat<svec<4,uint32_t> >(uint32_t* p) {
#ifdef __POWER8
  return vec_smear_i32_p8(p);
#else
  __vector unsigned int register x = vec_vsx_ld(0, p);
  return svec<4,uint32_t>(vec_splat_p7((__vector signed)x,0));
#endif //__POWER8
}

template <class RetVecType> static RetVecType svec_load_and_splat(int64_t* p);
template<>
FORCEINLINE svec<4,int64_t> svec_load_and_splat<svec<4,int64_t> >(int64_t* p) {
  __vector signed long long r = vec_smear_i64_p7((signed long long*)p);
  return svec<4,int64_t>(r,r);
}

template <class RetVecType> static RetVecType svec_load_and_splat(uint64_t* p);
template<>
FORCEINLINE svec<4,uint64_t> svec_load_and_splat<svec<4,uint64_t> >(uint64_t* p) {
  __vector unsigned long long r = vec_smear_i64_p7((unsigned long long*)p);
  return svec<4,uint64_t>(r,r);
}

template <class RetVecType> static RetVecType svec_load_and_splat(float* p);
template<>
FORCEINLINE svec<4,float> svec_load_and_splat<svec<4,float> >(float* p) {
#ifdef __POWER8
  return vec_smear_float_p8(p);
#else
  __vector float register x = vec_vsx_ld(0, p);
  return svec<4,float>(vec_splat_p7(x, 0));
#endif //__POWER8
}

template <class RetVecType> static RetVecType svec_load_and_splat(double* p);
template<>
FORCEINLINE svec<4,double> svec_load_and_splat<svec<4,double> >(double* p) {
  __vector double t= vec_smear_double_p7(p);
  return svec<4,double>(t,t);
}
1484 
1485 
1486 // 5. Gather / Scatter
#ifdef __PPC64__
/**
 * Pointer vector: on 64-bit PPC each pointer lane is a uint64 lane,
 * so all uint64 operations (gather addressing, etc.) apply directly.
 */
template <>
struct svec<4,void*> : public svec<4,uint64_t>{
  /** Build from four raw pointers. */
  FORCEINLINE svec(void* p0, void* p1, void* p2, void* p3):
    svec<4,uint64_t>((uint64_t)(p0),(uint64_t)(p1),(uint64_t)(p2),(uint64_t)(p3)){}
};
#else // 32-bit
/** 32-bit build: pointer lanes are uint32 lanes. */
template <>
struct svec<4,void*> : public svec<4,uint32_t>{
  /** Build from four raw pointers. */
  FORCEINLINE svec(void* p0, void* p1, void* p2, void* p3):
    svec<4,uint32_t>((uint32_t)(p0),(uint32_t)(p1),(uint32_t)(p2),(uint32_t)(p3)){}
};
#endif // __PPC64__
1520 
1521 #ifndef DOXYGEN_SHOULD_SKIP_THIS //not want generate svec_gather*/svec_scatter methods
1522 
// svec_gather<RetVec>(ptrs, mask): load one element per lane from 4 absolute
// addresses (32- or 64-bit), skipping lanes whose mask is false. Most
// combinations come from the generic GATHER_GENERAL_L4 macro; the 64-bit
// address forms that the macro cannot express are specialized by hand below.

template <class RetVecType> static RetVecType svec_gather(svec<4,uint32_t> ptrs, svec<4,bool> mask);
template <class RetVecType> static RetVecType svec_gather(svec<4,uint64_t> ptrs, svec<4,bool> mask);

//There is a fast impl for gather addr64 on i8/u8 types
//But it is commented out. So I didn't move the code to here
//Please see vsx4.h __gather64_i8
GATHER_GENERAL_L4(int8_t, uint32_t);
GATHER_GENERAL_L4(int8_t, uint64_t);
GATHER_GENERAL_L4(uint8_t, uint32_t);
GATHER_GENERAL_L4(uint8_t, uint64_t);
GATHER_GENERAL_L4(int16_t, uint32_t);
GATHER_GENERAL_L4(int16_t, uint64_t);
GATHER_GENERAL_L4(uint16_t, uint32_t);
GATHER_GENERAL_L4(uint16_t, uint64_t);
GATHER_GENERAL_L4(int32_t, uint32_t);

//GATHER_GENERAL_L4(int32_t, uin64_t);
// Hand-written equivalent of the macro for int32 lanes / 64-bit addresses.
template<>
FORCEINLINE svec<4,int32_t> svec_gather<svec<4,int32_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
  typedef svec<4,int32_t> RetVec;
  return lGatherGeneral<RetVec,int32_t,svec<4,uint64_t>,svec<4,bool> >(ptrs,mask);
}

GATHER_GENERAL_L4(uint32_t, uint32_t);

//GATHER_GENERAL_L4(uint32_t, uint64_t);
template<>
FORCEINLINE svec<4,uint32_t> svec_gather<svec<4,uint32_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
  typedef svec<4,uint32_t> RetVec;
  return lGatherGeneral<RetVec,uint32_t,svec<4,uint64_t>,svec<4,bool> >(ptrs,mask);
}




GATHER_GENERAL_L4(int64_t, uint32_t);

//GATHER_GENERAL_L4(int64_t, uint64_t);
template<>
FORCEINLINE svec<4,int64_t> svec_gather<svec<4,int64_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
  typedef svec<4,int64_t> RetVec;
  return lGatherGeneral<RetVec,int64_t, svec<4,uint64_t>,svec<4,bool> >(ptrs,mask);
}

GATHER_GENERAL_L4(uint64_t, uint32_t);

//GATHER_GENERAL_L4(uint64_t, uint64_t);
template<>
FORCEINLINE svec<4,uint64_t> svec_gather<svec<4,uint64_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
  typedef svec<4,uint64_t> RetVec;
  return lGatherGeneral<RetVec,uint64_t, svec<4,uint64_t>,svec<4,bool> >(ptrs,mask);
}


GATHER_GENERAL_L4(float, uint32_t);

//GATHER_GENERAL_L4(float, uint64_t);
template<>
FORCEINLINE svec<4,float> svec_gather<svec<4,float> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
  typedef svec<4,float> RetVec;
  return lGatherGeneral<RetVec,float,svec<4,uint64_t>,svec<4,bool> >(ptrs,mask);
}

GATHER_GENERAL_L4(double, uint32_t);
GATHER_GENERAL_L4(double, uint64_t);
1588 
//Utility functions for gathers addressed as base pointer + offsets
1590 
1591 
1593 #ifdef __POWER8
1594 
1595 // Gather 32 bit data with 32 bit offset
1596 template<typename RetVec, typename RetScalar, typename OFF, typename MSK>
1597 static FORCEINLINE RetVec
1598 lGatherBaseOffsets32_32P8(unsigned char *p, uint32_t scale,
1599  OFF offsets, MSK mask) {
1600  RetScalar r[4];
1601  OFF vzero(0,0,0,0);
1602  //if mask is not set we still read from p+0 to avoid the if
1603  offsets = svec_select(mask, offsets, vzero);
1604  int offset;
1605  RetScalar *ptr;
1606  //extract individual offsets
1607  uint64_t doff1 = vec_extract_l(offsets.v);
1608  uint64_t doff2 = vec_extract_r(offsets.v);
1609  //split them in two
1610  uint32_t o1=(uint32_t) doff1;
1611  uint32_t o0=(uint32_t)(doff1 >> 32);
1612  uint32_t o3=(uint32_t) doff2;
1613  uint32_t o2=(uint32_t)(doff2 >> 32);
1614 #ifdef CORRECTNESS_CHECK
1615  if(o0 != offsets[0] ||
1616  o1 != offsets[1] ||
1617  o2 != offsets[2] ||
1618  o3 != offsets[3]) {
1619  printf("Error while extracting for gather\n");
1620  }
1621 #endif
1622  return vec_gather_p8((RetScalar*)(p + (scale*o0)),
1623  (RetScalar*)(p+(scale*o1)),
1624  (RetScalar*)(p+(scale*o2)),
1625  (RetScalar*)(p+(scale*o3)) );
1626 }
1627 
1628 // Gather 64 bit data with 32 bit offset
1629 template<typename RetVec, typename RetScalar, typename OFF, typename MSK>
1630 static FORCEINLINE RetVec
1631 lGatherBaseOffsets32_64P8(unsigned char *p, uint32_t scale,
1632  OFF offsets, MSK mask) {
1633  RetScalar r[4];
1634  OFF vzero(0,0,0,0);
1635  //if mask is not set we still read from p+0 to avoid the if
1636  offsets = svec_select(mask, offsets, vzero);
1637  int offset;
1638  RetScalar *ptr;
1639  //extract individual offsets
1640  uint64_t doff1 = vec_extract_l(offsets.v);
1641  uint64_t doff2 = vec_extract_r(offsets.v);
1642  //split them in two
1643  uint32_t o1=(uint32_t) doff1;
1644  uint32_t o0=(uint32_t)(doff1 >> 32);
1645  uint32_t o3=(uint32_t) doff2;
1646  uint32_t o2=(uint32_t)(doff2 >> 32);
1647 #ifdef CORRECTNESS_CHECK
1648  if(o0 != offsets[0] ||
1649  o1 != offsets[1] ||
1650  o2 != offsets[2] ||
1651  o3 != offsets[3]) {
1652  printf("Error while extracting for gather\n");
1653  }
1654 #endif
1655  return RetVec(vec_gather_p8((RetScalar*)(p + (scale*o0)),
1656  (RetScalar*)(p+(scale*o1))) ,
1657  vec_gather_p8((RetScalar*)(p+(scale*o2)),
1658  (RetScalar*)(p+(scale*o3))) );
1659 }
1660 
1661 
1662 // Gather 32 bit data with 64 bit offset
1663 template<typename RetVec, typename RetScalar, typename OFF, typename MSK>
1664 static FORCEINLINE RetVec
1665 lGatherBaseOffsets64_32P8(unsigned char *p, uint32_t scale,
1666  OFF offsets, MSK mask) {
1667  RetScalar r[4];
1668  OFF vzero(0,0,0,0);
1669  //if mask is not set we still read from p+0 to avoid the if
1670  offsets = svec_select(mask, offsets, vzero);
1671  int offset;
1672  RetScalar *ptr;
1673  //extract individual offsets
1674  uint64_t o0 = vec_extract_l(offsets.v[0]);
1675  uint64_t o1 = vec_extract_r(offsets.v[0]);
1676  uint64_t o2 = vec_extract_l(offsets.v[1]);
1677  uint64_t o3 = vec_extract_r(offsets.v[1]);
1678 
1679 #ifdef CORRECTNESS_CHECK
1680  if(o0 != offsets[0] ||
1681  o1 != offsets[1] ||
1682  o2 != offsets[2] ||
1683  o3 != offsets[3]) {
1684  printf("Error while extracting for gather\n");
1685  }
1686 #endif
1687  return vec_gather_p8((RetScalar*)(p+(scale*o0)),
1688  (RetScalar*)(p+(scale*o1)),
1689  (RetScalar*)(p+(scale*o2)),
1690  (RetScalar*)(p+(scale*o3)) );
1691 }
1692 
1693 // Gather 64 bit data with 64 bit offset
1694 template<typename RetVec, typename RetScalar, typename OFF, typename MSK>
1695 static FORCEINLINE RetVec
1696 lGatherBaseOffsets64_64P8(unsigned char *p, uint32_t scale,
1697  OFF offsets, MSK mask) {
1698  RetScalar r[4];
1699  OFF vzero(0,0,0,0);
1700  //if mask is not set we still read from p+0 to avoid the if
1701  offsets = svec_select(mask.v, offsets, vzero);
1702  int offset;
1703  RetScalar *ptr;
1704  //extract individual offsets
1705  uint64_t o0 = vec_extract_l(offsets.v[0]);
1706  uint64_t o1 = vec_extract_r(offsets.v[0]);
1707  uint64_t o2 = vec_extract_l(offsets.v[1]);
1708  uint64_t o3 = vec_extract_r(offsets.v[1]);
1709 
1710 #ifdef CORRECTNESS_CHECK
1711  if(o0 != offsets[0] ||
1712  o1 != offsets[1] ||
1713  o2 != offsets[2] ||
1714  o3 != offsets[3]) {
1715  printf("Error while extracting for gather\n");
1716  }
1717 #endif
1718  return RetVec(vec_gather_p8((RetScalar*)(p + (scale*o0)),
1719  (RetScalar*)(p+(scale*o1))) ,
1720  vec_gather_p8((RetScalar*)(p+(scale*o2)),
1721  (RetScalar*)(p+(scale*o3))) );
1722 }
1723 
1725 #endif //endif __POWER8
1726 
1727 
// --- svec_gather_base_offsets: masked gather from base + scale*offset -----
// GATHER_BASE_OFFSETS_L4(T, OFF_T) instantiates the generic version for
// 8/16-bit element types; 32/64-bit elements are hand-written below to use
// the POWER8 fast paths.
GATHER_BASE_OFFSETS_L4(int8_t, int32_t);
GATHER_BASE_OFFSETS_L4(int8_t, int64_t);
GATHER_BASE_OFFSETS_L4(uint8_t, int32_t);
GATHER_BASE_OFFSETS_L4(uint8_t, int64_t);
GATHER_BASE_OFFSETS_L4(int16_t, int32_t);
GATHER_BASE_OFFSETS_L4(int16_t, int64_t);
GATHER_BASE_OFFSETS_L4(uint16_t, int32_t);
GATHER_BASE_OFFSETS_L4(uint16_t, int64_t);
1736 
1737 //GATHER_BASE_OFFSETS_L4(int32_t, int32_t);
1738 static FORCEINLINE svec<4,int32_t>
1739 svec_gather_base_offsets(int32_t *b, uint32_t scale, svec<4,int32_t> offsets, svec<4,bool> mask){
1740  #ifdef __POWER8
1741  return lGatherBaseOffsets32_32P8<svec<4,int32_t>,int32_t,svec<4,int32_t>,svec<4,bool> >((uint8_t*)b,scale,offsets,mask);
1742  #else
1743  return lGatherBaseOffsets<svec<4,int32_t>, int32_t, svec<4,int32_t>,svec<4,bool> >((uint8_t*)b,scale,offsets,mask);
1744  #endif
1745 }
1746 
1747 //GATHER_BASE_OFFSETS_L4(int32_t, int64_t);
1748 static FORCEINLINE svec<4,int32_t>
1749 svec_gather_base_offsets(int32_t* b, uint32_t scale, svec<4,int64_t> offsets, svec<4,bool> mask){
1750  uint8_t *p = (uint8_t*)b;
1751  typedef svec<4,int32_t> RetVec;
1752  #ifdef __POWER8
1753  RetVec r1=lGatherBaseOffsets64_32P8<svec<4,int32_t>,int32_t,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
1754  return r1;
1755  #else
1756  return lGatherBaseOffsets<svec<4,int32_t>, int32_t,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
1757  #endif
1758 }
1759 
1760 //GATHER_BASE_OFFSETS_L4(uint32_t, int32_t);
1761 static FORCEINLINE svec<4,uint32_t>
1762 svec_gather_base_offsets(uint32_t *b, uint32_t scale, svec<4,int32_t> offsets, svec<4,bool> mask){
1763  #ifdef __POWER8
1764  return lGatherBaseOffsets32_32P8<svec<4,uint32_t>,uint32_t,svec<4,int32_t>,svec<4,bool> >((uint8_t*)b,scale,offsets,mask);
1765  #else
1766  return lGatherBaseOffsets<svec<4,uint32_t>, uint32_t, svec<4,int32_t>,svec<4,bool> >((uint8_t*)b,scale,offsets,mask);
1767  #endif
1768 }
1769 
1770 //GATHER_BASE_OFFSETS_L4(uint32_t, int64_t);
1771 static FORCEINLINE svec<4,uint32_t>
1772 svec_gather_base_offsets(uint32_t* b, uint32_t scale, svec<4,int64_t> offsets, svec<4,bool> mask){
1773  uint8_t *p = (uint8_t*)b;
1774  typedef svec<4,uint32_t> RetVec;
1775  #ifdef __POWER8
1776  RetVec r1=lGatherBaseOffsets64_32P8<svec<4,uint32_t>,uint32_t,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
1777  return r1;
1778  #else
1779  return lGatherBaseOffsets<svec<4,uint32_t>, uint32_t,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
1780  #endif
1781 }
1782 
1783 //GATHER_BASE_OFFSETS_L4(int64_t, int32_t);
1784 static FORCEINLINE svec<4,int64_t>
1785 svec_gather_base_offsets(int64_t *b, uint32_t scale, svec<4,int32_t> offsets,svec<4,bool> mask){
1786  uint8_t *p = (uint8_t *)b;
1787  typedef svec<4,int64_t> RetVec;
1788  #ifdef __POWER8
1789  svec<4,int64_t> r2 = lGatherBaseOffsets32_64P8<RetVec,int64_t,svec<4,int32_t>,svec<4,bool> >(p,scale,offsets,mask);
1790  return r2;
1791  #else
1792  return lGatherBaseOffsets<RetVec, int64_t, svec<4,int32_t>,svec<4,bool> >((uint8_t*)p,scale,offsets,mask);
1793  #endif
1794 }
1795 
GATHER_BASE_OFFSETS_L4(int64_t, int64_t);

//GATHER_BASE_OFFSETS_L4(uint64_t, int32_t);
// Masked gather of four uint64 values from b + scale*offsets[i].
// POWER8 path builds the result from two doubleword-half gathers.
static FORCEINLINE svec<4,uint64_t>
svec_gather_base_offsets(uint64_t *b, uint32_t scale, svec<4,int32_t> offsets,svec<4,bool> mask){
  uint8_t *p = (uint8_t *)b;
  typedef svec<4,uint64_t> RetVec;
  #ifdef __POWER8
  svec<4,uint64_t> r2 = lGatherBaseOffsets32_64P8<RetVec,uint64_t,svec<4,int32_t>,svec<4,bool> >(p,scale,offsets,mask);
  return r2;
  #else
  return lGatherBaseOffsets<svec<4,uint64_t>,uint64_t, svec<4,int32_t>,svec<4,bool> >((uint8_t*)p,scale,offsets,mask);
  #endif
}

GATHER_BASE_OFFSETS_L4(uint64_t, int64_t);
1812 
//GATHER_BASE_OFFSETS_L4(float, int32_t);
// Masked gather of four floats from b + scale*offsets[i] (32-bit offsets).
static FORCEINLINE svec<4,float>
svec_gather_base_offsets(float *b, uint32_t scale, svec<4,int32_t> offsets, svec<4,bool> mask){
  uint8_t *p = (uint8_t*)b;
  #ifdef __POWER8
  return lGatherBaseOffsets32_32P8<svec<4,float>,float,svec<4,int32_t>,svec<4,bool> >(p,scale,offsets,mask);
  #else
  return lGatherBaseOffsets<svec<4,float>,float, svec<4,int32_t>,svec<4,bool> >((uint8_t*)p,scale,offsets,mask);
  #endif
}

//GATHER_BASE_OFFSETS_L4(float, int64_t);
// Masked gather of four floats with 64-bit offsets.
static FORCEINLINE svec<4,float>
svec_gather_base_offsets(float* b, uint32_t scale, svec<4,int64_t> offsets, svec<4,bool> mask){
  uint8_t *p = (uint8_t*)b;
  #ifdef __POWER8
  typedef svec<4,float> RetVec;
  RetVec r1=lGatherBaseOffsets64_32P8<RetVec,float,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
  return r1;
  #else
  return lGatherBaseOffsets<svec<4,float>,float,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
  #endif
}
1836 
1837 
//GATHER_BASE_OFFSETS_L4(double, int32_t);
// Masked gather of four doubles (32-bit offsets); 64-bit data is assembled
// from two doubleword-half gathers on POWER8.
static FORCEINLINE svec<4,double>
svec_gather_base_offsets(double* b, uint32_t scale, svec<4,int32_t> offsets, svec<4,bool> mask){
  typedef svec<4,double> RetVec;
  uint8_t* p = (uint8_t*)b;
  #ifdef __POWER8
  svec<4,double> r2 = lGatherBaseOffsets32_64P8<RetVec,double,svec<4,int32_t>,svec<4,bool> >(p,scale,offsets,mask);
  return r2;
  #else
  return lGatherBaseOffsets<svec<4,double>,double,svec<4,int32_t>,svec<4,bool> >(p,scale,offsets,mask);
  #endif
}

//SCATTER-style pair: gather of four doubles with 64-bit offsets.
//GATHER_BASE_OFFSETS_L4(double,int64_t);
static FORCEINLINE svec<4,double>
svec_gather_base_offsets(double* b, uint32_t scale, svec<4,int64_t> offsets, svec<4,bool> mask){
  uint8_t *p = (uint8_t*)b;
  typedef svec<4,double> RetVec;
  #ifdef __POWER8
  RetVec r1=lGatherBaseOffsets64_64P8<RetVec,double,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
  return r1;
  #else
  return lGatherBaseOffsets<svec<4,double>, double, svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
  #endif
}
1863 
1864 #ifdef __POWER8
1865 
// Masked scatter of four 32-bit values (VTYPE) to a vector of absolute
// 64-bit addresses (PTRTYPE); STYPE is the scalar element type.
// POWER8-only helper: pointers and mask lanes are extracted with the
// doubleword extract intrinsics, then each active lane is stored with a
// vec_scatter_step_* intrinsic.
template<typename STYPE, typename PTRTYPE, typename VTYPE>
static FORCEINLINE void lScatter64_32(PTRTYPE ptrs,
                                      VTYPE val, svec<4,bool> mask) {

  // 64-bit pointers occupy two vector registers; extract each lane.
  uint64_t p0 = vec_extract_l(ptrs.v[0]);
  uint64_t p1 = vec_extract_r(ptrs.v[0]);
  uint64_t p2 = vec_extract_l(ptrs.v[1]);
  uint64_t p3 = vec_extract_r(ptrs.v[1]);

  // Extract the mask as two 64-bit halves...
  uint64_t doff1 = vec_extract_l(mask.v);
  uint64_t doff2 = vec_extract_r(mask.v);
  // ...and split them into the four 32-bit lane masks.
  uint32_t m1=(uint32_t) doff1;
  uint32_t m0=(uint32_t)(doff1 >> 32);
  uint32_t m3=(uint32_t) doff2;
  uint32_t m2=(uint32_t)(doff2 >> 32);

  // Debug-only extraction check, kept for reference:
  /*
  if(p0 != __extract_element(ptrs,0) ||
  p1 != __extract_element(ptrs,1) ||
  p2 != __extract_element(ptrs,2) ||
  p3 != __extract_element(ptrs,3)) {
  printf("Error while extracting ptrs for scatter\n");
  }

  if(m0 != __extract_element(mask,0) ||
  m1 != __extract_element(mask,1) ||
  m2 != __extract_element(mask,2) ||
  m3 != __extract_element(mask,3)) {
  printf("Error while extracting mask for scatter\n");
  }
  */

  // NOTE(review): lane 0 pairs with step_12, lane 1 with step_0, etc.
  // The same pairing appears in lScatterBaseOffsets32_32, so it is
  // presumably the byte-offset convention of the vec_scatter_step_*
  // intrinsics — confirm against their definitions.
  if(m0)
    vec_scatter_step_12((STYPE*)p0, val.v);
  if(m1)
    vec_scatter_step_0((STYPE*)p1, val.v);
  if(m2)
    vec_scatter_step_4((STYPE*)p2, val.v);
  if(m3)
    vec_scatter_step_8((STYPE*)p3, val.v);
}
1910 #endif
1911 
// --- svec_gather_stride: gather elements separated by a constant stride ---
GATHER_STRIDE_L4(int8_t, int32_t);
GATHER_STRIDE_L4(int8_t, int64_t);
GATHER_STRIDE_L4(uint8_t, int32_t);
GATHER_STRIDE_L4(uint8_t, int64_t);
GATHER_STRIDE_L4(int16_t, int32_t);
GATHER_STRIDE_L4(int16_t, int64_t);
GATHER_STRIDE_L4(uint16_t, int32_t);
GATHER_STRIDE_L4(uint16_t, int64_t);
GATHER_STRIDE_L4(int32_t, int32_t);
GATHER_STRIDE_L4(int32_t, int64_t);
GATHER_STRIDE_L4(uint32_t, int32_t);
GATHER_STRIDE_L4(uint32_t, int64_t);
GATHER_STRIDE_L4(int64_t, int32_t);
GATHER_STRIDE_L4(int64_t, int64_t);
GATHER_STRIDE_L4(uint64_t, int32_t);
GATHER_STRIDE_L4(uint64_t, int64_t);
GATHER_STRIDE_L4(float, int32_t);
GATHER_STRIDE_L4(float, int64_t);
GATHER_STRIDE_L4(double, int32_t);

// Hand-written vec_mergeh-based alternative for double, kept for reference:
//FORCEINLINE svec<4,double> svec_gather_STRIDE(double* b, int32_t step) {
// __vector double v0 = vec_splats(*b);
// b += step;
// __vector double v1 = vec_splats(*b);
// __vector double v01 = vec_mergeh(v0, v1);
// b += step;
// __vector double v2 = vec_splats(*b);
// b += step;
// __vector double v3 = vec_splats(*b);
// __vector double v23 = vec_mergeh(v2, v3);
// return svec<4,double>(v01, v23);
//}
GATHER_STRIDE_L4(double, int64_t);
1945 
1946 
1947 
1948 
// --- svec_scatter: masked scatter to a vector of absolute addresses -------
// SCATTER_GENERAL_L4(T, PTR_T) instantiates the generic version; the
// (32-bit data, 64-bit pointer) combinations below are hand-written so the
// POWER8 build can use the specialized lScatter64_32 path.
SCATTER_GENERAL_L4(int8_t, uint32_t);
SCATTER_GENERAL_L4(int8_t, uint64_t);
SCATTER_GENERAL_L4(uint8_t, uint32_t);
SCATTER_GENERAL_L4(uint8_t, uint64_t);
SCATTER_GENERAL_L4(int16_t, uint32_t);
SCATTER_GENERAL_L4(int16_t, uint64_t);
SCATTER_GENERAL_L4(uint16_t, uint32_t);
SCATTER_GENERAL_L4(uint16_t, uint64_t);
SCATTER_GENERAL_L4(int32_t, uint32_t);

//SCATTER_GENERAL_L4(int32_t, uint64_t);
// Scatter int32 values through 64-bit pointers.
static FORCEINLINE void svec_scatter(svec<4,uint64_t> ptrs, svec<4,int32_t> val, svec<4,bool> mask) {
  #ifdef __POWER8
  lScatter64_32<int32_t, svec<4,uint64_t>, svec<4,int32_t> >(ptrs,val,mask);
  #else
  lScatterGeneral<int32_t, svec<4,uint64_t>, svec<4,int32_t>, svec<4,bool> >(ptrs,val,mask);
  #endif
}

SCATTER_GENERAL_L4(uint32_t, uint32_t);

//SCATTER_GENERAL_L4(uint32_t, uint64_t);
// Scatter uint32 values through 64-bit pointers.
static FORCEINLINE void svec_scatter(svec<4,uint64_t> ptrs, svec<4,uint32_t> val, svec<4,bool> mask) {
  #ifdef __POWER8
  lScatter64_32<uint32_t, svec<4,uint64_t>, svec<4,uint32_t> >(ptrs,val,mask);
  #else
  lScatterGeneral<uint32_t, svec<4,uint64_t>, svec<4,uint32_t>, svec<4,bool> >(ptrs,val,mask);
  #endif
}

SCATTER_GENERAL_L4(int64_t, uint32_t);
SCATTER_GENERAL_L4(int64_t, uint64_t);
SCATTER_GENERAL_L4(uint64_t, uint32_t);
SCATTER_GENERAL_L4(uint64_t, uint64_t);
SCATTER_GENERAL_L4(float, uint32_t);

//SCATTER_GENERAL_L4(float, uint64_t);
// Scatter float values through 64-bit pointers.
static FORCEINLINE void svec_scatter (svec<4,uint64_t> ptrs,svec<4,float> val,svec<4,bool> mask) {
  #ifdef __POWER8
  lScatter64_32<float, svec<4,uint64_t>, svec<4,float> >(ptrs,val,mask);
  #else
  lScatterGeneral<float, svec<4,uint64_t>, svec<4,float>, svec<4,bool> >(ptrs,val,mask);
  #endif
}

SCATTER_GENERAL_L4(double, uint32_t);
SCATTER_GENERAL_L4(double, uint64_t);
1996 
1997 #ifdef __POWER8
// Masked scatter of 32-bit data using 32-bit offsets (POWER8 fast path).
// Stores val[i] to b + scale*offsets[i] for each lane whose mask is set.
template<typename STYPE, typename OTYPE, typename VTYPE>
static FORCEINLINE void lScatterBaseOffsets32_32(unsigned char *b,
                                                 uint32_t scale, OTYPE offsets,
                                                 VTYPE val, svec<4,bool> mask) {
  //data is 32; offset is 32
  unsigned char *base = b;
  // Extract offsets as two 64-bit halves...
  uint64_t doff1 = vec_extract_l(offsets.v);
  uint64_t doff2 = vec_extract_r(offsets.v);
  // ...and split them into the four 32-bit lane offsets.
  uint32_t o1=(uint32_t) doff1;
  uint32_t o0=(uint32_t)(doff1 >> 32);
  uint32_t o3=(uint32_t) doff2;
  uint32_t o2=(uint32_t)(doff2 >> 32);

  // Extract the per-lane mask the same way.
  doff1 = vec_extract_l(mask.v);
  doff2 = vec_extract_r(mask.v);
  uint32_t m1=(uint32_t) doff1;
  uint32_t m0=(uint32_t)(doff1 >> 32);
  uint32_t m3=(uint32_t) doff2;
  uint32_t m2=(uint32_t)(doff2 >> 32);

  // Debug-only extraction check, kept for reference:
  /*
  if(o0 != __extract_element(offsets,0) ||
  o1 != __extract_element(offsets,1) ||
  o2 != __extract_element(offsets,2) ||
  o3 != __extract_element(offsets,3)) {
  printf("Error while extracting offsets for scatter\n");
  }

  if(m0 != __extract_element(mask,0) ||
  m1 != __extract_element(mask,1) ||
  m2 != __extract_element(mask,2) ||
  m3 != __extract_element(mask,3)) {
  printf("Error while extracting mask for scatter\n");
  }
  */

  STYPE *ptr0 = (STYPE *)(base + scale * o0);
  STYPE *ptr1 = (STYPE *)(base + scale * o1);
  STYPE *ptr2 = (STYPE *)(base + scale * o2);
  STYPE *ptr3 = (STYPE *)(base + scale * o3);

  // NOTE(review): lane-to-step pairing (0->12, 1->0, 2->4, 3->8)
  // presumably follows the vec_scatter_step_* byte-offset convention —
  // confirm against the intrinsic definitions.
  if(m0)
    vec_scatter_step_12(ptr0, val.v);
  if(m1)
    vec_scatter_step_0(ptr1, val.v);
  if(m2)
    vec_scatter_step_4(ptr2, val.v);
  if(m3)
    vec_scatter_step_8(ptr3, val.v);
}
2053 
2054 
// Masked scatter of 32-bit data using 64-bit offsets (POWER8 fast path).
// Stores val[i] to b + scale*offsets[i] for each lane whose mask is set.
template<typename STYPE, typename OTYPE, typename VTYPE>
static FORCEINLINE void lScatterBaseOffsets64_32(unsigned char *b,
                                                 uint32_t scale, OTYPE offsets,
                                                 VTYPE val, svec<4,bool> mask) {
  //data is 32; offset is 64
  unsigned char *base = b;

  // 64-bit offsets occupy two vector registers; extract each lane.
  uint64_t o0 = vec_extract_l(offsets.v[0]);
  uint64_t o1 = vec_extract_r(offsets.v[0]);
  uint64_t o2 = vec_extract_l(offsets.v[1]);
  uint64_t o3 = vec_extract_r(offsets.v[1]);

  // Extract the mask as two 64-bit halves...
  uint64_t doff1 = vec_extract_l(mask.v);
  uint64_t doff2 = vec_extract_r(mask.v);
  // ...and split them into the four 32-bit lane masks.
  uint32_t m1=(uint32_t) doff1;
  uint32_t m0=(uint32_t)(doff1 >> 32);
  uint32_t m3=(uint32_t) doff2;
  uint32_t m2=(uint32_t)(doff2 >> 32);

  // Debug-only extraction check, kept for reference:
  /*
  if(o0 != __extract_element(offsets,0) ||
  o1 != __extract_element(offsets,1) ||
  o2 != __extract_element(offsets,2) ||
  o3 != __extract_element(offsets,3)) {
  printf("Error while extracting offsets for scatter\n");
  }

  if(m0 != __extract_element(mask,0) ||
  m1 != __extract_element(mask,1) ||
  m2 != __extract_element(mask,2) ||
  m3 != __extract_element(mask,3)) {
  printf("Error while extracting mask for scatter\n");
  }
  */

  STYPE *ptr0 = (STYPE *)(base + scale * o0);
  STYPE *ptr1 = (STYPE *)(base + scale * o1);
  STYPE *ptr2 = (STYPE *)(base + scale * o2);
  STYPE *ptr3 = (STYPE *)(base + scale * o3);

  // Same lane-to-step pairing as lScatterBaseOffsets32_32.
  if(m0)
    vec_scatter_step_12(ptr0, val.v);
  if(m1)
    vec_scatter_step_0(ptr1, val.v);
  if(m2)
    vec_scatter_step_4(ptr2, val.v);
  if(m3)
    vec_scatter_step_8(ptr3, val.v);
}
2107 #endif
2108 
2109 
2110 
// --- svec_scatter_base_offsets: masked scatter to base + scale*offset -----
// 8/16-bit element types use the generic macro; 32-bit elements are
// hand-written below to use the POWER8 fast paths.
SCATTER_BASE_OFFSETS_L4(int8_t, int32_t);
SCATTER_BASE_OFFSETS_L4(int8_t, int64_t);
SCATTER_BASE_OFFSETS_L4(uint8_t, int32_t);
SCATTER_BASE_OFFSETS_L4(uint8_t, int64_t);
SCATTER_BASE_OFFSETS_L4(int16_t, int32_t);
SCATTER_BASE_OFFSETS_L4(int16_t, int64_t);
SCATTER_BASE_OFFSETS_L4(uint16_t, int32_t);
SCATTER_BASE_OFFSETS_L4(uint16_t, int64_t);
2119 
2120 //SCATTER_BASE_OFFSETS_L4(int32_t, int32_t);
2121 static FORCEINLINE void
2122 svec_scatter_base_offsets(int32_t* p, uint32_t scale, svec<4,int32_t> offsets,
2123  svec<4,int32_t> val, svec<4,bool> mask){
2124  uint8_t* b = (uint8_t*) p;
2125  #ifdef __POWER8
2126  lScatterBaseOffsets32_32<int32_t, svec<4,int32_t>, svec<4,int32_t> >(b,scale,offsets,val,mask);
2127  #else
2128  lScatterBaseOffsets<int32_t, svec<4,int32_t>, svec<4,int32_t> >(b,scale,offsets,val,mask);
2129  #endif
2130 }
2131 
2132 
2133 //SCATTER_BASE_OFFSETS_L4(int32_t, int64_t);
2134 static FORCEINLINE void
2135 svec_scatter_base_offsets(int32_t* p, uint32_t scale, svec<4,int64_t> offsets,
2136  svec<4,int32_t> val, svec<4,bool> mask){
2137  uint8_t* b = (uint8_t*) p;
2138  #ifdef __POWER8
2139  lScatterBaseOffsets64_32<int32_t, svec<4,int64_t>, svec<4,int32_t> >(b,scale,offsets,val,mask);
2140  #else
2141  lScatterBaseOffsets<int32_t,svec<4,int64_t>, svec<4,int32_t> >(b,scale,offsets,val,mask);
2142  #endif
2143 }
2144 
//SCATTER_BASE_OFFSETS_L4(uint32_t, int32_t);
// Masked scatter of four uint32 values to p + scale*offsets[i].
static FORCEINLINE void
svec_scatter_base_offsets(uint32_t* p, uint32_t scale, svec<4,int32_t> offsets,
                          svec<4,uint32_t> val, svec<4,bool> mask){
  uint8_t* b = (uint8_t*) p;
  #ifdef __POWER8
  lScatterBaseOffsets32_32<uint32_t, svec<4,int32_t>, svec<4,uint32_t> >(b,scale,offsets,val,mask);
  #else
  lScatterBaseOffsets<uint32_t, svec<4,int32_t>, svec<4,uint32_t> >(b,scale,offsets,val,mask);
  #endif
}

//SCATTER_BASE_OFFSETS_L4(uint32_t, int64_t);
// Masked scatter of four uint32 values addressed with 64-bit offsets.
static FORCEINLINE void
svec_scatter_base_offsets(uint32_t* p, uint32_t scale, svec<4,int64_t> offsets,
                          svec<4,uint32_t> val, svec<4,bool> mask){
  uint8_t* b = (uint8_t*) p;
  #ifdef __POWER8
  lScatterBaseOffsets64_32<uint32_t, svec<4,int64_t>, svec<4,uint32_t> >(b,scale,offsets,val,mask);
  #else
  lScatterBaseOffsets<uint32_t,svec<4,int64_t>, svec<4,uint32_t> >(b,scale,offsets,val,mask);
  #endif
}
2168 
// 64-bit element scatters use the generic macro (no POWER8 64-bit-data
// scatter helper exists in this file).
SCATTER_BASE_OFFSETS_L4(int64_t, int32_t);
SCATTER_BASE_OFFSETS_L4(int64_t, int64_t);
SCATTER_BASE_OFFSETS_L4(uint64_t, int32_t);
SCATTER_BASE_OFFSETS_L4(uint64_t, int64_t);

//SCATTER_BASE_OFFSETS_L4(float, int32_t);
// Masked scatter of four floats to p + scale*offsets[i].
static FORCEINLINE void
svec_scatter_base_offsets(float* p, uint32_t scale, svec<4,int32_t> offsets,
                          svec<4,float> val,svec<4,bool> mask){
  uint8_t* b = (uint8_t*)p;
  #ifdef __POWER8
  lScatterBaseOffsets32_32<float, svec<4,int32_t>, svec<4,float> >(b,scale,offsets,val,mask);
  #else
  lScatterBaseOffsets<float, svec<4,int32_t>, svec<4,float> >(b,scale,offsets,val,mask);
  #endif
}

//SCATTER_BASE_OFFSETS_L4(float, int64_t);
// Masked scatter of four floats addressed with 64-bit offsets.
static FORCEINLINE void
svec_scatter_base_offsets(float* p,uint32_t scale, svec<4,int64_t> offsets,
                          svec<4,float> val, svec<4,bool> mask){
  uint8_t* b = (uint8_t*)p;
  #ifdef __POWER8
  lScatterBaseOffsets64_32<float, svec<4,int64_t>, svec<4,float> >(b,scale,offsets,val,mask);
  #else
  lScatterBaseOffsets<float, svec<4,int64_t>, svec<4,float> >(b,scale,offsets,val,mask);
  #endif
}
2197 
2198 
SCATTER_BASE_OFFSETS_L4(double, int32_t);

//SCATTER_BASE_OFFSETS_L4(double, int64_t);
// Masked scatter of four doubles addressed with 64-bit offsets.
// NOTE(review): always uses the generic helper — no POWER8 path, since this
// file defines no 64-bit-data POWER8 scatter helper; confirm intentional.
static FORCEINLINE void
svec_scatter_base_offsets(double* p, uint32_t scale, svec<4,int64_t> offsets,
                          svec<4,double> val, svec<4,bool> mask){
  uint8_t* b = (uint8_t*)p;
  lScatterBaseOffsets<double, svec<4,int64_t>, svec<4,double> >(b,scale,offsets,val,mask);
}
2208 
// --- svec_scatter_stride: scatter with a constant element stride ----------
SCATTER_STRIDE_L4(int8_t, int32_t);
SCATTER_STRIDE_L4(int8_t, int64_t);
SCATTER_STRIDE_L4(uint8_t, int32_t);
SCATTER_STRIDE_L4(uint8_t, int64_t);
SCATTER_STRIDE_L4(int16_t, int32_t);
SCATTER_STRIDE_L4(int16_t, int64_t);
SCATTER_STRIDE_L4(uint16_t, int32_t);
SCATTER_STRIDE_L4(uint16_t, int64_t);
SCATTER_STRIDE_L4(int32_t, int32_t);
SCATTER_STRIDE_L4(int32_t, int64_t);
SCATTER_STRIDE_L4(uint32_t, int32_t);
SCATTER_STRIDE_L4(uint32_t, int64_t);
SCATTER_STRIDE_L4(int64_t, int32_t);
SCATTER_STRIDE_L4(int64_t, int64_t);
SCATTER_STRIDE_L4(uint64_t, int32_t);
SCATTER_STRIDE_L4(uint64_t, int64_t);
SCATTER_STRIDE_L4(float, int32_t);
SCATTER_STRIDE_L4(float, int64_t);
SCATTER_STRIDE_L4(double, int32_t);
SCATTER_STRIDE_L4(double, int64_t);

#endif //DOXYGEN_SHOULD_SKIP_THIS


// 5. masked load/masked store

// Masked load/store is implemented on top of gather_base_offsets /
// scatter_base_offsets, always with 32-bit offsets.
MASKED_LOAD_STORE_L4(int8_t);
MASKED_LOAD_STORE_L4(uint8_t);
MASKED_LOAD_STORE_L4(int16_t);
MASKED_LOAD_STORE_L4(uint16_t);
MASKED_LOAD_STORE_L4(int32_t);
MASKED_LOAD_STORE_L4(uint32_t);
MASKED_LOAD_STORE_L4(int64_t);
MASKED_LOAD_STORE_L4(uint64_t);
MASKED_LOAD_STORE_L4(float);
MASKED_LOAD_STORE_L4(double);
2248 
2250 //
2251 // Mask type (i1) interfaces
2252 //
2254 
// 1. mask construction

// Return true if any lane of the mask is non-zero.
static FORCEINLINE bool svec_any_true(const svec<4,bool>& mask) {
  return vec_any_ne(mask.v, vec_splat_u32(0));
}

// Return true if every lane of the mask is non-zero.
static FORCEINLINE bool svec_all_true(const svec<4,bool>& mask) {
  return vec_all_ne(mask.v, vec_splat_u32(0));
}
2283 
// 2. bit operations

// Return true if no lane of the mask is set.
static FORCEINLINE bool svec_none_true(const svec<4,bool>& mask) {
  return vec_all_eq(mask.v, vec_splat_u32(0));
}

// Lane-wise AND of two masks.
static FORCEINLINE svec<4,bool> svec_and(svec<4,bool> a, svec<4,bool> b) {
  return a.v & b.v;
}

// Lane-wise OR of two masks.
static FORCEINLINE svec<4,bool> svec_or(svec<4,bool> a, svec<4,bool> b) {
  return a.v | b.v;
}

// Lane-wise XOR of two masks.
static FORCEINLINE svec<4,bool> svec_xor(svec<4,bool> a, svec<4,bool> b) {
  return a.v ^ b.v;
}

// Lane-wise complement of a mask.
static FORCEINLINE svec<4,bool> svec_not(svec<4,bool> a) {
  return ~a.v;
}
2314 
2315 
2317 //
2318 // General data operation interfaces
2319 //
2321 
2322 
// 1. Unary

// Lift a unary operator/intrinsic OP over a single-register vector type.
#define UNARY_OP_OPT(STYPE, NAME, OP)\
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a) { \
  return OP(a.v); \
}

// Same, for 64-bit element types stored as two vector registers.
#define UNARY_OP_OPT64(STYPE, NAME, OP)\
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a) { \
  return svec<LANES,STYPE>(OP(a.v[0]), OP(a.v[1])); \
}
2336 
// neg operation: unary minus per lane; 64-bit integer and double types go
// through the two-register OPT64 form.
UNARY_OP_OPT(int8_t, svec_neg, -);
UNARY_OP_OPT(uint8_t, svec_neg, -);
UNARY_OP_OPT(int16_t, svec_neg, -);
UNARY_OP_OPT(uint16_t, svec_neg, -);
UNARY_OP_OPT(int32_t, svec_neg, -);
UNARY_OP_OPT(uint32_t, svec_neg, -);
UNARY_OP_OPT64(int64_t, svec_neg, -);
UNARY_OP_OPT64(uint64_t, svec_neg, -);
UNARY_OP_OPT(float, svec_neg, -);
UNARY_OP_OPT64(double, svec_neg, -);
2348 
// 2. Math unary
// round: per-lane libm round (no vector intrinsic used here).
UNARY_OP_L4(float, svec_round, roundf);
UNARY_OP_L4(double, svec_round, round);
// floor: vector intrinsic for float, per-lane libm for double.
UNARY_OP_OPT(float, svec_floor, vec_floor);
UNARY_OP_L4(double, svec_floor, floor);
// ceil: vector intrinsic for float, per-lane libm for double.
UNARY_OP_OPT(float, svec_ceil, vec_ceil);
UNARY_OP_L4(double, svec_ceil, ceil);
//reciprocal (1/x)
// Approximate per-lane reciprocal 1/x: hardware estimate (vec_re) refined
// with one Newton-Raphson step, r = e + e*(1 - x*e).
static FORCEINLINE svec<4,float> svec_rcp(svec<4,float> v) {
  //return vec_re(v);//Get the reciprocal estimate
  __vector float estimate = vec_re( v.v );
  //One round of Newton-Raphson refinement
  __vector float r = vec_madd( vec_nmsub(estimate, v.v, (__vector float){1.0,1.0,1.0,1.0} ), estimate, estimate);
  return svec<4,float>(r);
}

// double: exact per-lane scalar division.
UNARY_OP_L4(double, svec_rcp, 1.0/);
//reciprocal square root (1/sqrt(x))
// Approximate per-lane 1/sqrt(x): hardware estimate (vec_rsqrte) refined
// with one Newton-Raphson step, r = e + (1 - x*e*e) * (e/2).
static FORCEINLINE svec<4,float> svec_rsqrt(svec<4,float> v) {
  //return vec_rsqrte(v);
  //Get the square root reciprocal estimate
  __vector float zero = (__vector float){0,0,0,0};
  __vector float oneHalf = (__vector float){0.5,0.5,0.5,0.5};
  __vector float one = (__vector float){1.0,1.0,1.0,1.0};
  __vector float estimate = vec_rsqrte( v.v );
  //One round of Newton-Raphson refinement
  __vector float estimateSquared = vec_madd( estimate, estimate, zero );
  __vector float halfEstimate = vec_madd( estimate, oneHalf, zero );
  __vector float r = vec_madd( vec_nmsub( v.v, estimateSquared, one ), halfEstimate, estimate );
  return svec<4,float>(r);

}

// double: exact per-lane 1/sqrt.
UNARY_OP_L4(double, svec_rsqrt, 1.0/sqrt);
//sqrt
// Approximate per-lane sqrt via the identity sqrt(x) = x * (1/sqrt(x)).
// NOTE(review): for x == 0 this computes 0 * (rsqrt estimate of 0), which
// may yield NaN rather than 0 — confirm callers tolerate that.
static FORCEINLINE svec<4,float> svec_sqrt(svec<4,float> v) {
  __vector float r = vec_madd( v.v, svec_rsqrt(v).v, (__vector float){0,0,0,0} );
  return svec<4,float>(r);
}

// double: exact per-lane libm sqrt.
UNARY_OP_L4(double, svec_sqrt, sqrt);
2393 
2394 //exp
2395 static FORCEINLINE svec<4,float> svec_exp(svec<4,float> v) {
2396  return vec_expte(v.v);
2397 }
2398 UNARY_OP_L4(double, svec_exp, exp);
2399 
2400 
//log
// Natural log via the base-2 log estimate: ln(x) = log2(x) * ln(2).
static FORCEINLINE svec<4,float> svec_log(svec<4,float> v) {
  return svec<4,float>(vec_loge(v.v)) * log(2);
}
// double: exact per-lane libm log.
UNARY_OP_L4(double, svec_log, log);
//abs - for all types
UNARY_OP_OPT(int8_t, svec_abs, vec_abs);
// Unsigned types are their own absolute value.
static FORCEINLINE svec<4,uint8_t> svec_abs(svec<4,uint8_t> v) { return v;}
UNARY_OP_OPT(int16_t, svec_abs, vec_abs);
static FORCEINLINE svec<4,uint16_t> svec_abs(svec<4,uint16_t> v) { return v;}
UNARY_OP_OPT(int32_t, svec_abs, vec_abs);
static FORCEINLINE svec<4,uint32_t> svec_abs(svec<4,uint32_t> v) { return v;}
// int64: scalar per-lane fallback (no 64-bit vec_abs used here).
UNARY_OP_L4(int64_t, svec_abs, abs<int64_t>);
static FORCEINLINE svec<4,uint64_t> svec_abs(svec<4,uint64_t> v) { return v;}
UNARY_OP_OPT(float, svec_abs, vec_abs);
UNARY_OP_OPT64(double, svec_abs, vec_abs);
2417 
2418 
2419 
2420 
// 3. Binary

// Lift an infix operator OP over single-register vector types.
#define BINARY_OP_OPT(STYPE, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
  return svec<LANES,STYPE>(a.v OP b.v); \
}

// Same, for 64-bit element types held in two vector registers.
#define BINARY_OP_OPT64(STYPE, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
  return svec<LANES,STYPE>(a.v[0] OP b.v[0], a.v[1] OP b.v[1]); \
}

// Lift a two-argument intrinsic FUNC over single-register vector types.
#define BINARY_OP_OPT_FUNC(STYPE, STYPE2, NAME, FUNC) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE2> b) { \
  return svec<LANES,STYPE>(FUNC(a.v, b.v)); \
}

// Same, for two-register 64-bit element types.
#define BINARY_OP_OPT_FUNC64(STYPE, STYPE2, NAME, FUNC) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE2> b) { \
  return svec<LANES,STYPE>(FUNC(a.v[0], b.v[0]), FUNC(a.v[1], b.v[1])); \
}
2446 
2447 
2448 
2449 
// add
// Element-wise addition.  Types up to 32 bits use the vec_add intrinsic;
// 64-bit integers use vec_add_p8 per register half on POWER8, otherwise
// the compiler's vector operator+; double always adds per register half.

static FORCEINLINE svec<4,int8_t> svec_add (svec<4,int8_t> a, svec<4,int8_t> b) {
  return vec_add(a.v,b.v);
}

static FORCEINLINE svec<4,uint8_t> svec_add(svec<4,uint8_t> a, svec<4,uint8_t> b) {
  return vec_add(a.v,b.v);
}

static FORCEINLINE svec<4,int16_t> svec_add (svec<4,int16_t> a, svec<4,int16_t> b) {
  return vec_add(a.v,b.v);
}

static FORCEINLINE svec<4,uint16_t> svec_add(svec<4,uint16_t> a, svec<4,uint16_t> b) {
  return vec_add(a.v,b.v);
}

static FORCEINLINE svec<4,int32_t> svec_add (svec<4,int32_t> a, svec<4,int32_t> b) {
  return vec_add(a.v,b.v);
}

static FORCEINLINE svec<4,uint32_t> svec_add(svec<4,uint32_t> a, svec<4,uint32_t> b) {
  return vec_add(a.v,b.v);
}

static FORCEINLINE svec<4,int64_t> svec_add (svec<4,int64_t> a, svec<4,int64_t> b) {
#ifdef __POWER8
  return svec<4,int64_t>(vec_add_p8(a.v[0],b.v[0]),vec_add_p8(a.v[1],b.v[1]) );
#else
  return svec<4,int64_t>(a.v[0] + b.v[0], a.v[1] + b.v[1]);
#endif
}

static FORCEINLINE svec<4,uint64_t> svec_add(svec<4,uint64_t> a, svec<4,uint64_t> b) {
#ifdef __POWER8
  return svec<4,uint64_t>(vec_add_p8(a.v[0],b.v[0]),vec_add_p8(a.v[1],b.v[1]) );
#else
  return svec<4,uint64_t>(a.v[0] + b.v[0], a.v[1] + b.v[1]);
#endif
}

static FORCEINLINE svec<4,float> svec_add (svec<4,float> a, svec<4,float> b) {
  return vec_add(a.v,b.v);
}

static FORCEINLINE svec<4,double> svec_add(svec<4,double> a, svec<4,double> b) {
  return svec<4,double>(a.v[0] + b.v[0], a.v[1] + b.v[1]);
}
2499 
//sub
// Element-wise subtraction; same dispatch scheme as svec_add above.
static FORCEINLINE svec<4,int8_t> svec_sub (svec<4,int8_t> a, svec<4,int8_t> b) {
  return vec_sub(a.v,b.v);
}

static FORCEINLINE svec<4,uint8_t> svec_sub(svec<4,uint8_t> a, svec<4,uint8_t> b) {
  return vec_sub(a.v,b.v);
}

static FORCEINLINE svec<4,int16_t> svec_sub (svec<4,int16_t> a, svec<4,int16_t> b) {
  return vec_sub(a.v,b.v);
}

static FORCEINLINE svec<4,uint16_t> svec_sub(svec<4,uint16_t> a, svec<4,uint16_t> b) {
  return vec_sub(a.v,b.v);
}

static FORCEINLINE svec<4,int32_t> svec_sub (svec<4,int32_t> a, svec<4,int32_t> b) {
  return vec_sub(a.v,b.v);
}

static FORCEINLINE svec<4,uint32_t> svec_sub(svec<4,uint32_t> a, svec<4,uint32_t> b) {
  return vec_sub(a.v,b.v);
}

static FORCEINLINE svec<4,int64_t> svec_sub (svec<4,int64_t> a, svec<4,int64_t> b) {
#ifdef __POWER8
  return svec<4,int64_t>(vec_sub_p8(a.v[0],b.v[0]),vec_sub_p8(a.v[1],b.v[1]) );
#else
  return svec<4,int64_t>(a.v[0] - b.v[0], a.v[1] - b.v[1]);
#endif
}

static FORCEINLINE svec<4,uint64_t> svec_sub(svec<4,uint64_t> a, svec<4,uint64_t> b) {
#ifdef __POWER8
  return svec<4,uint64_t>(vec_sub_p8(a.v[0],b.v[0]),vec_sub_p8(a.v[1],b.v[1]) );
#else
  return svec<4,uint64_t>(a.v[0] - b.v[0], a.v[1] - b.v[1]);
#endif
}

static FORCEINLINE svec<4,float> svec_sub (svec<4,float> a, svec<4,float> b) {
  return vec_sub(a.v,b.v);
}

static FORCEINLINE svec<4,double> svec_sub(svec<4,double> a, svec<4,double> b) {
  return svec<4,double>(a.v[0] - b.v[0], a.v[1] - b.v[1]);
}
2548 
2549 
2550 
2551 //mul
2552 static FORCEINLINE svec<4,int8_t> svec_mul (svec<4,int8_t> a, svec<4,int8_t> b) {
2553  return a.v * b.v;
2554 }
2555 
2556 static FORCEINLINE svec<4,uint8_t> svec_mul(svec<4,uint8_t> a, svec<4,uint8_t> b) {
2557  return a.v * b.v;
2558 }
2559 
2560 static FORCEINLINE svec<4,int16_t> svec_mul (svec<4,int16_t> a, svec<4,int16_t> b) {
2561  return a.v * b.v;
2562 }
2563 
2564 static FORCEINLINE svec<4,uint16_t> svec_mul(svec<4,uint16_t> a, svec<4,uint16_t> b) {
2565  return a.v * b.v;
2566 }
2567 
2568 static FORCEINLINE svec<4,int32_t> svec_mul (svec<4,int32_t> a, svec<4,int32_t> b) {
2569 #ifdef __POWER8
2570  return ((__vector signed int)vec_mul_p8((vector unsigned int)a.v,(vector unsigned int)b.v));
2571 #else
2572 
2573  return vec_mulo((__vector signed short)a.v, (__vector signed short)(b.v));
2574 
2575  //adapted from apple web site
2576  __vector unsigned int bSwapped, BD, AD_plus_BC;
2577  __vector unsigned int sixteen = vec_splat_u32(-16 ); //only low 5 bits important here
2578  __vector unsigned int zero = vec_splat_u32(0);
2579  bSwapped = vec_rl( b.v, sixteen );
2580  //Calculate A*D + B*C, and B*D
2581  BD = vec_mulo( (__vector unsigned short) a.v, (__vector unsigned short) b.v );
2582  AD_plus_BC = vec_msum( (__vector unsigned short) a.v, (__vector unsigned short) bSwapped, zero );
2583 
2584  //Left shift the high results by 16 bits
2585  AD_plus_BC = vec_sl( AD_plus_BC, sixteen );
2586 
2587  //Add in the BD component
2588  return vec_add( AD_plus_BC, BD );
2589 #endif
2590 }
2591 
2592 static FORCEINLINE svec<4,uint32_t> svec_mul(svec<4,uint32_t> a, svec<4,uint32_t> b) {
2593 #ifdef __POWER8
2594  return ((__vector signed int)vec_mul_p8((vector unsigned int)a.v,(vector unsigned int)b.v));
2595 #else
2596  //return vec_mulo((__vector signed short)a.v, (__vector signed short)(b.v));
2597  //adapted from apple web site
2598  __vector unsigned int bSwapped, BD, AD_plus_BC;
2599  __vector unsigned int sixteen = vec_splat_u32(-16 ); //only low 5 bits important here
2600  __vector unsigned int zero = vec_splat_u32(0);
2601  bSwapped = vec_rl( b.v, sixteen );
2602  //Calculate A*D + B*C, and B*D
2603  BD = vec_mulo( (__vector unsigned short) a.v, (__vector unsigned short) b.v );
2604  AD_plus_BC = vec_msum( (__vector unsigned short) a.v, (__vector unsigned short) bSwapped, zero );
2605 
2606  //Left shift the high results by 16 bits
2607  AD_plus_BC = vec_sl( AD_plus_BC, sixteen );
2608 
2609  //Add in the BD component
2610  return vec_add( AD_plus_BC, BD );
2611 #endif
2612 }
2613 
2614 static FORCEINLINE svec<4,int64_t> svec_mul (svec<4,int64_t> a, svec<4,int64_t> b) {
2615  return svec<4,int64_t>(a.v[0] * b.v[0], a.v[1] * b.v[1]);
2616 }
2617 
2618 static FORCEINLINE svec<4,uint64_t> svec_mul(svec<4,uint64_t> a, svec<4,uint64_t> b) {
2619  return svec<4,uint64_t>(a.v[0] * b.v[0], a.v[1] * b.v[1]);
2620 }
2621 
2622 static FORCEINLINE svec<4,float> svec_mul (svec<4,float> a, svec<4,float> b) {
2623  return vec_mul(a.v,b.v);
2624 }
2625 
2626 static FORCEINLINE svec<4,double> svec_mul(svec<4,double> a, svec<4,double> b) {
2627  return svec<4,double>(a.v[0] * b.v[0], a.v[1] * b.v[1]);
2628 }
2629 
//div
// No native VSX integer divide here: BINARY_OP_OPT expands to a per-lane
// scalar '/' loop; the *_OPT64 variants do the same over the two backing
// sub-vectors of 64-bit/double types.

BINARY_OP_OPT(int8_t, svec_div, /);
BINARY_OP_OPT(uint8_t, svec_div, /);
BINARY_OP_OPT(int16_t, svec_div, /);
BINARY_OP_OPT(uint16_t, svec_div, /);
BINARY_OP_OPT(int32_t, svec_div, /);
BINARY_OP_OPT(uint32_t, svec_div, /);
BINARY_OP_OPT64(int64_t, svec_div, /);
BINARY_OP_OPT64(uint64_t, svec_div, /);
BINARY_OP_OPT(float, svec_div, /);
BINARY_OP_OPT64(double, svec_div, /);


//power only for float
// Per-lane powf/pow calls (scalar libm, applied lane by lane).
BINARY_OP_FUNC_L4(float, svec_pow, powf);
BINARY_OP_FUNC_L4(double, svec_pow, pow);

//or
// Bitwise OR/AND/XOR; defined for integer element types only.
BINARY_OP_OPT(int8_t, svec_or, |);
BINARY_OP_OPT(uint8_t, svec_or, |);
BINARY_OP_OPT(int16_t, svec_or, |);
BINARY_OP_OPT(uint16_t, svec_or, |);
BINARY_OP_OPT(int32_t, svec_or, |);
BINARY_OP_OPT(uint32_t, svec_or, |);
BINARY_OP_OPT64(int64_t, svec_or, |);
BINARY_OP_OPT64(uint64_t, svec_or, |);
//and
BINARY_OP_OPT(int8_t, svec_and, &);
BINARY_OP_OPT(uint8_t, svec_and, &);
BINARY_OP_OPT(int16_t, svec_and, &);
BINARY_OP_OPT(uint16_t, svec_and, &);
BINARY_OP_OPT(int32_t, svec_and, &);
BINARY_OP_OPT(uint32_t, svec_and, &);
BINARY_OP_OPT64(int64_t, svec_and, &);
BINARY_OP_OPT64(uint64_t, svec_and, &);

//xor
BINARY_OP_OPT(int8_t, svec_xor, ^);
BINARY_OP_OPT(uint8_t, svec_xor, ^);
BINARY_OP_OPT(int16_t, svec_xor, ^);
BINARY_OP_OPT(uint16_t, svec_xor, ^);
BINARY_OP_OPT(int32_t, svec_xor, ^);
BINARY_OP_OPT(uint32_t, svec_xor, ^);
BINARY_OP_OPT64(int64_t, svec_xor, ^);
BINARY_OP_OPT64(uint64_t, svec_xor, ^);
2676 
// Vector-scalar arithmetic: generates svec_{add,sub,mul,div}_scalar (vector
// op splatted scalar) and svec_scalar_{add,sub,mul,div} (splatted scalar op
// vector) by broadcasting the scalar through the svec(s) constructor and
// delegating to the vector-vector routines above.
#define BIN_VEC_SCAL(STYPE) \
static FORCEINLINE svec<LANES,STYPE> svec_add_scalar(svec<LANES,STYPE> a, STYPE s) { \
  return svec_add(a, svec<LANES,STYPE>(s)); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_scalar_add(STYPE s, svec<LANES,STYPE> a) { \
  return svec_add(svec<LANES,STYPE>(s), a); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_sub_scalar(svec<LANES,STYPE> a, STYPE s) { \
  return svec_sub(a, svec<LANES,STYPE>(s)); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_scalar_sub(STYPE s, svec<LANES,STYPE> a) { \
  return svec_sub(svec<LANES,STYPE>(s), a); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_mul_scalar(svec<LANES,STYPE> a, STYPE s) { \
  return svec_mul(a, svec<LANES,STYPE>(s)); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_scalar_mul(STYPE s, svec<LANES,STYPE> a) { \
  return svec_mul(svec<LANES,STYPE>(s), a); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_div_scalar(svec<LANES,STYPE> a, STYPE s) { \
  return svec_div(a, svec<LANES,STYPE>(s)); \
} \
static FORCEINLINE svec<LANES,STYPE> svec_scalar_div(STYPE s, svec<LANES,STYPE> a) { \
  return svec_div(svec<LANES,STYPE>(s), a); \
} \

// Instantiate vector-scalar ops for every numeric element type.
BIN_VEC_SCAL(int8_t);
BIN_VEC_SCAL(uint8_t);
BIN_VEC_SCAL(int16_t);
BIN_VEC_SCAL(uint16_t);
BIN_VEC_SCAL(int32_t);
BIN_VEC_SCAL(uint32_t);
BIN_VEC_SCAL(int64_t);
BIN_VEC_SCAL(uint64_t);
BIN_VEC_SCAL(float);
BIN_VEC_SCAL(double);
2713 
2714 
//shift left
// Per-lane variable shift left; the second operand supplies an unsigned
// shift count per lane, mapped onto the native vec_sl intrinsic.
BINARY_OP_OPT_FUNC(int8_t, uint8_t, svec_shl, vec_sl);
BINARY_OP_OPT_FUNC(uint8_t, uint8_t, svec_shl, vec_sl);
BINARY_OP_OPT_FUNC(int16_t, uint16_t, svec_shl, vec_sl);
BINARY_OP_OPT_FUNC(uint16_t, uint16_t, svec_shl, vec_sl);
BINARY_OP_OPT_FUNC(int32_t, uint32_t, svec_shl, vec_sl);
BINARY_OP_OPT_FUNC(uint32_t, uint32_t, svec_shl, vec_sl);
2722 
2723 //BINARY_OP_OPT_FUNC64(int64_t, uint64_t, svec_shl, vec_sl);
2724 static FORCEINLINE svec<4,int64_t> svec_shl(svec<4,int64_t> a, svec<4,uint64_t> b) {
2725  INC_STATS_NAME(STATS_BINARY_SLOW,1, "shl i64"); \
2726  return svec<4,int64_t>(a[0] << b[0], a[1] << b[1], a[2] << b[2], a[3] << b[3]);
2727 }
2728 
2729 //BINARY_OP_OPT_FUNC64(uint64_t, uint64_t, svec_shl, vec_sl);
2730 static FORCEINLINE svec<4,uint64_t> svec_shl(svec<4,uint64_t> a, svec<4,uint64_t> b) {
2731  INC_STATS_NAME(STATS_BINARY_SLOW,1, "shl u64"); \
2732  return svec<4,uint64_t>(a[0] << b[0], a[1] << b[1], a[2] << b[2], a[3] << b[3]);
2733 }
//shift right
// Per-lane variable shift right: arithmetic (vec_sra, sign-propagating) for
// the signed types, logical (vec_sr, zero-filling) for the unsigned types.
BINARY_OP_OPT_FUNC(int8_t, uint8_t, svec_shr, vec_sra);
BINARY_OP_OPT_FUNC(uint8_t, uint8_t, svec_shr, vec_sr);
BINARY_OP_OPT_FUNC(int16_t, uint16_t, svec_shr, vec_sra);
BINARY_OP_OPT_FUNC(uint16_t, uint16_t, svec_shr, vec_sr);
BINARY_OP_OPT_FUNC(int32_t, uint32_t, svec_shr, vec_sra);
BINARY_OP_OPT_FUNC(uint32_t, uint32_t, svec_shr, vec_sr);
2741 
2742 //BINARY_OP_OPT_FUNC64(int64_t, uint64_t, svec_shr, vec_sr);
2743 static FORCEINLINE svec<4,int64_t> svec_shr(svec<4,int64_t> a, svec<4,uint64_t> b) {
2744  INC_STATS_NAME(STATS_BINARY_SLOW,1, "shr i64"); \
2745  return svec<4,int64_t>(a[0] >> b[0], a[1] >> b[1], a[2] >> b[2], a[3] >> b[3]);
2746 }
2747 
2748 //BINARY_OP_OPT_FUNC64(uint64_t, uint64_t, svec_shr, vec_sr);
2749 static FORCEINLINE svec<4,uint64_t> svec_shr(svec<4,uint64_t> a, svec<4,uint64_t> b) {
2750  INC_STATS_NAME(STATS_BINARY_SLOW,1, "shr u64"); \
2751  return svec<4,uint64_t>(a[0] >> b[0], a[1] >> b[1], a[2] >> b[2], a[3] >> b[3]);
2752 }
2753 
//uniform shift left
// Shift every lane by the same scalar count, expanded per lane by
// BINARY_OP_SCALAR_L4.

// a better impl may be by smear and vector shift
BINARY_OP_SCALAR_L4(int8_t, int32_t, svec_shl, <<);
BINARY_OP_SCALAR_L4(uint8_t, int32_t, svec_shl, <<);
BINARY_OP_SCALAR_L4(int16_t, int32_t, svec_shl, <<);
BINARY_OP_SCALAR_L4(uint16_t, int32_t, svec_shl, <<);
BINARY_OP_SCALAR_L4(int32_t, int32_t, svec_shl, <<);
BINARY_OP_SCALAR_L4(uint32_t, int32_t, svec_shl, <<);
BINARY_OP_SCALAR_L4(int64_t, int32_t, svec_shl, <<);
BINARY_OP_SCALAR_L4(uint64_t, int32_t, svec_shl, <<);
//shift right
BINARY_OP_SCALAR_L4(int8_t, int32_t, svec_shr, >>);
BINARY_OP_SCALAR_L4(uint8_t, int32_t, svec_shr, >>);
BINARY_OP_SCALAR_L4(int16_t, int32_t, svec_shr, >>);
BINARY_OP_SCALAR_L4(uint16_t, int32_t, svec_shr, >>);
BINARY_OP_SCALAR_L4(int32_t, int32_t, svec_shr, >>);
BINARY_OP_SCALAR_L4(uint32_t, int32_t, svec_shr, >>);
BINARY_OP_SCALAR_L4(int64_t, int32_t, svec_shr, >>);
BINARY_OP_SCALAR_L4(uint64_t, int32_t, svec_shr, >>);
2774 
2775 //remainder %
2776 
2780 BINARY_OP_L4(int8_t, svec_rem, %);
2781 BINARY_OP_L4(uint8_t, svec_rem, %);
2782 BINARY_OP_L4(int16_t, svec_rem, %);
2783 BINARY_OP_L4(uint16_t, svec_rem, %);
2784 BINARY_OP_L4(int32_t, svec_rem, %);
2785 BINARY_OP_L4(uint32_t, svec_rem, %);
2786 BINARY_OP_L4(int64_t, svec_rem, %);
2787 BINARY_OP_L4(uint64_t, svec_rem, %);
2788 
2789 BINARY_OP_SCALAR_L4(int8_t, int8_t, svec_rem, %);
2790 BINARY_OP_SCALAR_L4(uint8_t, uint8_t, svec_rem, %);
2791 BINARY_OP_SCALAR_L4(int16_t, int16_t, svec_rem, %);
2792 BINARY_OP_SCALAR_L4(uint16_t, uint16_t, svec_rem, %);
2793 BINARY_OP_SCALAR_L4(int32_t, int32_t, svec_rem, %);
2794 BINARY_OP_SCALAR_L4(uint32_t, uint16_t, svec_rem, %);
2795 BINARY_OP_SCALAR_L4(int64_t, int64_t, svec_rem, %);
2796 BINARY_OP_SCALAR_L4(uint64_t, uint64_t, svec_rem, %);
2797 
2798 
// 4. Ternary

//madd / msub for only int32/u32/float/double
// Generates the per-lane multiply-add/-subtract forms for the integer types;
// the float/double versions below use the native fused intrinsics.
TERNERY_L4(int32_t);
TERNERY_L4(uint32_t);
TERNERY_L4(int64_t);
TERNERY_L4(uint64_t);
2806 
2811  return vec_madd(a.v, b.v, c.v);
2812 }
2817  return svec<4,double>(vec_madd(a.v[0], b.v[0], c.v[0]), vec_madd(a.v[1], b.v[1], c.v[1]));
2818 }
2823  return vec_msub(a.v, b.v, c.v);
2824 }
2829  return svec<4,double>(vec_msub(a.v[0], b.v[0], c.v[0]), vec_msub(a.v[1], b.v[1], c.v[1]));
2830 }
2835  return vec_nmsub(a.v, b.v, c.v);
2836 }
2841  return svec<4,double>(vec_nmsub(a.v[0], b.v[0], c.v[0]), vec_nmsub(a.v[1], b.v[1], c.v[1]));
2842 }
2843 
// 5. Max/Min

//add/max/min
// Lane-wise max/min: native vec_max/vec_min where the type is directly
// supported; 64-bit and double fall back to per-lane scalar max<T>/min<T>.
BINARY_OP_OPT_FUNC(int8_t, int8_t, svec_max, vec_max);
BINARY_OP_OPT_FUNC(uint8_t, uint8_t, svec_max, vec_max);
BINARY_OP_OPT_FUNC(int16_t, int16_t, svec_max, vec_max);
BINARY_OP_OPT_FUNC(uint16_t, uint16_t, svec_max, vec_max);
BINARY_OP_OPT_FUNC(int32_t, int32_t, svec_max, vec_max);
BINARY_OP_OPT_FUNC(uint32_t, uint32_t, svec_max, vec_max);
BINARY_OP_FUNC_L4(int64_t, svec_max, max<int64_t>);
BINARY_OP_FUNC_L4(uint64_t, svec_max, max<uint64_t>);
BINARY_OP_OPT_FUNC(float, float, svec_max, vec_max);
BINARY_OP_FUNC_L4(double, svec_max, max<double>);

BINARY_OP_OPT_FUNC(int8_t, int8_t, svec_min, vec_min);
BINARY_OP_OPT_FUNC(uint8_t, uint8_t, svec_min, vec_min);
BINARY_OP_OPT_FUNC(int16_t, int16_t, svec_min, vec_min);
BINARY_OP_OPT_FUNC(uint16_t, uint16_t, svec_min, vec_min);
BINARY_OP_OPT_FUNC(int32_t, int32_t, svec_min, vec_min);
BINARY_OP_OPT_FUNC(uint32_t, uint32_t, svec_min, vec_min);
BINARY_OP_FUNC_L4(int64_t, svec_min, min<int64_t>);
BINARY_OP_FUNC_L4(uint64_t, svec_min, min<uint64_t>);
BINARY_OP_OPT_FUNC(float, float, svec_min, vec_min);
BINARY_OP_FUNC_L4(double, svec_min, min<double>);
2868 
// 6. reduce

// Horizontal reductions over the 4 lanes: svec_reduce_{add,max,min} fold the
// lanes with scalar add<T>/max<T>/min<T> via BINARY_OP_REDUCE_FUNC_L4.
#define MAX_MIN_REDUCE_METHODS(STYPE) \
BINARY_OP_REDUCE_FUNC_L4(STYPE, svec_reduce_add, add<STYPE>); \
BINARY_OP_REDUCE_FUNC_L4(STYPE, svec_reduce_max, max<STYPE>); \
BINARY_OP_REDUCE_FUNC_L4(STYPE, svec_reduce_min, min<STYPE>); \

MAX_MIN_REDUCE_METHODS(int8_t);
MAX_MIN_REDUCE_METHODS(uint8_t);
MAX_MIN_REDUCE_METHODS(int16_t);
MAX_MIN_REDUCE_METHODS(uint16_t);
MAX_MIN_REDUCE_METHODS(int32_t);
MAX_MIN_REDUCE_METHODS(uint32_t);
MAX_MIN_REDUCE_METHODS(int64_t);
MAX_MIN_REDUCE_METHODS(uint64_t);
MAX_MIN_REDUCE_METHODS(float);
MAX_MIN_REDUCE_METHODS(double);
2886 
2887 
2889  //TODO: rewrite it with vec_mergeh/vec_mergel. First 32bit merge, then 64 bit, then 32bit add
2890  return svec<LANES,float>(
2891  svec_reduce_add(v0),
2892  svec_reduce_add(v1),
2893  svec_reduce_add(v2),
2894  svec_reduce_add(v3)
2895  );
2896 }
2897 
2898 
2900  //parallel reduction using mergeh mergel
2901  __vector double sv0 = v0.v[0] + v0.v[1];
2902  __vector double sv1 = v1.v[0] + v1.v[1];
2903  __vector double sv2 = v2.v[0] + v2.v[1];
2904  __vector double sv3 = v3.v[0] + v3.v[1];
2905 
2906  __vector double h0 = vec_mergeh(sv0, sv1);
2907  __vector double l0 = vec_mergel(sv0, sv1);
2908  __vector double h1 = vec_mergeh(sv2, sv3);
2909  __vector double l1 = vec_mergel(sv2, sv3);
2910 
2911  //reduction again
2912  __vector double s0 = h0 + l0;
2913  __vector double s1 = h1 + l1;
2914  return svec<4,double>(s0, s1);
2915 }
2916 
2917 // 7. Compare
2918 
2925 static FORCEINLINE svec<4,bool> svec_equal(svec<4,bool> a, svec<4,bool> b) {
2926  return (__vector unsigned int)(vec_cmpeq(a.v, b.v));
2927 }
2928 
2935 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,bool> a, svec<4,bool> b) {
2936  return ~(__vector unsigned int)(vec_cmpeq(a.v, b.v));
2937 }
2938 
2939 
// int8 equality: compare bytes, then widen the first four byte masks to
// 32-bit lane masks with two sign-extending unpacks (0xFF -> 0xFFFFFFFF).
static FORCEINLINE svec<4,bool> svec_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
 __vector bool char t = vec_cmpeq(a.v,b.v);
 return (__vector unsigned int)vec_unpackh(vec_unpackh(t));
}

static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
 return ~ svec_equal(a, b);
}

// Ordering comparisons are per-lane scalar expansions (no mask widening
// needed inside the macro).
CMP_OP_L4(int8_t, less_than, <);
CMP_OP_L4(int8_t, less_equal, <=);
CMP_OP_L4(int8_t, greater_than, >);
CMP_OP_L4(int8_t, greater_equal, >=);

// uint8 versions mirror the int8 ones above.
static FORCEINLINE svec<4,bool> svec_equal(svec<4,uint8_t> a, svec<4,uint8_t> b) {
 __vector bool char t = vec_cmpeq(a.v,b.v);
 return (__vector unsigned int)vec_unpackh(vec_unpackh(t));
}

static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint8_t> a, svec<4,uint8_t> b) {
 return ~ svec_equal(a, b);
}

CMP_OP_L4(uint8_t, less_than, <);
CMP_OP_L4(uint8_t, less_equal, <=);
CMP_OP_L4(uint8_t, greater_than, >);
CMP_OP_L4(uint8_t, greater_equal, >=);
2969 
// 16-bit comparisons: the macro emits the full set of per-lane comparison
// functions (equal/not_equal/less/greater variants) for the element type.
CMP_ALL_NOMASK_OP_L4(int16_t);

CMP_ALL_NOMASK_OP_L4(uint16_t);
2978 
2983 static FORCEINLINE svec<4,bool> svec_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2984  return (__vector unsigned int)vec_cmpeq(a.v,b.v);
2985 }
2986 
2987 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2988  return ~(__vector unsigned int)vec_cmpeq(a.v,b.v);
2989 }
2990 
2991 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,int32_t> a, svec<4,int32_t> b) {
2992  return (__vector unsigned int)vec_cmplt(a.v,b.v);
2993 }
2994 
2995 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2996  return svec_less_than(a, b) | svec_equal(a, b);
2997 }
2998 
2999 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,int32_t> a, svec<4,int32_t> b) {
3000  return (__vector unsigned int)vec_cmpgt(a.v,b.v);
3001 }
3002 
3003 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
3004  return svec_greater_than(a, b) | svec_equal(a, b);
3005 }
3006 
3008 
3010  return (__vector unsigned int)vec_cmpeq(a.v,b.v);
3011 }
3012 
// uint32 comparisons: same mask-producing pattern as the int32 family; the
// vec_cmp* intrinsics here are the unsigned-compare overloads.
static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint32_t> a, svec<4,uint32_t> b) {
 return ~(__vector unsigned int)vec_cmpeq(a.v,b.v);
}

static FORCEINLINE svec<4,bool> svec_less_than(svec<4,uint32_t> a, svec<4,uint32_t> b) {
 return (__vector unsigned int)vec_cmplt(a.v,b.v);
}

// less_equal / greater_equal: strict comparison OR equality mask.
static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,uint32_t> a, svec<4,uint32_t> b) {
 return svec_less_than(a, b) | svec_equal(a, b);
}

static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,uint32_t> a, svec<4,uint32_t> b) {
 return (__vector unsigned int)vec_cmpgt(a.v,b.v);
}

static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,uint32_t> a, svec<4,uint32_t> b) {
 return svec_greater_than(a, b) | svec_equal(a, b);
}
3032 
3034 
// int64 equality. On POWER8 the doubleword compares are packed down to four
// 32-bit lane masks; otherwise compare lane by lane through operator[] and
// rebuild the mask (counted as a slow path).
static FORCEINLINE svec<4,bool> svec_equal(svec<4,int64_t> a, svec<4,int64_t> b) {
#ifdef __POWER8
 __vector signed long long tr1 = vec_cmpeq_p8(a.v[0], b.v[0]);
 __vector signed long long tr2 = vec_cmpeq_p8(a.v[1], b.v[1]);
 svec<4,bool> res2 = vec_pack_p8(tr1,tr2);
 return res2;
#else
 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "equal_i64");
 unsigned int r0 = a[0] == b[0];
 unsigned int r1 = a[1] == b[1];
 unsigned int r2 = a[2] == b[2];
 unsigned int r3 = a[3] == b[3];
 svec<4,bool> res = svec<4,bool>(r0,r1,r2,r3);
 return res;
#endif
}

static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int64_t> a, svec<4,int64_t> b) {
 return ~ svec_equal(a, b);
}

// Ordering comparisons expand to per-lane scalar compares.
CMP_OP_L4(int64_t, less_than, <);
CMP_OP_L4(int64_t, less_equal, <=);
CMP_OP_L4(int64_t, greater_than, >);
CMP_OP_L4(int64_t, greater_equal, >=);
3065 
3067 #ifdef __POWER8
3068  __vector signed long long tr1 = vec_cmpeq_p8(a.v[0], b.v[0]);
3069  __vector signed long long tr2 = vec_cmpeq_p8(a.v[1], b.v[1]);
3070  svec<4,bool> res2 = vec_pack_p8(tr1,tr2);
3071  return res2;
3072 #else
3073  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "equal_u64");
3074  unsigned int r0 = a[0] == b[0];
3075  unsigned int r1 = a[1] == b[1];
3076  unsigned int r2 = a[2] == b[2];
3077  unsigned int r3 = a[3] == b[3];
3078  svec<4,bool> res = svec<4,bool>(r0,r1,r2,r3);
3079  return res;
3080 #endif
3081 }
3082 
// uint64 inequality: complement of the equality mask defined just above.
static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint64_t> a, svec<4,uint64_t> b) {
 return ~ svec_equal(a, b);
}

// Ordering comparisons expand to per-lane scalar compares.
CMP_OP_L4(uint64_t, less_than, <);
CMP_OP_L4(uint64_t, less_equal, <=);
CMP_OP_L4(uint64_t, greater_than, >);
CMP_OP_L4(uint64_t, greater_equal, >=);
3092 
3097 static FORCEINLINE svec<4,bool> svec_equal(svec<4,float> a, svec<4,float> b) {
3098  return (__vector unsigned int)vec_cmpeq(a.v,b.v);
3099 }
3100 
3101 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,float> a, svec<4,float> b) {
3102  return ~(__vector unsigned int)vec_cmpeq(a.v,b.v);
3103 }
3104 
3105 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,float> a, svec<4,float> b) {
3106  return (__vector unsigned int)vec_cmplt(a.v,b.v);
3107 }
3108 
3109 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,float> a, svec<4,float> b) {
3110  return (__vector unsigned int)vec_cmple(a.v,b.v);
3111 }
3112 
3113 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,float> a, svec<4,float> b) {
3114  return (__vector unsigned int)vec_cmpgt(a.v,b.v);
3115 }
3116 
3117 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,float> a, svec<4,float> b) {
3118  return (__vector unsigned int)vec_cmpge(a.v,b.v);
3119 }
3120 
3122 
// double comparisons. equal/not_equal are per-lane scalar expansions; the
// strict orderings use vec_cmplt/vec_cmpgt on each backing sub-vector under
// POWER8 (packed down to 32-bit lane masks) and scalar lanes otherwise.
CMP_OP(double, equal, ==);
CMP_OP(double, not_equal, !=);

static FORCEINLINE svec<4,bool> svec_less_than(svec<4,double> a, svec<4,double> b) {
#ifdef __POWER8
 __vector signed long long tr1 = (__vector signed long long)vec_cmplt(a.v[0], b.v[0]);
 __vector signed long long tr2 = (__vector signed long long)vec_cmplt(a.v[1], b.v[1]);
 return vec_pack_p8(tr1,tr2);
#else
 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "less_than_double");
 unsigned int r0 = a[0] < b[0];
 unsigned int r1 = a[1] < b[1];
 unsigned int r2 = a[2] < b[2];
 unsigned int r3 = a[3] < b[3];
 return svec<4,bool>(r0,r1,r2,r3);
#endif
}

// lt|eq composition: for NaN operands both masks are false, so the result is
// false, matching scalar '<=' semantics.
static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,double> a, svec<4,double> b) {
 return svec_less_than(a, b) | svec_equal(a, b);
}


static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,double> a, svec<4,double> b) {
#ifdef __POWER8
 __vector signed long long tr1 = (__vector signed long long)vec_cmpgt(a.v[0], b.v[0]);
 __vector signed long long tr2 = (__vector signed long long)vec_cmpgt(a.v[1], b.v[1]);
 return vec_pack_p8(tr1,tr2);
#else
 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "greater_than_double");
 unsigned int r0 = a[0] > b[0];
 unsigned int r1 = a[1] > b[1];
 unsigned int r2 = a[2] > b[2];
 unsigned int r3 = a[3] > b[3];
 return svec<4,bool>(r0,r1,r2,r3);
#endif
}

static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,double> a, svec<4,double> b) {
 return svec_greater_than(a, b) | svec_equal(a, b);
}
3168 
3170 
3171 // 8. Cast
3172 
// Fast cast between svec types that share a single backing vector register
// (e.g. int32 <-> uint32): reinterprets v through the target's constructor.
#define CAST_OPT(SFROM, STO) \
template <class T> static T svec_cast(svec<LANES,SFROM> val); \
 \
template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
 return svec<LANES,STO>((val.v)); \
}

// Same as CAST_OPT, but for 64-bit types backed by two sub-vectors.
#define CAST_OPT64(SFROM, STO) \
template <class T> static T svec_cast(svec<LANES,SFROM> val); \
 \
template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
 return svec<LANES,STO>((val.v[0]),(val.v[1])); \
}
3196 
//i1 -> all
// Mask-to-value casts. CAST_L4 expands to a per-lane scalar conversion;
// CAST_OPT reuses the backing register directly for the 32-bit targets.
//CAST_L4(bool, bool);
CAST_L4(bool, int8_t); //better way: packing
CAST_L4(bool, uint8_t); //better way: packing
CAST_L4(bool, int16_t); //better way: packing
CAST_L4(bool, uint16_t); //better way: packing
CAST_OPT(bool, int32_t);
CAST_OPT(bool, uint32_t);
CAST_L4(bool, int64_t); //better way: unpack, singed ext
CAST_L4(bool, uint64_t);//better way: unpack, singed ext
CAST_L4(bool, float); //si to fp call
CAST_L4(bool, double);
3216 
3217 //i8 -> all
3218 CAST_L4(int8_t, bool);
3219 //CAST_L4(int8_t, int8_t);
3220 CAST_OPT(int8_t, uint8_t);
3221 //CAST_L4(int8_t, int16_t); //better way, use vec_unpackh
3222 template <class T> static T svec_cast(svec<4,int8_t> val);
3227  return vec_unpackh(val.v);
3228 }
3229 //CAST_L4(int8_t, uint16_t); //better way, sext + zero mask and
3230 template <class T> static T svec_cast(svec<4,int8_t> val);
3235  __vector uint16_t v = vec_unpackh(val.v);
3236  return (v);
3237 }
3238 //CAST_L4(int8_t, int32_t); //better way, use twice vec_unpack
3239 template <class T> static T svec_cast(svec<4,int8_t> val);
3244  return vec_unpackh(vec_unpackh(val.v));
3245 }
3246 //CAST_L4(int8_t, uint32_t); //better way, use unpack + zero mask
3247 template <class T> static T svec_cast(svec<4,int8_t> val);
3252  __vector uint32_t v = vec_unpackh(vec_unpackh(val.v));
3253  return (v);
3254 }
// int8 -> wide types: per-lane scalar conversions.
CAST_L4(int8_t, int64_t);
CAST_L4(int8_t, uint64_t);
CAST_L4(int8_t, float);
CAST_L4(int8_t, double);

//u8 -> all
CAST_L4(uint8_t, bool);
CAST_OPT(uint8_t, int8_t);
//CAST_L4(uint8_t, uint8_t);
3264 //CAST_L4(uint8_t, int16_t); //better way, use unpack + zero mask
3265 template <class T> static T svec_cast(svec<4,uint8_t> val);
3270  __vector int16_t v = vec_unpackh((__vector int8_t)val.v);
3271  __vector int16_t mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0};
3272  return (v & mask);
3273 }
3274 //CAST_L4(svec<4,uint8_t>, svec<4,uint16_t>, uint16_t); //better way use unpack + zero mask
3275 template <class T> static T svec_cast(svec<4,uint8_t> val);
3280  __vector uint16_t v = vec_unpackh((__vector int8_t)val.v);
3281  __vector uint16_t mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0};
3282  return (v & mask);
3283 }
3284 //CAST_L4(uint8_t, int32_t);
3285 template <class T> static T svec_cast(svec<4,uint8_t> val); //better way use unpack + zero mask
3290  __vector int32_t v = vec_unpackh(vec_unpackh((__vector int8_t)val.v));
3291  __vector int32_t mask = {0xFF, 0xFF, 0xFF, 0xFF};
3292  return (v & mask);
3293 }
3294 //CAST_L4(svec<4,uint8_t>, svec<4,uint32_t>, uint32_t);
3295 template <class T> static T svec_cast(svec<4,uint8_t> val); //better way use unpack + zero mask
3300  __vector uint32_t v = vec_unpackh(vec_unpackh((__vector int8_t)val.v));
3301  __vector uint32_t mask = {0xFF, 0xFF, 0xFF, 0xFF};
3302  return (v & mask);
3303 }
// uint8 -> wide types: per-lane scalar conversions.
CAST_L4(uint8_t, int64_t);
CAST_L4(uint8_t, uint64_t);
CAST_L4(uint8_t, float);
CAST_L4(uint8_t, double);

//i16 -> all
CAST_L4(int16_t, bool);
CAST_L4(int16_t, int8_t); //could use pack
CAST_L4(int16_t, uint8_t); //could use pack
//CAST_L4(int16_t, int16_t);
CAST_OPT(int16_t, uint16_t);
3315 //CAST_L4(int16_t, int32_t); //use unpack
3316 template <class T> static T svec_cast(svec<4,int16_t> val);
3321  return vec_unpackh(val.v);
3322 }
3323 //CAST_L4(int16_t, uint32_t); //use unpack and zeromaskout
3324 template <class T> static T svec_cast(svec<4,int16_t> val);
3329  __vector uint32_t v = vec_unpackh(val.v);
3330  return (v);
3331 }
// int16 -> wide types: per-lane scalar conversions.
CAST_L4(int16_t, int64_t);
CAST_L4(int16_t, uint64_t);
CAST_L4(int16_t, float);
CAST_L4(int16_t, double);

//u16 -> all
CAST_L4(uint16_t, bool);
CAST_L4(uint16_t, int8_t);
CAST_L4(uint16_t, uint8_t);
CAST_OPT(uint16_t, int16_t);
//CAST_L4(uint16_t, uint16_t);
3343 //CAST_L4(uint16_t, int32_t); //use unpack +mask
3344 template <class T> static T svec_cast(svec<4,uint16_t> val);
3349  __vector int32_t v = vec_unpackh((__vector int16_t)val.v);
3350  __vector int32_t mask = {0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF};
3351  return (v & mask);
3352 }
3353 //CAST_L4(uint16_t, uint32_t); //use unpack + mask
3354 template <class T> static T svec_cast(svec<4,uint16_t> val);
3359  __vector uint32_t v = vec_unpackh((__vector int16_t)val.v);
3360  __vector uint32_t mask = {0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF};
3361  return (v & mask);
3362 }
// uint16 -> wide types: per-lane scalar conversions.
CAST_L4(uint16_t, int64_t);
CAST_L4(uint16_t, uint64_t);
CAST_L4(uint16_t, float);
CAST_L4(uint16_t, double);

//i32 -> all
CAST_L4(int32_t, bool);
CAST_L4(int32_t, int8_t);
CAST_L4(int32_t, uint8_t);
CAST_L4(int32_t, int16_t);
CAST_L4(int32_t, uint16_t);
//CAST_L4(int32_t, int32_t);
CAST_OPT(int32_t, uint32_t);
3376 //CAST_L4(int32_t, int64_t); //use p8 unpack
3377 template <class T> static T svec_cast(svec<4,int32_t> val);
3382 #ifdef __POWER8
3383  return svec<4,int64_t>(vec_unpackh_p8((__vector unsigned int)val.v),vec_unpackl_p8((__vector unsigned int)val.v));
3384 #else
3385  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "cast i32 to i64");
3386  return svec<4,int64_t>((int64_t)val[0], (int64_t)val[1], (int64_t)val[2], (int64_t)val[3]);
3387 #endif
3388 }
3389 //CAST_L4(int32_t, uint64_t); //use p8 unpack
3390 template <class T> static T svec_cast(svec<4,int32_t> val);
3395 #ifdef __POWER8
3396  return svec<4,uint64_t>(vec_unpackh_p8((__vector unsigned int)val.v),vec_unpackl_p8((__vector unsigned int)val.v));
3397 #else
3398  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "cast i32 to u64");
3399  return svec<4,uint64_t>((uint64_t)val[0], (uint64_t)val[1], (uint64_t)val[2], (uint64_t)val[3]);
3400 #endif
3401 }
3402 //CAST_L4(int32_t, float); //use ctf
3403 template <class T> static T svec_cast(svec<4,int32_t> val);
3408  return vec_ctf(val.v,0);
3409 }
CAST_L4(int32_t, double);

//u32 -> all
// Narrowing casts are per-lane scalar; int32 target reuses the register.
CAST_L4(uint32_t, bool);
CAST_L4(uint32_t, int8_t);
CAST_L4(uint32_t, uint8_t);
CAST_L4(uint32_t, int16_t);
CAST_L4(uint32_t, uint16_t);
CAST_OPT(uint32_t, int32_t);
3419 //CAST_L4(uint32_t, uint32_t);
3420 //CAST_L4(uint32_t, int64_t); //use p8 unpack
3421 template <class T> static T svec_cast(svec<4,uint32_t> val);
3426 #ifdef __POWER8
3427  return svec<4,int64_t>(vec_unpackh_p8((__vector unsigned int)val.v),vec_unpackl_p8((__vector unsigned int)val.v));
3428 #else
3429  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "cast u32 to i64");
3430  return svec<4,int64_t>((int64_t)val[0], (int64_t)val[1], (int64_t)val[2], (int64_t)val[3]);
3431 #endif
3432 }
3433 //CAST_L4(uint32_t, uint64_t); //use p8 unpack
3434 template <class T> static T svec_cast(svec<4,uint32_t> val);
3439 #ifdef __POWER8
3440  return svec<4,uint64_t>(vec_unpackh_p8((__vector unsigned int)val.v),vec_unpackl_p8((__vector unsigned int)val.v));
3441 #else
3442  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "cast u32 to u64");
3443  return svec<4,uint64_t>((uint64_t)val[0], (uint64_t)val[1], (uint64_t)val[2], (uint64_t)val[3]);
3444 #endif
3445 }
// uint32 -> floating point: per-lane scalar conversions.
CAST_L4(uint32_t, float);
CAST_L4(uint32_t, double);

//i64-> all
// Narrowing int64 casts are per-lane scalar conversions.
CAST_L4(int64_t, bool);
CAST_L4(int64_t, int8_t);
CAST_L4(int64_t, uint8_t);
CAST_L4(int64_t, int16_t);
CAST_L4(int64_t, uint16_t);
3455 //CAST_L4(int64_t, int32_t); //use p8 trunk
3456 template <class T> static T svec_cast(svec<4,int64_t> val);
3461 #ifdef __POWER8
3462  return (__vector signed int)vec_pack_p8(val.v[0],val.v[1]);
3463 #else
3464  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "cast i64 to i32");
3465  return svec<4,int32_t>((int32_t)val[0], (int32_t)val[1], (int32_t)val[2], (int32_t)val[3]);
3466 #endif
3467 }
3468 //CAST_L4(svec<4,int64_t>, uint32_t); //use p8 trunk
3469 template <class T> static T svec_cast(svec<4,int64_t> val);
3474 #ifdef __POWER8
3475  return (__vector unsigned int)vec_pack_p8(val.v[0],val.v[1]);
3476 #else
3477  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "cast i64 to u32");
3478  return svec<4,uint32_t>((uint32_t)val[0], (uint32_t)val[1], (uint32_t)val[2], (uint32_t)val[3]);
3479 #endif
3480 }
//CAST_L4(int64_t, int64_t);
// int64 <-> uint64 share the two backing sub-vectors; reinterpret directly.
CAST_OPT64(int64_t, uint64_t);
CAST_L4(int64_t, float);
CAST_L4(int64_t, double);

//u64 -> all
// Narrowing uint64 casts are per-lane scalar conversions.
CAST_L4(uint64_t, bool);
CAST_L4(uint64_t, int8_t);
CAST_L4(uint64_t, uint8_t);
CAST_L4(uint64_t, int16_t);
CAST_L4(uint64_t, uint16_t);
3492 //CAST_L4(uint64_t, int32_t); //use p8 pack
3493 template <class T> static T svec_cast(svec<4,uint64_t> val);
3498 #ifdef __POWER8
3499  return (__vector signed int)vec_pack_p8(val.v[0],val.v[1]);
3500 #else
3501  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "cast u64 to i32");
3502  return svec<4,int32_t>((int32_t)val[0], (int32_t)val[1], (int32_t)val[2], (int32_t)val[3]);
3503 #endif
3504 }
3505 //CAST_L4(uint64_t, uint32_t); //use p8 pack
3506 template <class T> static T svec_cast(svec<4,uint64_t> val);
3511 #ifdef __POWER8
3512  return (__vector unsigned int)vec_pack_p8(val.v[0],val.v[1]);
3513 #else
3514  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "cast u64 to u32");
3515  return svec<4,uint32_t>((uint32_t)val[0], (uint32_t)val[1], (uint32_t)val[2], (uint32_t)val[3]);
3516 #endif
3517 }
3518 CAST_OPT64(uint64_t, int64_t);
3519 //CAST_L4(uint64_t, uint64_t);
3520 CAST_L4(uint64_t, float);
3521 CAST_L4(uint64_t, double);
3522 
3523 //float -> all
3524 CAST_L4(float, bool);
3525 //CAST_L4(float, int8_t); //use cts + pack+pack
3526 template <class T> static T svec_cast(svec<4,float> val);
3531  __vector signed int tsi=vec_splat_s32(0);//{0,0,0,0};
3532  return vec_pack(vec_pack(vec_cts(val.v, 0), tsi), (__vector signed short)tsi);
3533 }
3534 //CAST_L4(svec<4,float>, uint8_t); //use ctu + pack + pack
3535 template <class T> static T svec_cast(svec<4,float> val);
3540  __vector unsigned int tsi=vec_splat_s32(0);//{0,0,0,0};
3541  return vec_pack(vec_pack(vec_ctu(val.v, 0), tsi), (__vector unsigned short)tsi);
3542 
3543 }
3544 //CAST_L4(svec<4,float>, int16_t); //use cts + pack
3545 template <class T> static T svec_cast(svec<4,float> val);
3550  __vector signed int tsi=vec_splat_s32(0);//{0,0,0,0};
3551  return vec_pack(vec_cts(val.v, 0), tsi);
3552 }
3553 //CAST_L4(svec<4,float>, uint16_t); //use ctu + pack
3554 template <class T> static T svec_cast(svec<4,float> val);
3559  __vector unsigned int tsi=vec_splat_s32(0);//{0,0,0,0};
3560  return vec_pack(vec_ctu(val.v, 0), tsi);
3561 }
3562 //CAST_L4(svec<4,float>, int32_t);//use cts
3563 template <class T> static T svec_cast(svec<4,float> val);
3568  return vec_cts(val.v, 0);
3569 }
3570 //CAST_L4(svec<4,float>, uint32_t); //use ctu
3571 template <class T> static T svec_cast(svec<4,float> val);
3576  return vec_ctu(val.v, 0);
3577 }
3578 CAST_L4(float, int64_t);
3579 CAST_L4(float, uint64_t);
3580 //CAST_L4(float, float);
3581 CAST_L4(float, double);
3582 
3583 //double -> all
3584 CAST_L4(double, bool);
3585 CAST_L4(double, int8_t);
3586 CAST_L4(double, uint8_t);
3587 CAST_L4(double, int16_t);
3588 CAST_L4(double, uint16_t);
3589 CAST_L4(double, int32_t);
3590 CAST_L4(double, uint32_t);
3591 CAST_L4(double, int64_t);
3592 CAST_L4(double, uint64_t);
3593 CAST_L4(double, float);
3594 //CAST_L4(double, double);
3595 
3597 //typedef union {
3598 // int32_t i32;
3599 // uint32_t u32;
3600 // float f;
3601 // int64_t i64;
3602 // uint64_t u64;
3603 // double d;
3604 //} BitcastUnion;
3605 //
3606 //#define CAST_BITS(FROM, FROM_F, TO, TO_F) \
3607 //template <class T> static T svec_cast_bits(FROM val); \
3608 //template <> FORCEINLINE TO svec_cast_bits<TO>(FROM val) { \
3609 // INC_STATS_NAME(STATS_CAST_SLOW, 1, #FROME"-"#TO); \
3610 // BitcastUnion u[4]; \
3611 // u[0].FROM_F = val[0]; \
3612 // u[1].FROM_F = val[1]; \
3613 // u[2].FROM_F = val[2]; \
3614 // u[3].FROM_F = val[3]; \
3615 // return TO(u[0].TO_F, u[1].TO_F, u[2].TO_F, u[3].TO_F); \
3616 //}
3617 
3621 #define CAST_BITS_OPT(SFROM, STO) \
3622 template <class T> static T svec_cast_bits(svec<LANES,SFROM> val); \
3623  \
3626 template <> FORCEINLINE svec<LANES,STO> svec_cast_bits<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
3627  return svec<LANES,STO>((__vector STO)(val.v)); \
3628 }
3629 
3633 #define CAST_BITS_OPT64(SFROM, STO) \
3634 template <class T> static T svec_cast_bits(svec<LANES,SFROM> val); \
3635  \
3638 template <> FORCEINLINE svec<LANES,STO> svec_cast_bits<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
3639  return svec<LANES,STO>((__vector STO)(val.v[0]), (__vector STO)(val.v[1])); \
3640 }
3641 
3642 CAST_BITS_OPT(int32_t, float);
3643 CAST_BITS_OPT(uint32_t, float);
3644 CAST_BITS_OPT(float, int32_t);
3645 CAST_BITS_OPT(float, uint32_t);
3646 
3647 CAST_BITS_OPT64(int64_t, double);
3648 CAST_BITS_OPT64(uint64_t, double);
3649 CAST_BITS_OPT64(double, int64_t);
3650 CAST_BITS_OPT64(double, uint64_t);
3651 
3652 
3653 
3655 //
3656 // Class operations based on the above interfaces
3657 //
3659 
3663 #define SUBSCRIPT_FUNC_IMPL_VSX(STYPE) \
3664 FORCEINLINE STYPE& svec<LANES,STYPE>::operator[](int index) { \
3665  INC_STATS_NAME(STATS_INSERT, 1, "insert "#STYPE); \
3666  return ((STYPE *)&v)[index]; \
3667 } \
3668 const FORCEINLINE STYPE svec<LANES,STYPE>::operator[](int index) const { \
3669  return svec_extract(*this, index); \
3670 }
3671 
3673  svec_insert(m_self, m_index, value);
3674 }
3676  svec_insert(m_self, m_index, helper.operator uint32_t());
3677 }
3679  return svec_extract(*m_self, m_index);
3680 }
3681 const FORCEINLINE uint32_t svec<4,bool>::operator[](int index) const {
3682  return svec_extract(*this, index);
3683 }
3684 
3685 SUBSCRIPT_FUNC_IMPL_VSX(int8_t);
3686 SUBSCRIPT_FUNC_IMPL_VSX(uint8_t);
3695 
3696 
3697 
3704 static FORCEINLINE uint64_t svec_movmsk(svec<4,bool> mask) {
3705  uint64_t res;
3706  res = ((mask[0]>>31) & 0x1) |
3707  ((mask[1]>>30) & 0x2) |
3708  ((mask[2]>>29) & 0x4) |
3709  ((mask[3]>>28) & 0x8);
3710  INC_STATS_NAME(STATS_OTHER_SLOW,1, "svec_movmsk");
3711  return res;
3712 }
3713 
3719 FORCEINLINE bool svec<4,bool>::any_true() { return svec_any_true(*this); }
3720 
3726 FORCEINLINE bool svec<4,bool>::all_true() { return svec_all_true(*this); }
3727 
3733 FORCEINLINE bool svec<4,bool>::none_true() { return svec_none_true(*this); }
3734 
3739 FORCEINLINE svec<4,bool> svec<4,bool>::operator~() { return svec_not(*this); }
3740 
3763 FORCEINLINE svec<4,bool> svec<4,bool>::operator!() { return svec_not(*this); }
3764 
3777 
3784  return svec_equal(*this, a);
3785 }
3786 
3793  return svec_not_equal(*this, a);
3794 }
3795 
3801 VEC_CMP_IMPL(int8_t);
3802 VEC_CMP_IMPL(uint8_t);
3803 VEC_CMP_IMPL(int16_t);
3804 VEC_CMP_IMPL(uint16_t);
3805 VEC_CMP_IMPL(int32_t);
3806 VEC_CMP_IMPL(uint32_t);
3807 VEC_CMP_IMPL(int64_t);
3808 VEC_CMP_IMPL(uint64_t);
3811 
3823 
3824 VEC_INT_CLASS_METHOD_IMPL(int8_t, uint8_t);
3825 VEC_INT_CLASS_METHOD_IMPL(uint8_t, uint8_t);
3826 VEC_INT_CLASS_METHOD_IMPL(int16_t, uint16_t);
3827 VEC_INT_CLASS_METHOD_IMPL(uint16_t, uint16_t);
3828 VEC_INT_CLASS_METHOD_IMPL(int32_t, uint32_t);
3829 VEC_INT_CLASS_METHOD_IMPL(uint32_t, uint32_t);
3830 VEC_INT_CLASS_METHOD_IMPL(int64_t, uint64_t);
3831 VEC_INT_CLASS_METHOD_IMPL(uint64_t, uint64_t);
3832 
3835 
3836 #undef LANES
3837 } //end of namespace vsx4
3838 #endif /* POWER_VSX4_H_ */
3839 
#define COUT_FUNC_BOOL_DECL()
Definition: gsimd_utility.h:266
#define CAST_OPT64(SFROM, STO)
cast based on directly change the __vector type
Definition: power_vsx4.h:3188
__vector signed short v
Definition: power_vsx4.h:339
svec()
Default constructor.
Definition: power_vsx4.h:712
svec()
Default constructor.
Definition: power_vsx4.h:190
#define CAST_OPT(SFROM, STO)
cast based on directly change the __vector type
Definition: power_vsx4.h:3176
#define GATHER_STRIDE_L4(STYPE, OSTYPE)
macros for fast impl of gather base step
Definition: gsimd_utility.h:682
__vector unsigned int v
Definition: power_vsx4.h:501
svec(int8_t a, int8_t b, int8_t c, int8_t d)
Constructor.
Definition: power_vsx4.h:251
svec(__vector unsigned long long a, __vector unsigned long long b)
For internal use only. Construct svec&lt;4,uint64_t&gt; with two __vector unsigned long long values...
Definition: power_vsx4.h:644
svec(__vector double a, __vector double b)
For internal use only. Construct svec&lt;4,double&gt; with two __vector double values.
Definition: power_vsx4.h:793
#define VEC_INT_CLASS_METHOD_DECL(STYPE, USTYPE)
macros method definition for integer vector only Note: shift&#39;s operator can only be unsigned vector ...
Definition: gsimd_utility.h:379
#define BINARY_OP_OPT(STYPE, NAME, OP)
macros based on __vector type&#39;s operator overload
Definition: power_vsx4.h:2427
#define TERNERY_L4(STYPE)
Definition: gsimd_utility.h:984
#define CMP_ALL_MASKED_OP(STYPE)
Definition: gsimd_utility.h:1099
__vector unsigned int v
use __vector unsigned int v for storage
Definition: power_vsx4.h:184
#define SCATTER_BASE_OFFSETS_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:789
svec(__vector signed char vv)
For internal use only.
Definition: power_vsx4.h:246
#define BINARY_OP_OPT64(STYPE, NAME, OP)
Definition: power_vsx4.h:2432
Definition: gsimd_utility.h:93
svec(double a)
Constructor.
Definition: power_vsx4.h:812
svec(__vector unsigned char vv)
For internal use only.
Definition: power_vsx4.h:298
svec()
Default constructor.
Definition: power_vsx4.h:788
#define CAST_BITS_OPT(SFROM, STO)
cast based on directly change the __vector type
Definition: power_vsx4.h:3621
#define GATHER_GENERAL_L4(STYPE, PSTYPE)
slow implementation of gather general Must use template to specify the return type ...
Definition: gsimd_utility.h:617
svec()
Default constructor.
Definition: power_vsx4.h:395
Data representation and operations on a vector of 4 boolean values. This is used in predicated vector...
Definition: power_vsx4.h:182
#define CMP_ALL_NOMASK_OP_L4(STYPE)
Definition: gsimd_utility.h:1091
svec(int64_t a)
Constructor.
Definition: power_vsx4.h:590
#define VEC_FLOAT_CLASS_METHOD_DECL(STYPE)
Definition: gsimd_utility.h:393
__vector double v[2]
Definition: power_vsx4.h:783
#define VEC_CLASS_METHOD_DECL(STYPE)
macros for non-mask i8 - double types&#39;s method
Definition: gsimd_utility.h:350
svec< 4, bool > svec_select(svec< 4, bool > mask, svec< 4, bool > a, svec< 4, bool > b)
construct c by selecting elements from two input vectors according to the mask
Definition: power_vsx4.h:1126
svec()
Default constructor.
Definition: power_vsx4.h:639
svec(__vector unsigned int vv)
For internal use only.
Definition: power_vsx4.h:197
#define SVEC_BOOL_CLASS_METHOD_DECL()
macros for svec&lt;N,bool&gt; class&#39;s class method
Definition: gsimd_utility.h:330
svec(__vector signed long long a, __vector signed long long b)
For internal use only. Construct svec&lt;4,int64_t&gt; with two __vector signed long long values...
Definition: power_vsx4.h:571
svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
Constructor.
Definition: power_vsx4.h:204
#define SHUFFLES_L4(STYPE)
macro for shuffle/shuffle2 methods implementation
Definition: gsimd_utility.h:537
svec(float a, float b, float c, float d)
Constructor.
Definition: power_vsx4.h:723
svec(__vector unsigned int vv)
For internal use only.
Definition: power_vsx4.h:512
#define BROADCAST_OPT32(STYPE)
Definition: power_vsx4.h:1280
#define INC_STATS_NAME(stat, inc, opname)
Definition: gsimd_utility.h:156
svec(__vector signed int vv)
For internal use only.
Definition: power_vsx4.h:452
#define COUT_FUNC_DECL(STYPE)
Definition: gsimd_utility.h:283
svec(uint32_t a)
Constructor.
Definition: power_vsx4.h:214
#define UNARY_OP_OPT(STYPE, NAME, OP)
Definition: power_vsx4.h:2324
svec(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
Constructor.
Definition: power_vsx4.h:652
#define UNARY_OP_L4(STYPE, NAME, OP)
Definition: gsimd_utility.h:841
svec(__vector float vv)
For internal use only.
Definition: power_vsx4.h:718
data representation and operations on a vector of 4 signed short.
Definition: power_vsx4.h:338
#define VEC_CMP_IMPL(STYPE)
Definition: gsimd_utility.h:1175
svec(void *p0, void *p1, void *p2, void *p3)
Constructor.
Definition: power_vsx4.h:1516
svec()
Default constructor.
Definition: power_vsx4.h:292
data representation and operations on a vector of 4 unsigned long long.
Definition: power_vsx4.h:633
svec(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
Constructor.
Definition: power_vsx4.h:303
#define SUBSCRIPT_FUNC_DECL(STYPE)
macros to define a intrinsic based subscript opertor
Definition: gsimd_utility.h:247
svec(int8_t a)
Constructor.
Definition: power_vsx4.h:260
svec(__vector unsigned short vv)
For internal use only.
Definition: power_vsx4.h:401
data representation and operations on a vector of 4 signed int.
Definition: power_vsx4.h:440
svec()
Default constructor.
Definition: power_vsx4.h:344
#define MVEC_CLASS_METHOD_IMPL(STYPE)
mask class&#39;s class method impl
Definition: gsimd_utility.h:1285
svec< 4, int32_t > svec_madd(svec< 4, int32_t > a, svec< 4, int32_t > b, svec< 4, int32_t > c)
vector multiply and add operation. return a * b + c.
Definition: power_vsx4.h:2802
#define INSERT_EXTRACT_OPT(STYPE)
Definition: power_vsx4.h:851
#define SUBSCRIPT_FUNC_BOOL_DECL(STYPE)
Definition: gsimd_utility.h:251
#define VEC_CLASS_METHOD_IMPL(STYPE)
Definition: gsimd_utility.h:1301
#define GATHER_BASE_OFFSETS_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:658
data representation and operations on a vector of 4 signed long long.
Definition: power_vsx4.h:560
#define UNARY_OP_OPT64(STYPE, NAME, OP)
macros for 64bit object, i64/u64/double
Definition: power_vsx4.h:2332
__vector unsigned long long v[2]
Definition: power_vsx4.h:634
data representation and operations on a vector of 4 unsigned short.
Definition: power_vsx4.h:389
#define MASKED_LOAD_STORE_L4(STYPE)
Definition: gsimd_utility.h:797
#define VEC_FLOAT_CLASS_METHOD_IMPL(STYPE)
Definition: gsimd_utility.h:1433
svec(uint32_t a)
Constructor.
Definition: power_vsx4.h:526
#define MAX_MIN_REDUCE_METHODS(STYPE)
Definition: power_vsx4.h:2871
#define SCATTER_STRIDE_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:715
svec(double a, double b, double c, double d)
Constructor.
Definition: power_vsx4.h:801
svec(uint64_t a)
Constructor.
Definition: power_vsx4.h:663
data representation and operations on a vector of 4 unsigned int.
Definition: power_vsx4.h:500
#define SUBSCRIPT_FUNC_IMPL_VSX(STYPE)
this macro uses vsx specific intrinsics to do extract, insert
Definition: power_vsx4.h:3663
__vector signed long long v[2]
Definition: power_vsx4.h:561
svec(float a)
Constructor.
Definition: power_vsx4.h:732
#define LOAD_STORE(STYPE)
Definition: gsimd_utility.h:419
svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
Constructor.
Definition: power_vsx4.h:517
svec(uint8_t a)
Constructor.
Definition: power_vsx4.h:313
__vector unsigned short v
Definition: power_vsx4.h:390
data representation and operations on a vector of 4 double.
Definition: power_vsx4.h:782
#define CAST_L4(SFROM, STO)
Definition: gsimd_utility.h:1124
#define BINARY_OP_L4(STYPE, NAME, OP)
macros for generic slow imple of binary operation
Definition: gsimd_utility.h:880
svec(int16_t a)
Constructor.
Definition: power_vsx4.h:364
svec< 4, int32_t > svec_nmsub(svec< 4, int32_t > a, svec< 4, int32_t > b, svec< 4, int32_t > c)
vector multiply and add operation. return - ( a * b - c).
Definition: power_vsx4.h:2802
svec()
Default constructor.
Definition: power_vsx4.h:240
svec(int a, int b, int c, int d)
Constructor.
Definition: power_vsx4.h:457
data representation and operations on a vector of 4 float.
Definition: power_vsx4.h:706
__vector signed char v
Definition: power_vsx4.h:234
svec< 4,float > svec_preduce_add(svec< 4, float > v0, svec< 4, float > v1, svec< 4, float > v2, svec< 4, float > v3)
Definition: power_vsx4.h:2888
svec(int32_t a)
Constructor.
Definition: power_vsx4.h:466
#define SCATTER_GENERAL_L4(STYPE, PSTYPE)
Definition: gsimd_utility.h:756
#define BINARY_OP_FUNC_L4(STYPE, NAME, FUNC)
Definition: gsimd_utility.h:904
#define ROTATE_L4(STYPE)
macro for rotate method implementation
Definition: gsimd_utility.h:507
svec(uint16_t a, uint16_t b, uint16_t c, uint16_t d)
Constructor.
Definition: power_vsx4.h:406
#define BINARY_OP_SCALAR_L4(STYPE, STYPE2, NAME, OP)
macros for binary: vector op scalar
Definition: gsimd_utility.h:917
#define COUT_FUNC_CHAR_DECL(STYPE)
Definition: gsimd_utility.h:275
#define CAST_BITS_OPT64(SFROM, STO)
cast based on directly change the __vector type
Definition: power_vsx4.h:3633
#define CMP_OP(STYPE, NAME, OP)
macros for binary: vector op scalar
Definition: gsimd_utility.h:1049
svec(int64_t a, int64_t b, int64_t c, int64_t d)
Constructor.
Definition: power_vsx4.h:579
#define BIN_VEC_SCAL(STYPE)
Definition: power_vsx4.h:2677
svec(__vector signed short vv)
For internal use only.
Definition: power_vsx4.h:350
svec()
Default constructor.
Definition: power_vsx4.h:446
#define BINARY_OP_OPT_FUNC(STYPE, STYPE2, NAME, FUNC)
Definition: power_vsx4.h:2437
svec()
Default constructor.
Definition: power_vsx4.h:506
#define CMP_OP_L4(STYPE, NAME, OP)
Definition: gsimd_utility.h:1057
svec< 4, int32_t > svec_msub(svec< 4, int32_t > a, svec< 4, int32_t > b, svec< 4, int32_t > c)
vector multiply and add operation. return a * b - c.
Definition: power_vsx4.h:2802
#define INSERT_EXTRACT_OPT64(STYPE)
Definition: power_vsx4.h:859
Definition: power_vsx4.h:128
#define SELECT_BOOLCOND(STYPE)
macros for svec&#39;s select by bool scalar method implementation
Definition: gsimd_utility.h:459
svec()
Default constructor,.
Definition: power_vsx4.h:566
#define VEC_INT_CLASS_METHOD_IMPL(STYPE, STYPE2)
Definition: gsimd_utility.h:1394
#define FORCEINLINE
Definition: gsimd_utility.h:175
#define BROADCAST_OPT64(STYPE)
Definition: power_vsx4.h:1286
__vector signed int v
Definition: power_vsx4.h:441
svec(uint16_t a)
Constructor.
Definition: power_vsx4.h:415
__vector float v
Definition: power_vsx4.h:707
svec(int16_t a, int16_t b, int16_t c, int16_t d)
Constructor.
Definition: power_vsx4.h:355
__vector unsigned char v
Definition: power_vsx4.h:287
#define BROADCAST_L4(STYPE)
macro for broadcast method implementation for lanes4 All broadcast are slow implementation ...
Definition: gsimd_utility.h:485