// Generic SIMD Intrinsic Library API 0.6
// gsimd_utility.h -- utility macros and generic (scalar-fallback) implementations.
78 #ifndef GSIMD_UTILITY_H_
79 #define GSIMD_UTILITY_H_
80 
81 #include <stdint.h>
82 #include <iostream>
83 #define DUMP(v) std::cout << #v << ":" << (v) << std::endl
84 #if ((ULONG_MAX) == (UINT_MAX))
85 #define IS32BIT
86 #else
87 #define IS64BIT
88 #endif
89 
90 //simple trick to generate a compiler error if invalid template
91 //arguments are used
92 template <int Lanes, class T>
94  typedef void type;
95 };
96 
97 #if defined(ENABLE_STATS) || defined(ENABLE_STATS_AND_TRACE)
// Counters for the slow (scalar-fallback) intrinsic paths. Each enumerator
// indexes into gStats[] and pairs with the same-position string in
// gStatNames[]; keep all three in sync.
enum {
  STATS_MASKED_LOAD = 0,
  STATS_MASKED_STORE,
  STATS_EXTRACT,
  STATS_INSERT,
  STATS_UNARY_SLOW,
  STATS_BINARY_SLOW,
  STATS_CAST_SLOW,
  STATS_BINARY_FUNC_SLOW,
  STATS_COMPARE_SLOW,
  STATS_GATHER_SLOW,
  STATS_SCATTER_SLOW,
  STATS_SMEAR_SLOW,
  STATS_LOAD_SLOW,
  STATS_STORE_SLOW,
  STATS_OTHER_SLOW,
  LAST_STATS   // sentinel: number of counters
};
116 
117 static const char* gStatNames[LAST_STATS] = {
118  "masked_load",
119  "masked_store",
120  "extract",
121  "insert",
122  "unary_slow",
123  "binary_slow",
124  "cast_slow",
125  "binary_func_slow",
126  "compare_slow",
127  "gather_slow",
128  "scatter_slow",
129  "smear_slow",
130  "load_slow",
131  "store_slow",
132  "other_slot"
133 };
134 static float gStats[LAST_STATS] = {0,0,0,0,0,0,0,0,0,0,0,0,0};
135 
136 extern "C" void print_stats()
137 {
138  std::cout << " DUMP INTRINSICS STATS" << std::endl;
139  for (int i=0; i<LAST_STATS; i++) {
140  if (gStats[i] > 0) {
141  std::cout << " - " << gStatNames[i] << ": " << gStats[i] << std::endl;
142  }
143  }
144 }
145 
146 #define INC_STATS(stat,inc) gStats[stat] += inc;
147 #ifdef ENABLE_STATS_AND_TRACE
148 #define INC_STATS_NAME(stat,inc,opname) \
149  std::cout << "slow impl of " << opname << " @ "\
150  << __FILE__ << " Line: " << __LINE__ << std::endl; \
151  gStats[stat] += inc
152 #else
153 #define INC_STATS_NAME(stat,inc,opname) gStats[stat] += inc
154 #endif // ENABLE_STATS_AND_TRACE
155 #else
156 #define INC_STATS_NAME(stat,inc,opname)
157 #define INC_STATS(stat,inc)
158 #endif
159 
// Report an unimplemented operation and abort (via assert) in debug builds.
// do/while(0) makes the macro a single statement so it is safe in unbraced
// if/else bodies; the original two bare statements were not.
// Requires <iostream> and <assert.h> (both included at the top of this file).
#define NOT_IMPLEMENTED(msg) \
  do { \
    std::cout << "WARNING: operation " << msg << " is not implemented yet" << std::endl; \
    assert(false); \
  } while (0)
163 
164 /*
165  * @brief macros to define FORCEINLINE
166  * FORCEINLINE is widely used in the interfaces
167  */
#ifdef _MSC_VER
#define FORCEINLINE __forceinline
#define PRE_ALIGN(x) /*__declspec(align(x))*/
#define POST_ALIGN(x)
// Argument is now parenthesized so expressions such as roundf(a + b) or
// arguments containing low-precedence operators expand correctly.
// NOTE(review): floor(x + .5) differs from C99 round() for negative halves
// (e.g. -2.5 -> -2 instead of -3); behavior kept as-is -- confirm intent.
#define roundf(x) (floorf((x) + .5f))
#define round(x) (floor((x) + .5))
#else
#define FORCEINLINE inline __attribute__((always_inline))
#define PRE_ALIGN(x)
#define POST_ALIGN(x) __attribute__ ((aligned(x)))
#endif
179 
180 
// Scalar ("1-lane") aliases used by the generic implementation; the suffix
// encodes signedness and width (u8..i64) or floating type (f = float,
// d = double).
typedef uint8_t svec1_u8;
typedef int8_t svec1_i8;
typedef uint16_t svec1_u16;
typedef int16_t svec1_i16;
typedef uint32_t svec1_u32;
typedef int32_t svec1_i32;
typedef uint64_t svec1_u64;
typedef int64_t svec1_i64;
typedef float svec1_f;
typedef double svec1_d;
194 
195 
196 /*
197  * Register scalar type names and use iu_get_type_name to query the short type name
198  */
199 #pragma once
200 template<typename T> const char *iu_get_type_name();
201 
202 #define DEFINE_TYPE_NAME(type, name) \
203  template<> FORCEINLINE const char *iu_get_type_name<type>(){return name;} \
204 
205 //Scalar type
206 DEFINE_TYPE_NAME(int8_t, "i8");
207 DEFINE_TYPE_NAME(uint8_t, "u8");
208 DEFINE_TYPE_NAME(int16_t, "i16");
209 DEFINE_TYPE_NAME(uint16_t, "u16");
210 DEFINE_TYPE_NAME(int32_t, "i32");
211 DEFINE_TYPE_NAME(uint32_t, "u32");
212 DEFINE_TYPE_NAME(int64_t, "i64");
213 DEFINE_TYPE_NAME(uint64_t, "u64");
214 DEFINE_TYPE_NAME(float, "f");
215 DEFINE_TYPE_NAME(double, "d");
216 
217 #pragma once
218 template<typename T> FORCEINLINE void stdout_scalar(std::ostream &out, T v) {
219  out << v;
220 }
221 
222 template<> FORCEINLINE void stdout_scalar<int8_t>(std::ostream &out, int8_t v) {
223  out << int16_t(v);
224 }
225 
226 template<> FORCEINLINE void stdout_scalar<uint8_t>(std::ostream &out, uint8_t v) {
227  out << uint16_t(v);
228 }
229 
230 #pragma once
231 template<int N> const bool check_lanes(int n);
232 
233 template<> FORCEINLINE const bool check_lanes<2>(int n) { return n == 2; }
234 template<> FORCEINLINE const bool check_lanes<4>(int n) { return n == 4; }
235 template<> FORCEINLINE const bool check_lanes<8>(int n) { return n == 8; }
236 template<> FORCEINLINE const bool check_lanes<16>(int n) { return n == 16; }
237 
238 
239 
241 // Below are macros for generic implementations
243 
// Declares operator[] for a vector class: mutable lane reference plus a
// const by-value read.
#define SUBSCRIPT_FUNC_DECL(STYPE) \
  FORCEINLINE STYPE& operator[](int index); \
  const FORCEINLINE STYPE operator[](int index) const;

// Bool-vector subscript. The mutable operator[] returns a proxy (Helper)
// holding the vector pointer and lane index; assignment and reads are
// forwarded through the proxy's operators (defined per target).
#define SUBSCRIPT_FUNC_BOOL_DECL(STYPE) \
\
  struct Helper { \
  int m_index; svec<LANES,bool> *m_self; \
  FORCEINLINE Helper(svec<LANES,bool> *p_vec, int index): m_self(p_vec), m_index(index) {} \
  FORCEINLINE void operator=(STYPE value); \
  FORCEINLINE void operator=(Helper helper); \
  FORCEINLINE operator STYPE() const; \
  }; \
  FORCEINLINE Helper operator[](int index) { return Helper(this, index);} \
  const FORCEINLINE STYPE operator[](int index) const;

//Only for I1 output, use 0/1 as output
#define COUT_FUNC_BOOL_DECL() \
  friend std::ostream& operator<< (std::ostream &out, const svec<LANES,bool> &v) { \
  out << "svec<" << LANES << ",bool> " << "[" << (v[0]?1:0); \
  for(int i = 1; i < LANES ; i++) { out << ", " << (v[i]?1:0);} \
  out << "]"; \
  return out; \
  } \

//i8 type need special one for output char as int number
#define COUT_FUNC_CHAR_DECL(STYPE) \
  friend std::ostream& operator<< (std::ostream &out, const svec<LANES,STYPE> &v) { \
  out << "svec<" << LANES << "," << #STYPE <<"> [" << short(v[0]); \
  for(int i = 1; i < LANES ; i++) { out << ", " << short(v[i]);} \
  out << "]"; \
  return out; \
  } \

// Generic stream output: "svec<LANES,STYPE> [v0, v1, ...]".
#define COUT_FUNC_DECL(STYPE) \
  friend std::ostream& operator<< (std::ostream &out, const svec<LANES,STYPE> &v) { \
  out << "svec<" << LANES << "," << #STYPE <<"> [" << v[0]; \
  for(int i = 1; i < LANES ; i++) { out << ", " << v[i];} \
  out << "]"; \
  return out; \
  } \
290 
291 
// Comparison-operator declarations shared by every svec<LANES,STYPE>
// specialization; each returns a per-lane boolean vector.
#define VEC_CMP_DECL(STYPE) \
  FORCEINLINE svec<LANES,bool> operator==(svec<LANES,STYPE> a); \
  FORCEINLINE svec<LANES,bool> operator!=(svec<LANES,STYPE> a); \
  FORCEINLINE svec<LANES,bool> operator<(svec<LANES,STYPE> a); \
  FORCEINLINE svec<LANES,bool> operator<=(svec<LANES,STYPE> a); \
  FORCEINLINE svec<LANES,bool> operator>(svec<LANES,STYPE> a); \
  FORCEINLINE svec<LANES,bool> operator>=(svec<LANES,STYPE> a); \


// Unary negation plus horizontal reductions (add/max/min) over all lanes.
#define VEC_UNARY_DECL(STYPE) \
  FORCEINLINE svec<LANES,STYPE> operator-(); \
  FORCEINLINE STYPE reduce_add(); \
  FORCEINLINE STYPE reduce_max(); \
  FORCEINLINE STYPE reduce_min();

// Element-wise arithmetic against another vector or a broadcast scalar.
#define VEC_BIN_DECL(STYPE) \
  FORCEINLINE svec<LANES,STYPE> operator+(svec<LANES,STYPE> a); \
  FORCEINLINE svec<LANES,STYPE> operator+(STYPE s); \
  FORCEINLINE svec<LANES,STYPE> operator-(svec<LANES,STYPE> a); \
  FORCEINLINE svec<LANES,STYPE> operator-(STYPE s); \
  FORCEINLINE svec<LANES,STYPE> operator*(svec<LANES,STYPE> a); \
  FORCEINLINE svec<LANES,STYPE> operator*(STYPE s); \
  FORCEINLINE svec<LANES,STYPE> operator/(svec<LANES,STYPE> a); \
  FORCEINLINE svec<LANES,STYPE> operator/(STYPE s);


// Method set for the boolean (mask) vector class: equality, load/store,
// any/all/none reductions and logical/bitwise operators.
// NOTE(review): overloaded && and || lose C++ short-circuit semantics --
// both operands are always evaluated.
#define SVEC_BOOL_CLASS_METHOD_DECL() \
  FORCEINLINE svec<LANES,bool> operator==(svec<LANES,bool> a); \
  FORCEINLINE svec<LANES,bool> operator!=(svec<LANES,bool> a); \
  static FORCEINLINE svec<LANES,bool> load(svec<LANES,bool>* p); \
  FORCEINLINE void store(svec<LANES,bool>* p); \
  FORCEINLINE bool any_true(); \
  FORCEINLINE bool all_true(); \
  FORCEINLINE bool none_true(); \
  FORCEINLINE svec<LANES,bool> operator|(svec<LANES,bool>); \
  FORCEINLINE svec<LANES,bool> operator&(svec<LANES,bool> a); \
  FORCEINLINE svec<LANES,bool> operator^(svec<LANES,bool> a); \
  FORCEINLINE svec<LANES,bool> operator~(); \
  FORCEINLINE svec<LANES,bool> operator!(); \
  FORCEINLINE svec<LANES,bool> operator&&(svec<LANES,bool> a); \
  FORCEINLINE svec<LANES,bool> operator||(svec<LANES,bool> a);


// Full method set shared by all arithmetic vector classes: comparisons,
// unary ops, binary ops, (masked) load/store, gather/scatter by pointer,
// base+offset and stride addressing, plus broadcast/rotate/shuffle/abs.
#define VEC_CLASS_METHOD_DECL(STYPE) \
  VEC_CMP_DECL(STYPE);\
  VEC_UNARY_DECL(STYPE);\
  VEC_BIN_DECL(STYPE);\
  static FORCEINLINE svec<LANES,STYPE> load(svec<LANES,STYPE>* p); \
  FORCEINLINE void store(svec<LANES,STYPE>* p); \
  static FORCEINLINE svec<LANES,STYPE> masked_load(svec<LANES,STYPE>* p, svec<LANES,bool> mask); \
  FORCEINLINE void masked_store(svec<LANES,STYPE>* p, svec<LANES,bool> mask); \
  static FORCEINLINE svec<LANES,STYPE> load_const(const STYPE* p); \
  static FORCEINLINE svec<LANES,STYPE> load_and_splat(STYPE* p); \
  static FORCEINLINE svec<LANES,STYPE> gather(svec<LANES,void*> ptrs, svec<LANES,bool> mask);\
  FORCEINLINE void scatter(svec<LANES,void*> ptrs, svec<LANES,bool> mask); \
  static FORCEINLINE svec<LANES,STYPE> gather_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int32_t> offsets, svec<LANES,bool> mask);\
  static FORCEINLINE svec<LANES,STYPE> gather_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int64_t> offsets, svec<LANES,bool> mask);\
  FORCEINLINE void scatter_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int32_t> offsets, svec<LANES,bool> mask); \
  FORCEINLINE void scatter_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int64_t> offsets, svec<LANES,bool> mask); \
  static FORCEINLINE svec<LANES,STYPE> gather_stride(STYPE* b, int32_t off, int32_t stride);\
  static FORCEINLINE svec<LANES,STYPE> gather_stride(STYPE* b, int64_t off, int64_t stride);\
  FORCEINLINE void scatter_stride(STYPE* b, int32_t off, int32_t stride); \
  FORCEINLINE void scatter_stride(STYPE* b, int64_t off, int64_t stride); \
  FORCEINLINE svec<LANES,STYPE> broadcast(int32_t index); \
  FORCEINLINE svec<LANES,STYPE> rotate(int32_t index); \
  FORCEINLINE svec<LANES,STYPE> shuffle(svec<LANES, int32_t> index); \
  FORCEINLINE svec<LANES,STYPE> abs();

// Extra methods for integer element types: bitwise ops, shifts (by vector
// of USTYPE or by scalar count) and modulo.
#define VEC_INT_CLASS_METHOD_DECL(STYPE, USTYPE) \
  FORCEINLINE svec<LANES, STYPE> operator|(svec<LANES, STYPE> a); \
  FORCEINLINE svec<LANES, STYPE> operator&(svec<LANES, STYPE> a); \
  FORCEINLINE svec<LANES, STYPE> operator^(svec<LANES, STYPE> a); \
  FORCEINLINE svec<LANES, STYPE> operator<<(svec<LANES, USTYPE> a); \
  FORCEINLINE svec<LANES, STYPE> operator<<(int32_t s); \
  FORCEINLINE svec<LANES, STYPE> operator>>(svec<LANES, USTYPE> a); \
  FORCEINLINE svec<LANES, STYPE> operator>>(int32_t s); \
  FORCEINLINE svec<LANES, STYPE> operator%(svec<LANES, STYPE> a); \
  FORCEINLINE svec<LANES, STYPE> operator%(STYPE s);

// Extra methods for floating-point element types: rounding, sqrt/rcp/rsqrt
// and transcendental functions, all applied per lane.
#define VEC_FLOAT_CLASS_METHOD_DECL(STYPE) \
  FORCEINLINE svec<LANES,STYPE> round(); \
  FORCEINLINE svec<LANES,STYPE> floor(); \
  FORCEINLINE svec<LANES,STYPE> ceil(); \
  FORCEINLINE svec<LANES,STYPE> sqrt(); \
  FORCEINLINE svec<LANES,STYPE> rcp(); \
  FORCEINLINE svec<LANES,STYPE> rsqrt(); \
  FORCEINLINE svec<LANES,STYPE> exp(); \
  FORCEINLINE svec<LANES,STYPE> log(); \
  FORCEINLINE svec<LANES,STYPE> pow(svec<LANES,STYPE> a);
403 
404 // 0. Extract/Insert
// Read (svec_extract) or write (svec_insert) a single lane through the
// vector's underlying storage member 'v'.
#define INSERT_EXTRACT(STYPE) \
  static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
  return v.v[index]; \
  } \
  static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
  v->v[index] = val; \
  }

// 1. Load / Store

// Generic lane-by-lane load and store; both treat *p as a plain STYPE array
// and count against the slow-path load/store stats.
#define LOAD_STORE(STYPE) \
  \
static FORCEINLINE svec<LANES,STYPE> svec_load(const svec<LANES,STYPE> *p) { \
  STYPE *ptr = (STYPE *)p; \
  svec<LANES,STYPE> ret; \
  INC_STATS_NAME(STATS_LOAD_SLOW, 1, "load:svec_"#STYPE); \
  for (int i = 0; i < LANES; ++i) {ret[i] = ptr[i];} \
  return ret; \
} \
  \
static FORCEINLINE void svec_store(svec<LANES,STYPE> *p, svec<LANES,STYPE> v) { \
  STYPE *ptr = (STYPE *)p; \
  INC_STATS_NAME(STATS_STORE_SLOW, 1, "store:svec_"#STYPE); \
  for (int i = 0; i < LANES; ++i) { ptr[i] = v[i]; } \
}

// Per-lane select: ret[i] = mask[i] ? a[i] : b[i].
#define SELECT(STYPE) \
static FORCEINLINE svec<LANES,STYPE> svec_select(svec<LANES,bool> mask, svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
  svec<LANES,STYPE> ret; \
  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "select:svec_"#STYPE); \
  for (int i = 0; i < LANES; ++i) {ret[i] = mask[i] ? a[i] : b[i];} \
  return ret; \
}

// Whole-vector select on a single scalar condition.
#define SELECT_BOOLCOND(STYPE) \
\
FORCEINLINE svec<LANES,STYPE> svec_select(bool cond, svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
  return cond ? a : b; \
}
466 
467 // 4. broadcast/rotate/shuffle/smear/setzero
// Broadcast lane 'index' of v to every lane (generic loop version).
#define BROADCAST(STYPE) \
  static FORCEINLINE svec<LANES,STYPE> svec_broadcast(svec<LANES,STYPE> v, int index) { \
  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "broadcast"); \
  STYPE bval = v[index]; \
  svec<LANES,STYPE> ret; \
  for (int i = 0; i < LANES; ++i) { ret[i] = bval;} \
  return ret; \
  }

// 4-lane broadcast using the 4-argument constructor directly.
#define BROADCAST_L4(STYPE) \
  static FORCEINLINE svec<LANES,STYPE> svec_broadcast(svec<LANES,STYPE> v, int index) { \
  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "broadcast"); \
  STYPE bval = v[index]; \
  svec<LANES,STYPE> ret(bval,bval,bval,bval); \
  return ret; \
  }

// Rotate lanes by 'index'. The '& (LANES-1)' wrap assumes LANES is a power
// of two.
#define ROTATE(STYPE) \
  static FORCEINLINE svec<LANES,STYPE> svec_rotate(svec<LANES,STYPE> v, int index) { \
  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "rotate"); \
  svec<LANES,STYPE> ret; \
  for (int i = 0; i < LANES; ++i) { ret[i] = v[(i+index) & (LANES-1)];} \
  return ret; \
  }

// 4-lane unrolled rotate (wrap hard-coded to '& 0x3').
#define ROTATE_L4(STYPE) \
  static FORCEINLINE svec<LANES,STYPE> svec_rotate(svec<LANES,STYPE> v, int index) { \
  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "rotate"); \
  svec<LANES,STYPE> ret (v[(0+index) & 0x3], \
  v[(1+index) & 0x3], \
  v[(2+index) & 0x3], \
  v[(3+index) & 0x3]); \
  return ret; \
  }


// Permute lanes by a per-lane index vector (indices wrapped with
// '& (LANES-1)', so LANES must be a power of two). svec_shuffle2 (two-source
// shuffle) is declared but not implemented: it warns and asserts via
// NOT_IMPLEMENTED.
#define SHUFFLES(STYPE) \
  static FORCEINLINE svec<LANES,STYPE> svec_shuffle(svec<LANES,STYPE> v, svec<LANES,int32_t> index) { \
  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "shuffle"); \
  svec<LANES,STYPE> ret; \
  for (int i = 0; i < LANES; ++i) { ret[i] = v[index[i] & (LANES-1)]; }\
  return ret; \
  } \
  static FORCEINLINE svec<LANES,STYPE> svec_shuffle2(svec<LANES,STYPE> v0, svec<LANES,STYPE> v1, svec<LANES,int32_t> index) { \
  svec<LANES,STYPE> ret; \
  NOT_IMPLEMENTED("shuffle 2"); \
  return ret; \
}

// 4-lane unrolled shuffle; svec_shuffle2 is unimplemented here as well.
#define SHUFFLES_L4(STYPE) \
  static FORCEINLINE svec<LANES,STYPE> svec_shuffle(svec<LANES,STYPE> v, svec<LANES,int32_t> index) { \
  INC_STATS_NAME(STATS_OTHER_SLOW, 1, "shuffle"); \
  svec<LANES,STYPE> ret (v[index[0] & 0x3], \
  v[index[1] & 0x3], \
  v[index[2] & 0x3], \
  v[index[3] & 0x3] ); \
  return ret; \
  } \
  static FORCEINLINE svec<LANES,STYPE> svec_shuffle2(svec<LANES,STYPE> v0, svec<LANES,STYPE> v1, svec<LANES,int32_t> index) { \
  svec<LANES,STYPE> ret; \
  NOT_IMPLEMENTED("shuffle 2"); \
  return ret; \
}

// Zero vector. NOTE(review): the NAME parameter is unused and the (0,0,0,0)
// constructor hard-codes 4 lanes despite the generic-looking signature --
// confirm this macro is only instantiated for LANES == 4.
#define ZERO(STYPE, NAME) \
  static FORCEINLINE svec<LANES,STYPE> svec_zero(svec<LANES,STYPE>) { \
  svec<LANES,STYPE> ret(0,0,0,0); \
  return ret; \
  }
560 
//LOAD_CONST
// svec_load_const / svec_load_and_splat: fill every lane with *p (scalar
// fallback; a target with a real splat-load overrides these).
#define LOAD_CONST(STYPE) \
template <class RetVecType> static RetVecType svec_load_const(const STYPE* p); \
template<> \
FORCEINLINE svec<LANES,STYPE> svec_load_const<svec<LANES,STYPE> >(const STYPE* p) { \
  svec<LANES,STYPE> ret; \
  INC_STATS_NAME(STATS_LOAD_SLOW, 1, "load const"); \
  for (int i = 0; i < LANES; ++i) { ret[i] = *p; }\
  return ret; \
} \
template <class RetVecType> static RetVecType svec_load_and_splat(STYPE* p); \
template<> \
FORCEINLINE svec<LANES,STYPE> svec_load_and_splat<svec<LANES,STYPE> >(STYPE* p) { \
  svec<LANES,STYPE> ret; \
  INC_STATS_NAME(STATS_LOAD_SLOW, 1, "load and splat"); /* was mislabeled "load const" in the trace */ \
  for (int i = 0; i < LANES; ++i) { ret[i] = *p; }\
  return ret; \
}
//Missing gather/scatter's general impl
580 
584 template<typename RetVec, typename RetScalar, typename PTRS, typename MSK>
585 static FORCEINLINE RetVec
586 lGatherGeneral(PTRS ptrs, MSK mask) {
587  RetScalar r[4];
588  if(svec_extract(mask,0)) { r[0] = *((RetScalar*)svec_extract(ptrs, 0));}
589  if(svec_extract(mask,1)) { r[1] = *((RetScalar*)svec_extract(ptrs, 1));}
590  if(svec_extract(mask,2)) { r[2] = *((RetScalar*)svec_extract(ptrs, 2));}
591  if(svec_extract(mask,3)) { r[3] = *((RetScalar*)svec_extract(ptrs, 3));}
592  INC_STATS_NAME(STATS_GATHER_SLOW,1, "Gather general");
593  return RetVec(r[0],r[1],r[2],r[3]);
594 }
595 
/*
 * Masked gather through a vector of pointers (generic loop version); lanes
 * with a clear mask bit are left as default-constructed in ret.
 */
#define GATHER_GENERAL(STYPE, PSTYPE) \
template<> \
FORCEINLINE svec<LANES,STYPE> svec_gather<svec<LANES,STYPE> >(svec<LANES,PSTYPE> ptrs, svec<LANES,bool> mask) { \
  svec<LANES,STYPE> ret;\
  for(int i = 0; i < LANES; ++i) {if(mask[i]){ret[i] = *(STYPE*)(ptrs[i]); } }\
  INC_STATS_NAME(STATS_GATHER_SLOW, 1, "Gather general"); /* trace label fixed: was "Gather genera" */ \
  return ret; \
}

// 4-lane variant: forwards to lGatherGeneral.
#define GATHER_GENERAL_L4(STYPE, PSTYPE) \
template<> \
FORCEINLINE svec<LANES,STYPE> svec_gather<svec<LANES,STYPE> >(svec<LANES,PSTYPE> ptrs, svec<LANES,bool> mask) { \
  return lGatherGeneral<svec<LANES,STYPE>, STYPE, svec<LANES,PSTYPE>, svec<LANES,bool> >(ptrs, mask); \
}
622 
/*
 * 4-lane helper behind GATHER_BASE_OFFSETS_L4: loads every lane from
 * p + scale*offset, but first forces the offsets of masked-off lanes to 0
 * via svec_select, so those lanes read from p itself instead of an
 * arbitrary address. NOTE(review): this still dereferences p even when the
 * mask is all-false -- p must point to readable memory.
 * NOTE(review): 'mask.v' (the mask's raw member) is what svec_select
 * accepts here -- confirm against the svec<LANES,bool> class definition.
 */
template<typename RetVec, typename RetScalar, typename OFF, typename MSK>
static FORCEINLINE RetVec
lGatherBaseOffsets(unsigned char *p, uint32_t scale,
                   OFF offsets, MSK mask) {
  RetScalar r[4];
  OFF vzero(0);

  offsets = svec_select(mask.v, offsets, vzero);
  r[0] = *(RetScalar *)(p + scale * svec_extract(offsets, 0));
  r[1] = *(RetScalar *)(p + scale * svec_extract(offsets, 1));
  r[2] = *(RetScalar *)(p + scale * svec_extract(offsets, 2));
  r[3] = *(RetScalar *)(p + scale * svec_extract(offsets, 3));
  INC_STATS_NAME(STATS_GATHER_SLOW,1, "Gather offset with select");
  return RetVec(r[0], r[1], r[2], r[3]);
}

// Masked gather from base + scale*offset addresses (generic loop version);
// masked-off lanes keep ret's default-constructed contents.
#define GATHER_BASE_OFFSETS(STYPE, OSTYPE) \
FORCEINLINE svec<LANES,STYPE> svec_gather_base_offsets(STYPE* b, uint32_t scale, svec<LANES,OSTYPE> offsets, svec<LANES,bool> mask) { \
  svec<LANES,STYPE> ret;\
  for(int i = 0; i < LANES; ++i) {if(mask[i]){ret[i] = *(STYPE*)((uint8_t*)b + scale * offsets[i]);} }\
  INC_STATS_NAME(STATS_GATHER_SLOW,1, "Gather offset with select"); \
  return ret; \
}

// 4-lane variant: forwards to lGatherBaseOffsets.
#define GATHER_BASE_OFFSETS_L4(STYPE, OSTYPE) \
FORCEINLINE svec<LANES,STYPE> svec_gather_base_offsets(STYPE* b, uint32_t scale, svec<LANES,OSTYPE> offsets, svec<LANES,bool> mask) { \
  return lGatherBaseOffsets<svec<LANES,STYPE>, STYPE, svec<LANES,OSTYPE>, svec<LANES,bool> >((uint8_t*)b, scale, offsets, mask); \
}
662 
/*
 * Gather LANES elements starting at b+o, stepping s elements per lane
 * (generic loop version).
 */
#define GATHER_STRIDE(STYPE, OSTYPE) \
template <class RetVecType> static RetVecType svec_gather_stride(STYPE* b, OSTYPE o, OSTYPE s); \
template<> \
FORCEINLINE svec<LANES,STYPE> svec_gather_stride<svec<LANES,STYPE> >(STYPE* b, OSTYPE o, OSTYPE s) { \
  svec<LANES,STYPE> ret; \
  b += o; \
  for(int i = 0; i < LANES; ++i, b+=s) { \
  ret[i] = *b; \
  } \
  INC_STATS_NAME(STATS_GATHER_SLOW,1, "Gather Steps"); \
  return ret; \
}

/*
 * 4-lane strided gather. Offset and stride are deliberately widened to
 * int64_t; stride2 is now kept in int64_t too -- it was declared OSTYPE,
 * which truncated the widened value straight back to 32 bits when
 * OSTYPE == int32_t, defeating the widening.
 */
#define GATHER_STRIDE_L4(STYPE, OSTYPE) \
template <class RetVecType> static RetVecType svec_gather_stride(STYPE* b, OSTYPE o, OSTYPE s); \
template<> \
FORCEINLINE svec<LANES,STYPE> svec_gather_stride<svec<LANES,STYPE> >(STYPE* b, OSTYPE o, OSTYPE s) { \
  int64_t off = (int64_t)o; int64_t stride = (int64_t)s;\
  int64_t stride2 = stride * 2; \
  STYPE v0 = *(b + off); \
  STYPE v1 = *(b + off + stride); \
  STYPE v2 = *(b + off + stride2); \
  STYPE v3 = *(b + off + stride2 + stride); \
  return svec<LANES,STYPE>(v0, v1, v2, v3); \
}
694 
695 //#define GATHER_STRIDE_L4(STYPE, OSTYPE) \
696 //FORCEINLINE svec<LANES,STYPE> svec_gather_stride(STYPE* b, OSTYPE o, OSTYPE s) { \
697 // int64_t off = (int64_t)o; int64_t stride = (int64_t)s;\
698 // b += off; STYPE v0 = *b; \
699 // b += stride; STYPE v1 = *b; \
700 // b += stride; STYPE v2 = *b; \
701 // b += stride; STYPE v3 = *b; \
702 // return svec<LANES,STYPE>(v0, v1, v2, v3); \
703 //}
704 
/*
 * Scatter LANES elements starting at b+o, stepping s elements per lane
 * (generic loop version).
 */
#define SCATTER_STRIDE(STYPE, OSTYPE) \
FORCEINLINE void svec_scatter_stride(STYPE* b, OSTYPE o, OSTYPE s, svec<LANES,STYPE> val) { \
  b += o; \
  for(int i = 0; i < LANES; ++i, b+=s) { \
  *b = svec_extract(val, i); \
  }\
  INC_STATS_NAME(STATS_SCATTER_SLOW,1, "scatter stride general svec<LANES,"#STYPE">"); \
}


/*
 * 4-lane strided scatter. stride2 is now int64_t: it was declared OSTYPE,
 * truncating the deliberately widened stride back to 32 bits when
 * OSTYPE == int32_t (same fix as GATHER_STRIDE_L4).
 */
#define SCATTER_STRIDE_L4(STYPE, OSTYPE) \
FORCEINLINE void svec_scatter_stride(STYPE* b, OSTYPE o, OSTYPE s, svec<LANES,STYPE> val) { \
  int64_t off = (int64_t)o; int64_t stride = (int64_t)s;\
  int64_t stride2 = stride * 2; \
  *(b + off) = svec_extract(val, 0); \
  *(b + off + stride) = svec_extract(val, 1); \
  *(b + off + stride2) = svec_extract(val, 2); \
  *(b + off + stride2 + stride) = svec_extract(val, 3); \
}
724 
725 //#define SCATTER_STRIDE_L4(STYPE, OSTYPE) \
726 //FORCEINLINE void svec_scatter_stride(STYPE* b, OSTYPE o, OSTYPE s, svec<LANES,STYPE> val) { \
727 // int64_t off = (int64_t)o; int64_t stride = (int64_t)s;\
728 // b += off; *b = svec_extract(val, 0);\
729 // b += stride; *b = svec_extract(val, 1); \
730 // b += stride; *b = svec_extract(val, 2); \
731 // b += stride; *b = svec_extract(val, 3); \
732 //}
733 
734 
735 
// Masked scatter through a vector of pointers (generic loop version); only
// lanes whose mask bit is set are written.
#define SCATTER_GENERAL(STYPE, PSTYPE) \
static FORCEINLINE void svec_scatter(svec<LANES,PSTYPE> ptrs, svec<LANES,STYPE> val, svec<LANES,bool> mask) { \
  for(int i = 0; i < LANES; ++i) { if(mask[i]){ *((STYPE*)ptrs[i]) = val[i];} } \
  INC_STATS_NAME(STATS_SCATTER_SLOW,1, "scatter general svec<LANES,"#STYPE">"); \
}


// 4-lane helper behind SCATTER_GENERAL_L4: unrolled masked stores through
// the extracted per-lane pointers.
template<typename STYPE, typename PTRTYPE, typename VTYPE, typename MTYPE>
static FORCEINLINE void lScatterGeneral(PTRTYPE ptrs,
                                        VTYPE val, MTYPE mask) {
  if(svec_extract(mask,0)) { *((STYPE*)svec_extract(ptrs, 0)) = val[0]; }
  if(svec_extract(mask,1)) { *((STYPE*)svec_extract(ptrs, 1)) = val[1]; }
  if(svec_extract(mask,2)) { *((STYPE*)svec_extract(ptrs, 2)) = val[2]; }
  if(svec_extract(mask,3)) { *((STYPE*)svec_extract(ptrs, 3)) = val[3]; }
  INC_STATS_NAME(STATS_SCATTER_SLOW,1, "scatter general");
}

// 4-lane masked scatter: forwards to lScatterGeneral.
#define SCATTER_GENERAL_L4(STYPE, PSTYPE) \
static FORCEINLINE void svec_scatter(svec<LANES,PSTYPE> ptrs, svec<LANES,STYPE> val, svec<LANES,bool> mask) { \
  lScatterGeneral<STYPE, svec<LANES,PSTYPE>, svec<LANES,STYPE>, svec<LANES,bool> >(ptrs, val, mask); \
}
760 
761 
// Masked scatter to base + scale*offset addresses (generic loop version);
// only lanes whose mask bit is set are written.
#define SCATTER_BASE_OFFSETS(STYPE, OSTYPE) \
FORCEINLINE void svec_scatter_base_offsets(STYPE* b, uint32_t scale, svec<LANES,OSTYPE> offsets, svec<LANES,STYPE> val, svec<LANES,bool> mask) { \
  for(int i=0;i<LANES;++i){if(mask[i]){*(STYPE*)((uint8_t*)b + scale * offsets[i]) = val[i];}}\
  INC_STATS_NAME(STATS_SCATTER_SLOW,1,"scatter offset svec<LANES,"#STYPE">"); \
}
770 
774 template<typename STYPE, typename OTYPE, typename VTYPE, typename MTYPE>
775 static FORCEINLINE void lScatterBaseOffsets(unsigned char *b,
776  uint32_t scale, OTYPE offsets,
777  VTYPE val, MTYPE mask) {
778  unsigned char *base = b;
779  if(svec_extract(mask,0)) { *(STYPE*)(b + scale * svec_extract(offsets, 0)) = val[0]; }
780  if(svec_extract(mask,1)) { *(STYPE*)(b + scale * svec_extract(offsets, 1)) = val[1]; }
781  if(svec_extract(mask,2)) { *(STYPE*)(b + scale * svec_extract(offsets, 2)) = val[2]; }
782  if(svec_extract(mask,3)) { *(STYPE*)(b + scale * svec_extract(offsets, 3)) = val[3]; }
783  INC_STATS_NAME(STATS_SCATTER_SLOW,1, "scatter offset");
784 }
785 
// 4-lane masked scatter to base + scale*offset: forwards to
// lScatterBaseOffsets.
#define SCATTER_BASE_OFFSETS_L4(STYPE, OSTYPE) \
FORCEINLINE void svec_scatter_base_offsets(STYPE* b, uint32_t scale, svec<LANES,OSTYPE> offsets, svec<LANES,STYPE> val, svec<LANES,bool> mask) { \
  lScatterBaseOffsets<STYPE, svec<LANES,OSTYPE>, svec<LANES,STYPE>, svec<LANES,bool> >((uint8_t*)b, scale, offsets, val, mask); \
}
793 
794 
795 
796 
// 4-lane masked load/store, built on the base+offsets gather/scatter with
// element offsets {0,1,2,3} and scale = sizeof(STYPE).
#define MASKED_LOAD_STORE_L4(STYPE) \
static FORCEINLINE svec<LANES,STYPE> svec_masked_load(svec<LANES,STYPE> *p, svec<LANES,bool> mask) { \
  return svec_gather_base_offsets((STYPE*)p, sizeof(STYPE), svec<LANES,int32_t>(0,1,2,3), mask); \
} \
static FORCEINLINE void svec_masked_store(svec<LANES,STYPE> *p, svec<LANES,STYPE> v, svec<LANES,bool> mask) { \
  svec_scatter_base_offsets((STYPE*)p, sizeof(STYPE), svec<LANES,int32_t>(0,1,2,3), v, mask); \
}

// 8-lane variant with element offsets {0..7}.
#define MASKED_LOAD_STORE_L8(STYPE) \
static FORCEINLINE svec<LANES,STYPE> svec_masked_load(svec<LANES,STYPE> *p, svec<LANES,bool> mask) { \
  return svec_gather_base_offsets((STYPE*)p, sizeof(STYPE), svec<LANES,int32_t>(0,1,2,3,4,5,6,7), mask); \
} \
static FORCEINLINE void svec_masked_store(svec<LANES,STYPE> *p, svec<LANES,STYPE> v, svec<LANES,bool> mask) { \
  svec_scatter_base_offsets((STYPE*)p, sizeof(STYPE), svec<LANES,int32_t>(0,1,2,3,4,5,6,7), v, mask); \
}
812 
813 
814 
816 //
817 // Mask type (i1) interfaces
818 //
820 
821 
822 
824 //
825 // General data operation interfaces
826 //
828 
829 template<class T> static FORCEINLINE T abs(T a) {
830  return a >= 0 ? a : -a;
831 }
832 
// Lane-wise unary function application: ret[i] = OP(v[i]).
#define UNARY_OP(STYPE, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> v) { \
  INC_STATS_NAME(STATS_UNARY_SLOW, 1, #OP); \
  svec<LANES,STYPE> ret; \
  for (int i = 0; i < LANES; ++i) { ret[i] = OP(v[i]); } \
  return ret; \
}

// 4-lane unrolled unary application via the 4-argument constructor.
#define UNARY_OP_L4(STYPE, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> v) { \
  INC_STATS_NAME(STATS_UNARY_SLOW, 1, #OP); \
  return svec<LANES,STYPE>(OP(svec_extract(v, 0)),\
  OP(svec_extract(v, 1)),\
  OP(svec_extract(v, 2)),\
  OP(svec_extract(v, 3)));\
}

// Lane-wise binary operator: ret[i] = a[i] OP b[i].
#define BINARY_OP(STYPE, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
  INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
  svec<LANES,STYPE> ret; \
  for (int i = 0; i < LANES; ++i) { ret[i] = a[i] OP b[i]; } \
  return ret; \
}

// Same, but the second operand vector has a different element type (STYPE2),
// e.g. shifts by an unsigned count vector.
#define BINARY_OP2(STYPE, STYPE2, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE2> b) { \
  INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
  svec<LANES,STYPE> ret; \
  for (int i = 0; i < LANES; ++i) { ret[i] = a[i] OP b[i]; } \
  return ret; \
}

// Lane-wise binary function: ret[i] = FUNC(a[i], b[i]).
#define BINARY_OP_FUNC(STYPE, NAME, FUNC) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
  INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
  svec<LANES,STYPE> ret; \
  for (int i = 0; i < LANES; ++i) { ret[i] = FUNC(a[i], b[i]); } \
  return ret; \
}
876 
// 4-lane unrolled binary operator (vector OP vector).
#define BINARY_OP_L4(STYPE, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
  INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
  svec<LANES,STYPE> ret(svec_extract(a, 0) OP svec_extract(b, 0),\
  svec_extract(a, 1) OP svec_extract(b, 1),\
  svec_extract(a, 2) OP svec_extract(b, 2),\
  svec_extract(a, 3) OP svec_extract(b, 3));\
  return ret; \
}

// 4-lane unrolled binary operator with a second operand of element type
// STYPE2 (e.g. shift counts).
#define BINARY_OP2_L4(STYPE, STYPE2, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE2> b) { \
  INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
  svec<LANES,STYPE> ret(svec_extract(a, 0) OP svec_extract(b, 0),\
  svec_extract(a, 1) OP svec_extract(b, 1),\
  svec_extract(a, 2) OP svec_extract(b, 2),\
  svec_extract(a, 3) OP svec_extract(b, 3));\
  return ret; \
}


// 4-lane unrolled binary function application: ret[i] = FUNC(a[i], b[i]).
#define BINARY_OP_FUNC_L4(STYPE, NAME, FUNC) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
  INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
  svec<LANES,STYPE> ret(FUNC(svec_extract(a, 0), svec_extract(b, 0)),\
  FUNC(svec_extract(a, 1), svec_extract(b, 1)),\
  FUNC(svec_extract(a, 2), svec_extract(b, 2)),\
  FUNC(svec_extract(a, 3), svec_extract(b, 3))); \
  return ret; \
}

// 4-lane unrolled vector-OP-scalar, scalar of type STYPE2 on the right.
#define BINARY_OP_SCALAR_L4(STYPE, STYPE2, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, STYPE2 s) { \
  INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
  svec<LANES,STYPE> ret(svec_extract(a, 0) OP s,\
  svec_extract(a, 1) OP s,\
  svec_extract(a, 2) OP s,\
  svec_extract(a, 3) OP s);\
  return ret; \
}

// Generic vector-OP-scalar, scalar on the right: ret[i] = a[i] OP s.
#define BINARY_OP_SCALAR(STYPE, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, STYPE s) { \
  INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
  svec<LANES,STYPE> ret; \
  for (int i = 0; i < LANES; ++i) { ret[i] = a[i] OP s; } \
  return ret; \
}

// Shift-by-scalar variant: the scalar count has its own type SHTTYPE.
#define BINARY_SHT_SCALAR(STYPE, SHTTYPE, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, SHTTYPE s) { \
  INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
  svec<LANES,STYPE> ret; \
  for (int i = 0; i < LANES; ++i) { ret[i] = a[i] OP s; } \
  return ret; \
}

// Scalar-OP-vector, scalar on the left: ret[i] = s OP a[i].
#define BINARY_SCALAR_OP(STYPE, NAME, OP) \
static FORCEINLINE svec<LANES,STYPE> NAME(STYPE s, svec<LANES,STYPE> a) { \
  INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
  svec<LANES,STYPE> ret; \
  for (int i = 0; i < LANES; ++i) { ret[i] = s OP a[i]; }\
  return ret; \
}
957 
// Fused-style ternary helpers, per lane: madd = a*b+c, msub = a*b-c,
// nmsub = -(a*b-c). (The public macro name's "TERNERY" spelling is kept for
// compatibility with existing users.)
#define TERNERY(STYPE) \
  \
FORCEINLINE svec<LANES,STYPE> svec_madd(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
  svec<LANES,STYPE> res; \
  for(int i = 0; i < LANES; ++i) { res[i] = a[i]*b[i]+c[i]; } \
  return res; \
} \
  \
FORCEINLINE svec<LANES,STYPE> svec_msub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
  svec<LANES,STYPE> res; \
  for(int i = 0; i < LANES; ++i) { res[i] = a[i]*b[i]-c[i]; } \
  return res; \
} \
  \
FORCEINLINE svec<LANES,STYPE> svec_nmsub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
  svec<LANES,STYPE> res; \
  for(int i = 0; i < LANES; ++i) { res[i] = -(a[i]*b[i]-c[i]); } \
  return res; \
}

// 4-lane unrolled ternary helpers (same semantics as TERNERY).
#define TERNERY_L4(STYPE) \
  \
FORCEINLINE svec<LANES,STYPE> svec_madd(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
  svec<LANES,STYPE> ret(svec_extract(a, 0) * svec_extract(b, 0) + svec_extract(c, 0),\
  svec_extract(a, 1) * svec_extract(b, 1) + svec_extract(c, 1),\
  svec_extract(a, 2) * svec_extract(b, 2) + svec_extract(c, 2),\
  svec_extract(a, 3) * svec_extract(b, 3) + svec_extract(c, 3));\
  return ret; \
} \
  \
FORCEINLINE svec<LANES,STYPE> svec_msub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
  svec<LANES,STYPE> ret(svec_extract(a, 0) * svec_extract(b, 0) - svec_extract(c, 0),\
  svec_extract(a, 1) * svec_extract(b, 1) - svec_extract(c, 1),\
  svec_extract(a, 2) * svec_extract(b, 2) - svec_extract(c, 2),\
  svec_extract(a, 3) * svec_extract(b, 3) - svec_extract(c, 3));\
  return ret; \
} \
  \
FORCEINLINE svec<LANES,STYPE> svec_nmsub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
  svec<LANES,STYPE> ret(- (svec_extract(a, 0) * svec_extract(b, 0) - svec_extract(c, 0)),\
  - (svec_extract(a, 1) * svec_extract(b, 1) - svec_extract(c, 1)),\
  - (svec_extract(a, 2) * svec_extract(b, 2) - svec_extract(c, 2)),\
  - (svec_extract(a, 3) * svec_extract(b, 3) - svec_extract(c, 3)));\
  return ret; \
}
1015 
1016 
1017 
1018 // 5. Max/Min
1019 
1020 //add/max/min
1021 template<class T> static FORCEINLINE T add(T a, T b) {
1022  return a+b;
1023 }
1024 template<class T> static FORCEINLINE T max(T a, T b) {
1025  return a > b ? a : b;
1026 }
1027 template<class T> static FORCEINLINE T min(T a, T b) {
1028  return a < b ? a : b;
1029 }
1030 
// Defines a horizontal reduction NAME over all LANES of an svec, folding
// left-to-right with the scalar helper FUNC (add / max / min).
#define BINARY_OP_REDUCE_FUNC(STYPE, NAME, FUNC) \
static FORCEINLINE STYPE NAME(svec<LANES,STYPE> a) { \
    INC_STATS_NAME(STATS_OTHER_SLOW, 1, "reduce"); \
    STYPE acc = a[0]; \
    for (int lane = 1; lane < LANES; ++lane) { acc = FUNC(acc, a[lane]); } \
    return acc; \
}
1038 
// 4-lane unrolled horizontal reduction. The staged temporaries keep the same
// left-to-right fold order as the generic version, which matters for
// floating-point rounding.
#define BINARY_OP_REDUCE_FUNC_L4(STYPE, NAME, FUNC) \
static FORCEINLINE STYPE NAME(svec<LANES,STYPE> a) { \
    INC_STATS_NAME(STATS_OTHER_SLOW, 1, "reduce"); \
    STYPE acc01 = FUNC(a[0], a[1]); \
    STYPE acc012 = FUNC(acc01, a[2]); \
    return FUNC(acc012, a[3]); \
}
1044 
1045 // 7. Compare
1049 #define CMP_OP(STYPE, NAME, OP) \
1050 static FORCEINLINE svec<LANES,bool> svec_##NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
1051  INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
1052  svec<LANES,bool> ret; \
1053  for (int i = 0; i < LANES; ++i) { ret[i] = a[i] OP b[i]; } \
1054  return ret; \
1055 }
1056 
// 4-lane unrolled element-wise comparison producing a boolean mask vector;
// each lane result is widened to uint32_t before constructing the mask.
#define CMP_OP_L4(STYPE, NAME, OP) \
  static FORCEINLINE svec<LANES,bool> svec_##NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
    INC_STATS_NAME(STATS_COMPARE_SLOW, 1, #NAME); \
    return svec<LANES,bool>((uint32_t)(a[0] OP b[0]), \
                            (uint32_t)(a[1] OP b[1]), \
                            (uint32_t)(a[2] OP b[2]), \
                            (uint32_t)(a[3] OP b[3])); \
  }
1066 
// Masked comparison: compute the element-wise comparison, then AND with the
// execution mask so inactive lanes always yield false.
#define CMP_MASKED_OP(STYPE, NAME, OP) \
FORCEINLINE svec<LANES,bool> svec_masked_##NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b, \
                                                svec<LANES,bool> mask) { \
    svec<LANES,bool> cmp = svec_##NAME(a, b); \
    return svec_and(cmp, mask); \
}
1079 
1080 
1081 
1082 
// Instantiates the six unmasked comparisons (==, !=, <, <=, >, >=) for STYPE
// using the generic per-lane loop implementation (CMP_OP).
#define CMP_ALL_NOMASK_OP(STYPE) \
  CMP_OP(STYPE, equal, ==) \
  CMP_OP(STYPE, not_equal, !=) \
  CMP_OP(STYPE, less_than, <) \
  CMP_OP(STYPE, less_equal, <=) \
  CMP_OP(STYPE, greater_than, >) \
  CMP_OP(STYPE, greater_equal, >=)

// Instantiates the six unmasked comparisons for STYPE using the 4-lane
// unrolled implementation (CMP_OP_L4).
#define CMP_ALL_NOMASK_OP_L4(STYPE) \
  CMP_OP_L4(STYPE, equal, ==) \
  CMP_OP_L4(STYPE, not_equal, !=) \
  CMP_OP_L4(STYPE, less_than, <) \
  CMP_OP_L4(STYPE, less_equal, <=) \
  CMP_OP_L4(STYPE, greater_than, >) \
  CMP_OP_L4(STYPE, greater_equal, >=)

// Instantiates the six masked comparisons (svec_masked_*); each forwards to
// the unmasked svec_* comparison and ANDs the result with the mask.
#define CMP_ALL_MASKED_OP(STYPE) \
  CMP_MASKED_OP(STYPE, equal, ==) \
  CMP_MASKED_OP(STYPE, not_equal, !=) \
  CMP_MASKED_OP(STYPE, less_than, <) \
  CMP_MASKED_OP(STYPE, less_equal, <=) \
  CMP_MASKED_OP(STYPE, greater_than, >) \
  CMP_MASKED_OP(STYPE, greater_equal, >=)

// Instantiates both the unmasked (generic loop) and masked comparison sets.
#define CMP_ALL_OP(STYPE) \
  CMP_ALL_NOMASK_OP(STYPE) \
  CMP_ALL_MASKED_OP(STYPE)
1110 
1111 // 8. Cast
// Declares the svec_cast primary template for SFROM and specializes it for
// STO: each lane is value-converted with a (STO) cast in a per-lane loop.
#define CAST(SFROM, STO) \
template <class T> static T svec_cast(svec<LANES,SFROM> val); \
template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
    INC_STATS_NAME(STATS_CAST_SLOW, 1, "svec<LANES,"#SFROM">-svec<LANES,"#STO">"); \
    svec<LANES,STO> ret; \
    for (int lane = 0; lane < LANES; ++lane) { ret[lane] = (STO)val[lane]; } \
    return ret; \
}
1123 
// 4-lane unrolled svec_cast specialization: value-converts each lane with a
// (STO) cast and builds the result in one constructor call.
#define CAST_L4(SFROM, STO) \
template <class T> static T svec_cast(svec<LANES,SFROM> val); \
template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
    INC_STATS_NAME(STATS_CAST_SLOW, 1, "svec<LANES,"#SFROM">-svec<LANES,"#STO">"); \
    svec<LANES,STO> ret((STO)val[0], (STO)val[1], (STO)val[2], (STO)val[3]); \
    return ret; \
}
1133 
// Scratch union used by CAST_BITS for bit-level reinterpretation between
// same-width scalar types: write one member, read another. All members share
// offset zero, so declaration order is irrelevant.
typedef union {
    /* 32-bit views */
    float f;
    int32_t i32;
    uint32_t u32;
    /* 64-bit views */
    double d;
    int64_t i64;
    uint64_t u64;
} BitcastUnion;
1142 
// Declares/specializes svec_cast_bits: reinterprets each lane's bit pattern
// from SFROM (written through union member FROM_F) as STO (read back through
// member TO_F). A fresh scratch union per iteration replaces the original
// per-lane array; the lane-by-lane result is identical.
#define CAST_BITS(SFROM, FROM_F, STO, TO_F) \
template <class T> static T svec_cast_bits(svec<LANES,SFROM> val); \
template <> FORCEINLINE svec<LANES,STO> svec_cast_bits<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
    INC_STATS_NAME(STATS_CAST_SLOW, 1, "svec<LANES,"#SFROM">-svec<LANES,"#STO">"); \
    svec<LANES,STO> ret; \
    for (int lane = 0; lane < LANES; ++lane) { \
        BitcastUnion u; \
        u.FROM_F = val[lane]; \
        ret[lane] = u.TO_F; \
    } \
    return ret; \
}
1152 
1153 
1155 //
1156 // Class operations based on the above interfaces
1157 //
1159 
// Defines both operator[] overloads for svec<LANES,STYPE> over the internal
// array v. The non-const overload returns a mutable reference (usable as an
// lvalue), so it is counted under the "insert" statistic even for reads; the
// const overload returns by value and is not counted.
// NOTE(review): the `const STYPE` by-value return type looks redundant but
// must match the in-class declaration (outside this view) — do not change it.
#define SUBSCRIPT_FUNC_IMPL(STYPE) \
FORCEINLINE STYPE& svec<LANES,STYPE>::operator[](int index) { \
  INC_STATS_NAME(STATS_INSERT, 1, "insert "#STYPE); \
  return v[index]; \
} \
const FORCEINLINE STYPE svec<LANES,STYPE>::operator[](int index) const { \
  return v[index]; \
}
1168 
1169 
// Defines the member comparison operators for svec<LANES,STYPE>; each one
// forwards to the corresponding free svec_* comparison and yields a boolean
// mask vector.
#define VEC_CMP_IMPL(STYPE) \
  FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator==(svec<LANES,STYPE> rhs) { return svec_equal(*this, rhs); } \
  FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator!=(svec<LANES,STYPE> rhs) { return svec_not_equal(*this, rhs); } \
  FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator<(svec<LANES,STYPE> rhs) { return svec_less_than(*this, rhs); } \
  FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator<=(svec<LANES,STYPE> rhs) { return svec_less_equal(*this, rhs); } \
  FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator>(svec<LANES,STYPE> rhs) { return svec_greater_than(*this, rhs); } \
  FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator>=(svec<LANES,STYPE> rhs) { return svec_greater_equal(*this, rhs); }
1212 
// Defines the unary member operations for svec<LANES,STYPE>: negation plus
// the three horizontal reductions (add/max/min), each forwarding to the
// corresponding free svec_* function.
#define VEC_UNARY_IMPL(STYPE) \
\
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator-() {return svec_neg(*this); } \
\
  FORCEINLINE STYPE svec<LANES,STYPE>::reduce_add() {return svec_reduce_add(*this); } \
\
  FORCEINLINE STYPE svec<LANES,STYPE>::reduce_max() {return svec_reduce_max(*this); } \
\
  FORCEINLINE STYPE svec<LANES,STYPE>::reduce_min() {return svec_reduce_min(*this); }
1230 
1231 
// Defines the arithmetic operators (+, -, *, /) for svec<LANES,STYPE>:
// vector-vector and vector-scalar members plus the free scalar-vector
// overloads, all forwarding to the corresponding svec_* primitives.
#define VEC_BIN_IMPL(STYPE) \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator+(svec<LANES,STYPE> rhs) { return svec_add(*this, rhs); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator+(STYPE scalar) { return svec_add_scalar(*this, scalar); } \
  FORCEINLINE svec<LANES,STYPE> operator+(STYPE scalar, svec<LANES,STYPE> vec) { return svec_scalar_add(scalar, vec); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator-(svec<LANES,STYPE> rhs) { return svec_sub(*this, rhs); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator-(STYPE scalar) { return svec_sub_scalar(*this, scalar); } \
  FORCEINLINE svec<LANES,STYPE> operator-(STYPE scalar, svec<LANES,STYPE> vec) { return svec_scalar_sub(scalar, vec); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator*(svec<LANES,STYPE> rhs) { return svec_mul(*this, rhs); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator*(STYPE scalar) { return svec_mul_scalar(*this, scalar); } \
  FORCEINLINE svec<LANES,STYPE> operator*(STYPE scalar, svec<LANES,STYPE> vec) { return svec_scalar_mul(scalar, vec); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator/(svec<LANES,STYPE> rhs) { return svec_div(*this, rhs); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator/(STYPE scalar) { return svec_div_scalar(*this, scalar); } \
  FORCEINLINE svec<LANES,STYPE> operator/(STYPE scalar, svec<LANES,STYPE> vec) { return svec_scalar_div(scalar, vec); }

1282 
1285  #define MVEC_CLASS_METHOD_IMPL(STYPE) \
1286 \
1292  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::load(svec<LANES,STYPE>* p){ return svec_load(p); } \
1293 \
1298  FORCEINLINE void svec<LANES,STYPE>::store(svec<LANES,STYPE>* p){ svec_store(p, *this); }
1299 
1300 
// Defines the full set of memory/shuffle member functions for
// svec<LANES,STYPE>: plain and masked load/store, gathers/scatters
// (pointer-vector, base+scale*offset, and strided addressing),
// broadcast/rotate/shuffle and abs. Every member simply forwards to the
// corresponding free svec_* primitive.
#define VEC_CLASS_METHOD_IMPL(STYPE) \
  MVEC_CLASS_METHOD_IMPL(STYPE); \
  /* masked memory access */ \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::masked_load(svec<LANES,STYPE>* ptr, svec<LANES,bool> mask) { return svec_masked_load(ptr, mask); } \
  FORCEINLINE void svec<LANES,STYPE>::masked_store(svec<LANES,STYPE>* ptr, svec<LANES,bool> mask) { svec_masked_store(ptr, *this, mask); } \
  VEC_UNARY_IMPL(STYPE); \
  VEC_BIN_IMPL(STYPE); \
  /* specialized loads */ \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::load_const(const STYPE* ptr) { return svec_load_const<svec<LANES,STYPE> >(ptr); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::load_and_splat(STYPE* ptr) { return svec_load_and_splat<svec<LANES,STYPE> >(ptr); } \
  /* gather/scatter through a vector of pointers */ \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::gather(svec<LANES,void*> ptrs, svec<LANES,bool> mask) { return svec_gather<svec<LANES,STYPE> >(ptrs, mask); } \
  FORCEINLINE void svec<LANES,STYPE>::scatter(svec<LANES,void*> ptrs, svec<LANES,bool> mask) { svec_scatter(ptrs, *this, mask); } \
  /* gather/scatter addressed as base + scale*offset, 32- and 64-bit offsets */ \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::gather_base_offsets(STYPE* base, uint32_t scale, svec<LANES,int32_t> offsets, svec<LANES,bool> mask) { \
    return svec_gather_base_offsets(base, scale, offsets, mask); \
  } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::gather_base_offsets(STYPE* base, uint32_t scale, svec<LANES,int64_t> offsets, svec<LANES,bool> mask) { \
    return svec_gather_base_offsets(base, scale, offsets, mask); \
  } \
  FORCEINLINE void svec<LANES,STYPE>::scatter_base_offsets(STYPE* base, uint32_t scale, svec<LANES,int32_t> offsets, svec<LANES,bool> mask) { \
    svec_scatter_base_offsets(base, scale, offsets, *this, mask); \
  } \
  FORCEINLINE void svec<LANES,STYPE>::scatter_base_offsets(STYPE* base, uint32_t scale, svec<LANES,int64_t> offsets, svec<LANES,bool> mask) { \
    svec_scatter_base_offsets(base, scale, offsets, *this, mask); \
  } \
  /* strided gather/scatter, 32- and 64-bit offset/stride */ \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::gather_stride(STYPE* base, int32_t off, int32_t stride) { \
    return svec_gather_stride<svec<LANES,STYPE> >(base, off, stride); \
  } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::gather_stride(STYPE* base, int64_t off, int64_t stride) { \
    return svec_gather_stride<svec<LANES,STYPE> >(base, off, stride); \
  } \
  FORCEINLINE void svec<LANES,STYPE>::scatter_stride(STYPE* base, int32_t off, int32_t stride) { \
    svec_scatter_stride(base, off, stride, *this); \
  } \
  FORCEINLINE void svec<LANES,STYPE>::scatter_stride(STYPE* base, int64_t off, int64_t stride) { \
    svec_scatter_stride(base, off, stride, *this); \
  } \
  /* lane rearrangement and absolute value */ \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::broadcast(int32_t index) { return svec_broadcast(*this, index); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::rotate(int32_t index) { return svec_rotate(*this, index); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::shuffle(svec<LANES,int32_t> index) { return svec_shuffle(*this, index); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::abs() { return svec_abs(*this); }
1393 
// Defines the integer-only member operators for svec<LANES,STYPE>: bitwise
// or/and/xor, shifts (per-lane amounts of type STYPE2, or a single int32_t
// amount applied to all lanes) and remainder (vector or scalar divisor).
// All forward to the corresponding free svec_* primitives.
#define VEC_INT_CLASS_METHOD_IMPL(STYPE, STYPE2) \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator|(svec<LANES,STYPE> rhs) { return svec_or(*this, rhs); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator&(svec<LANES,STYPE> rhs) { return svec_and(*this, rhs); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator^(svec<LANES,STYPE> rhs) { return svec_xor(*this, rhs); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator<<(svec<LANES,STYPE2> amounts) { return svec_shl(*this, amounts); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator<<(int32_t amount) { return svec_shl(*this, amount); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator>>(svec<LANES,STYPE2> amounts) { return svec_shr(*this, amounts); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator>>(int32_t amount) { return svec_shr(*this, amount); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator%(svec<LANES,STYPE> rhs) { return svec_rem(*this, rhs); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator%(STYPE scalar) { return svec_rem(*this, scalar); }
1431 
1432 
// Defines the floating-point-only member functions for svec<LANES,STYPE>:
// rounding (round/floor/ceil), sqrt and its reciprocal forms (rcp/rsqrt),
// exp/log and pow. Each member forwards to the corresponding free svec_*
// function.
#define VEC_FLOAT_CLASS_METHOD_IMPL(STYPE) \
\
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::round() { return svec_round(*this);} \
\
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::floor() { return svec_floor(*this);} \
\
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::ceil() { return svec_ceil(*this);} \
\
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::sqrt() { return svec_sqrt(*this);} \
\
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::rcp() { return svec_rcp(*this);} \
\
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::rsqrt() { return svec_rsqrt(*this);}\
\
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::exp() {return svec_exp(*this);} \
\
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::log() {return svec_log(*this);} \
\
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::pow(svec<LANES,STYPE> a) { return svec_pow(*this, a); }
1470 
1471 
1472 #endif /* GSIMD_UTILITY_H_ */
uint32_t svec1_u32
Definition: gsimd_utility.h:188
void stdout_scalar< uint8_t >(std::ostream &out, uint8_t v)
Definition: gsimd_utility.h:226
Definition: gsimd_utility.h:93
double svec1_d
Definition: gsimd_utility.h:193
int32_t i32
Definition: gsimd_utility.h:1135
void stdout_scalar(std::ostream &out, T v)
Definition: gsimd_utility.h:218
uint64_t u64
Definition: gsimd_utility.h:1139
svec< 4, bool > svec_select(svec< 4, bool > mask, svec< 4, bool > a, svec< 4, bool > b)
construct c by selecting elements from two input vectors according to the mask
Definition: power_vsx4.h:1126
void type
Definition: gsimd_utility.h:94
float svec1_f
Definition: gsimd_utility.h:192
int8_t svec1_i8
Definition: gsimd_utility.h:185
const bool check_lanes< 16 >(int n)
Definition: gsimd_utility.h:236
#define INC_STATS_NAME(stat, inc, opname)
Definition: gsimd_utility.h:156
int32_t svec1_i32
Definition: gsimd_utility.h:189
const bool check_lanes< 2 >(int n)
Definition: gsimd_utility.h:233
const bool check_lanes< 8 >(int n)
Definition: gsimd_utility.h:235
#define DEFINE_TYPE_NAME(type, name)
Definition: gsimd_utility.h:202
int64_t svec1_i64
Definition: gsimd_utility.h:191
const bool check_lanes(int n)
float f
Definition: gsimd_utility.h:1137
uint16_t svec1_u16
Definition: gsimd_utility.h:186
void stdout_scalar< int8_t >(std::ostream &out, int8_t v)
Definition: gsimd_utility.h:222
Definition: gsimd_utility.h:1134
uint8_t svec1_u8
Definition: gsimd_utility.h:184
const char * iu_get_type_name()
int16_t svec1_i16
Definition: gsimd_utility.h:187
uint32_t u32
Definition: gsimd_utility.h:1136
const bool check_lanes< 4 >(int n)
Definition: gsimd_utility.h:234
double d
Definition: gsimd_utility.h:1140
#define FORCEINLINE
Definition: gsimd_utility.h:175
uint64_t svec1_u64
Definition: gsimd_utility.h:190
int64_t i64
Definition: gsimd_utility.h:1138