78 #ifndef GSIMD_UTILITY_H_
79 #define GSIMD_UTILITY_H_
// Debug helper: prints the expression text, a colon, then its value.
83 #define DUMP(v) std::cout << #v << ":" << (v) << std::endl
// Word-size probe: true on targets where unsigned long and unsigned int
// have the same range (ILP32-style). The guarded region is not shown here.
84 #if ((ULONG_MAX) == (UINT_MAX))
// Primary svec<Lanes,T> template machinery. This header supplies slow,
// scalar-loop reference implementations used when no fast intrinsic
// specialization exists for a lane count / element type combination.
92 template <
int Lanes,
class T>
// Optional instrumentation: when ENABLE_STATS (or ENABLE_STATS_AND_TRACE)
// is defined, every slow-path operation bumps a counter so hot spots can
// be located via print_stats().
97 #if defined(ENABLE_STATS) || defined(ENABLE_STATS_AND_TRACE)
99 STATS_MASKED_LOAD = 0,
106 STATS_BINARY_FUNC_SLOW,
// Human-readable names, index-aligned with the stats enum above.
117 static const char* gStatNames[LAST_STATS] = {
// Counters for each stat. NOTE(review): the initializer lists 13 zeros,
// which assumes LAST_STATS == 13 — verify against the enum if it changes.
134 static float gStats[LAST_STATS] = {0,0,0,0,0,0,0,0,0,0,0,0,0};
// Dumps every collected counter to stdout. extern "C" so the symbol is
// easy to call from a debugger or from non-C++ code.
136 extern "C" void print_stats()
138 std::cout <<
" DUMP INTRINSICS STATS" << std::endl;
139 for (
int i=0; i<LAST_STATS; i++) {
141 std::cout <<
" - " << gStatNames[i] <<
": " << gStats[i] << std::endl;
// Stats enabled: INC_STATS bumps a counter; INC_STATS_NAME does the same
// and, under ENABLE_STATS_AND_TRACE, also prints where the slow path hit.
146 #define INC_STATS(stat,inc) gStats[stat] += inc;
147 #ifdef ENABLE_STATS_AND_TRACE
148 #define INC_STATS_NAME(stat,inc,opname) \
149 std::cout << "slow impl of " << opname << " @ "\
150 << __FILE__ << " Line: " << __LINE__ << std::endl; \
// Stats on but tracing off: count only, no per-call output.
153 #define INC_STATS_NAME(stat,inc,opname) gStats[stat] += inc
154 #endif // ENABLE_STATS_AND_TRACE
// Stats disabled entirely: both macros expand to nothing.
156 #define INC_STATS_NAME(stat,inc,opname)
157 #define INC_STATS(stat,inc)
// Runtime warning for operations that have no real implementation yet.
160 #define NOT_IMPLEMENTED(msg) \
161 std::cout << "WARNING: operation " << msg << " is not implemented yet" << std::endl; \
// Compiler portability shims — MSVC-style spellings first.
169 #define FORCEINLINE __forceinline
171 #define POST_ALIGN(x)
// MSVC lacks C99 round/roundf; emulate with floor(x + 0.5).
// NOTE(review): the macro argument is not parenthesized — (x) would be
// safer; also floor(x+.5) differs from round() for negative halfway values.
172 #define roundf(x) (floorf(x + .5f))
173 #define round(x) (floor(x + .5))
// GCC/Clang-style spellings for the same shims.
175 #define FORCEINLINE inline __attribute__((always_inline))
177 #define POST_ALIGN(x) __attribute__ ((aligned(x)))
// Maps a C++ type to a printable name via a specialization of
// iu_get_type_name<T>().
202 #define DEFINE_TYPE_NAME(type, name) \
203 template<> FORCEINLINE const char *iu_get_type_name<type>(){return name;} \
// Element access declarations: mutable reference plus const by-value read.
247 #define SUBSCRIPT_FUNC_DECL(STYPE) \
248 FORCEINLINE STYPE& operator[](int index); \
249 const FORCEINLINE STYPE operator[](int index) const;
// Bool vectors use a proxy ("Helper") object so that v[i] = x can write
// through operator= — the underlying bool storage is defined elsewhere.
251 #define SUBSCRIPT_FUNC_BOOL_DECL(STYPE) \
256 int m_index; svec<LANES,bool> *m_self; \
257 FORCEINLINE Helper(svec<LANES,bool> *p_vec, int index): m_self(p_vec), m_index(index) {} \
258 FORCEINLINE void operator=(STYPE value); \
259 FORCEINLINE void operator=(Helper helper); \
260 FORCEINLINE operator STYPE() const; \
262 FORCEINLINE Helper operator[](int index) { return Helper(this, index);} \
263 const FORCEINLINE STYPE operator[](int index) const;
// ostream printing for bool vectors; values rendered as 0/1.
266 #define COUT_FUNC_BOOL_DECL() \
267 friend std::ostream& operator<< (std::ostream &out, const svec<LANES,bool> &v) { \
268 out << "svec<" << LANES << ",bool> " << "[" << (v[0]?1:0); \
269 for(int i = 1; i < LANES ; i++) { out << ", " << (v[i]?1:0);} \
// ostream printing for 8-bit element types; widened to short so they
// print as numbers rather than characters.
275 #define COUT_FUNC_CHAR_DECL(STYPE) \
276 friend std::ostream& operator<< (std::ostream &out, const svec<LANES,STYPE> &v) { \
277 out << "svec<" << LANES << "," << #STYPE <<"> [" << short(v[0]); \
278 for(int i = 1; i < LANES ; i++) { out << ", " << short(v[i]);} \
// ostream printing for all other element types.
283 #define COUT_FUNC_DECL(STYPE) \
284 friend std::ostream& operator<< (std::ostream &out, const svec<LANES,STYPE> &v) { \
285 out << "svec<" << LANES << "," << #STYPE <<"> [" << v[0]; \
286 for(int i = 1; i < LANES ; i++) { out << ", " << v[i];} \
// In-class declarations of the six comparison operators; each returns a
// bool vector of the same lane count.
296 #define VEC_CMP_DECL(STYPE) \
297 FORCEINLINE svec<LANES,bool> operator==(svec<LANES,STYPE> a); \
298 FORCEINLINE svec<LANES,bool> operator!=(svec<LANES,STYPE> a); \
299 FORCEINLINE svec<LANES,bool> operator<(svec<LANES,STYPE> a); \
300 FORCEINLINE svec<LANES,bool> operator<=(svec<LANES,STYPE> a); \
301 FORCEINLINE svec<LANES,bool> operator>(svec<LANES,STYPE> a); \
302 FORCEINLINE svec<LANES,bool> operator>=(svec<LANES,STYPE> a); \
// Unary negation plus horizontal reductions (add/max/min across lanes).
307 #define VEC_UNARY_DECL(STYPE) \
308 FORCEINLINE svec<LANES,STYPE> operator-(); \
309 FORCEINLINE STYPE reduce_add(); \
310 FORCEINLINE STYPE reduce_max(); \
311 FORCEINLINE STYPE reduce_min();
// Element-wise arithmetic, each with a vector and a scalar right-hand form.
316 #define VEC_BIN_DECL(STYPE) \
317 FORCEINLINE svec<LANES,STYPE> operator+(svec<LANES,STYPE> a); \
318 FORCEINLINE svec<LANES,STYPE> operator+(STYPE s); \
319 FORCEINLINE svec<LANES,STYPE> operator-(svec<LANES,STYPE> a); \
320 FORCEINLINE svec<LANES,STYPE> operator-(STYPE s); \
321 FORCEINLINE svec<LANES,STYPE> operator*(svec<LANES,STYPE> a); \
322 FORCEINLINE svec<LANES,STYPE> operator*(STYPE s); \
323 FORCEINLINE svec<LANES,STYPE> operator/(svec<LANES,STYPE> a); \
324 FORCEINLINE svec<LANES,STYPE> operator/(STYPE s);
// Full method set for the bool (mask) vector: comparisons, load/store,
// any/all/none reductions, and bitwise/logical operators.
330 #define SVEC_BOOL_CLASS_METHOD_DECL() \
331 FORCEINLINE svec<LANES,bool> operator==(svec<LANES,bool> a); \
332 FORCEINLINE svec<LANES,bool> operator!=(svec<LANES,bool> a); \
333 static FORCEINLINE svec<LANES,bool> load(svec<LANES,bool>* p); \
334 FORCEINLINE void store(svec<LANES,bool>* p); \
335 FORCEINLINE bool any_true(); \
336 FORCEINLINE bool all_true(); \
337 FORCEINLINE bool none_true(); \
338 FORCEINLINE svec<LANES,bool> operator|(svec<LANES,bool>); \
339 FORCEINLINE svec<LANES,bool> operator&(svec<LANES,bool> a); \
340 FORCEINLINE svec<LANES,bool> operator^(svec<LANES,bool> a); \
341 FORCEINLINE svec<LANES,bool> operator~(); \
342 FORCEINLINE svec<LANES,bool> operator!(); \
343 FORCEINLINE svec<LANES,bool> operator&&(svec<LANES,bool> a); \
344 FORCEINLINE svec<LANES,bool> operator||(svec<LANES,bool> a);
// Full method set common to every arithmetic element type: operators,
// (masked) load/store, gather/scatter by pointer vector, by base+offset,
// and by stride, plus broadcast/rotate/shuffle/abs.
350 #define VEC_CLASS_METHOD_DECL(STYPE) \
351 VEC_CMP_DECL(STYPE);\
352 VEC_UNARY_DECL(STYPE);\
353 VEC_BIN_DECL(STYPE);\
354 static FORCEINLINE svec<LANES,STYPE> load(svec<LANES,STYPE>* p); \
355 FORCEINLINE void store(svec<LANES,STYPE>* p); \
356 static FORCEINLINE svec<LANES,STYPE> masked_load(svec<LANES,STYPE>* p, svec<LANES,bool> mask); \
357 FORCEINLINE void masked_store(svec<LANES,STYPE>* p, svec<LANES,bool> mask); \
358 static FORCEINLINE svec<LANES,STYPE> load_const(const STYPE* p); \
359 static FORCEINLINE svec<LANES,STYPE> load_and_splat(STYPE* p); \
360 static FORCEINLINE svec<LANES,STYPE> gather(svec<LANES,void*> ptrs, svec<LANES,bool> mask);\
361 FORCEINLINE void scatter(svec<LANES,void*> ptrs, svec<LANES,bool> mask); \
362 static FORCEINLINE svec<LANES,STYPE> gather_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int32_t> offsets, svec<LANES,bool> mask);\
363 static FORCEINLINE svec<LANES,STYPE> gather_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int64_t> offsets, svec<LANES,bool> mask);\
364 FORCEINLINE void scatter_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int32_t> offsets, svec<LANES,bool> mask); \
365 FORCEINLINE void scatter_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int64_t> offsets, svec<LANES,bool> mask); \
366 static FORCEINLINE svec<LANES,STYPE> gather_stride(STYPE* b, int32_t off, int32_t stride);\
367 static FORCEINLINE svec<LANES,STYPE> gather_stride(STYPE* b, int64_t off, int64_t stride);\
368 FORCEINLINE void scatter_stride(STYPE* b, int32_t off, int32_t stride); \
369 FORCEINLINE void scatter_stride(STYPE* b, int64_t off, int64_t stride); \
370 FORCEINLINE svec<LANES,STYPE> broadcast(int32_t index); \
371 FORCEINLINE svec<LANES,STYPE> rotate(int32_t index); \
372 FORCEINLINE svec<LANES,STYPE> shuffle(svec<LANES, int32_t> index); \
373 FORCEINLINE svec<LANES,STYPE> abs();
// Extra methods for integer element types: bitwise ops, shifts (by an
// unsigned-element vector or a scalar count), and modulo.
379 #define VEC_INT_CLASS_METHOD_DECL(STYPE, USTYPE) \
380 FORCEINLINE svec<LANES, STYPE> operator|(svec<LANES, STYPE> a); \
381 FORCEINLINE svec<LANES, STYPE> operator&(svec<LANES, STYPE> a); \
382 FORCEINLINE svec<LANES, STYPE> operator^(svec<LANES, STYPE> a); \
383 FORCEINLINE svec<LANES, STYPE> operator<<(svec<LANES, USTYPE> a); \
384 FORCEINLINE svec<LANES, STYPE> operator<<(int32_t s); \
385 FORCEINLINE svec<LANES, STYPE> operator>>(svec<LANES, USTYPE> a); \
386 FORCEINLINE svec<LANES, STYPE> operator>>(int32_t s); \
387 FORCEINLINE svec<LANES, STYPE> operator%(svec<LANES, STYPE> a); \
388 FORCEINLINE svec<LANES, STYPE> operator%(STYPE s);
// Extra methods for floating-point element types: rounding, roots,
// reciprocals, exp/log/pow.
393 #define VEC_FLOAT_CLASS_METHOD_DECL(STYPE) \
394 FORCEINLINE svec<LANES,STYPE> round(); \
395 FORCEINLINE svec<LANES,STYPE> floor(); \
396 FORCEINLINE svec<LANES,STYPE> ceil(); \
397 FORCEINLINE svec<LANES,STYPE> sqrt(); \
398 FORCEINLINE svec<LANES,STYPE> rcp(); \
399 FORCEINLINE svec<LANES,STYPE> rsqrt(); \
400 FORCEINLINE svec<LANES,STYPE> exp(); \
401 FORCEINLINE svec<LANES,STYPE> log(); \
402 FORCEINLINE svec<LANES,STYPE> pow(svec<LANES,STYPE> a);
// Scalar element read/write helpers; the bodies are not shown in full here.
409 #define INSERT_EXTRACT(STYPE) \
410 static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
413 static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
// Reference load/store: copies lane-by-lane through an element pointer.
// NOTE(review): svec_load's C-style cast strips const from 'p'.
419 #define LOAD_STORE(STYPE) \
426 static FORCEINLINE svec<LANES,STYPE> svec_load(const svec<LANES,STYPE> *p) { \
427 STYPE *ptr = (STYPE *)p; \
428 svec<LANES,STYPE> ret; \
429 INC_STATS_NAME(STATS_LOAD_SLOW, 1, "load:svec_"#STYPE); \
430 for (int i = 0; i < LANES; ++i) {ret[i] = ptr[i];} \
439 static FORCEINLINE void svec_store(svec<LANES,STYPE> *p, svec<LANES,STYPE> v) { \
440 STYPE *ptr = (STYPE *)p; \
441 INC_STATS_NAME(STATS_STORE_SLOW, 1, "store:svec_"#STYPE); \
442 for (int i = 0; i < LANES; ++i) { ptr[i] = v[i]; } \
// Per-lane select: lane i takes a[i] where mask[i] is true, else b[i].
448 #define SELECT(STYPE) \
449 static FORCEINLINE svec<LANES,STYPE> svec_select(svec<LANES,bool> mask, svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
450 svec<LANES,STYPE> ret; \
451 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "select:svec_"#STYPE); \
452 for (int i = 0; i < LANES; ++i) {ret[i] = mask[i] ? a[i] : b[i];} \
// Whole-vector select on a single scalar condition.
459 #define SELECT_BOOLCOND(STYPE) \
463 FORCEINLINE svec<LANES,STYPE> svec_select(bool cond, svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
464 return cond ? a : b; \
// Broadcast: fill every lane with v[index] (generic loop form).
472 #define BROADCAST(STYPE) \
473 static FORCEINLINE svec<LANES,STYPE> svec_broadcast(svec<LANES,STYPE> v, int index) { \
474 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "broadcast"); \
475 STYPE bval = v[index]; \
476 svec<LANES,STYPE> ret; \
477 for (int i = 0; i < LANES; ++i) { ret[i] = bval;} \
// Broadcast, unrolled for the 4-lane case via the 4-argument constructor.
485 #define BROADCAST_L4(STYPE) \
486 static FORCEINLINE svec<LANES,STYPE> svec_broadcast(svec<LANES,STYPE> v, int index) { \
487 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "broadcast"); \
488 STYPE bval = v[index]; \
489 svec<LANES,STYPE> ret(bval,bval,bval,bval); \
// Rotate lanes left by 'index'. NOTE(review): masking with (LANES-1)
// assumes LANES is a power of two.
496 #define ROTATE(STYPE) \
497 static FORCEINLINE svec<LANES,STYPE> svec_rotate(svec<LANES,STYPE> v, int index) { \
498 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "rotate"); \
499 svec<LANES,STYPE> ret; \
500 for (int i = 0; i < LANES; ++i) { ret[i] = v[(i+index) & (LANES-1)];} \
// Rotate, unrolled for 4 lanes (mask 0x3 == LANES-1).
507 #define ROTATE_L4(STYPE) \
508 static FORCEINLINE svec<LANES,STYPE> svec_rotate(svec<LANES,STYPE> v, int index) { \
509 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "rotate"); \
510 svec<LANES,STYPE> ret (v[(0+index) & 0x3], \
511 v[(1+index) & 0x3], \
512 v[(2+index) & 0x3], \
513 v[(3+index) & 0x3]); \
// Shuffle: permute lanes by an index vector (indices wrapped to LANES).
// svec_shuffle2 (two-source shuffle) only warns — not implemented.
521 #define SHUFFLES(STYPE) \
522 static FORCEINLINE svec<LANES,STYPE> svec_shuffle(svec<LANES,STYPE> v, svec<LANES,int32_t> index) { \
523 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "shuffle"); \
524 svec<LANES,STYPE> ret; \
525 for (int i = 0; i < LANES; ++i) { ret[i] = v[index[i] & (LANES-1)]; }\
528 static FORCEINLINE svec<LANES,STYPE> svec_shuffle2(svec<LANES,STYPE> v0, svec<LANES,STYPE> v1, svec<LANES,int32_t> index) { \
529 svec<LANES,STYPE> ret; \
530 NOT_IMPLEMENTED("shuffle 2"); \
// Shuffle, unrolled for 4 lanes; shuffle2 likewise unimplemented.
537 #define SHUFFLES_L4(STYPE) \
538 static FORCEINLINE svec<LANES,STYPE> svec_shuffle(svec<LANES,STYPE> v, svec<LANES,int32_t> index) { \
539 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "shuffle"); \
540 svec<LANES,STYPE> ret (v[index[0] & 0x3], \
543 v[index[3] & 0x3] ); \
546 static FORCEINLINE svec<LANES,STYPE> svec_shuffle2(svec<LANES,STYPE> v0, svec<LANES,STYPE> v1, svec<LANES,int32_t> index) { \
547 svec<LANES,STYPE> ret; \
548 NOT_IMPLEMENTED("shuffle 2"); \
// All-zero vector. NOTE(review): uses the 4-argument constructor, so this
// form only works when LANES == 4 despite the generic-looking name.
555 #define ZERO(STYPE, NAME) \
556 static FORCEINLINE svec<LANES,STYPE> svec_zero(svec<LANES,STYPE>) { \
557 svec<LANES,STYPE> ret(0,0,0,0); \
// load_const / load_and_splat: read one scalar through 'p' and replicate
// it into every lane. Both are declared as function templates dispatched
// on the requested return vector type, then specialized for svec<LANES,STYPE>.
// NOTE(review): load_and_splat's stats label also says "load const" —
// looks like a copy/paste; the counter attribution may be misleading.
562 #define LOAD_CONST(STYPE) \
563 template <class RetVecType> static RetVecType svec_load_const(const STYPE* p); \
565 FORCEINLINE svec<LANES,STYPE> svec_load_const<svec<LANES,STYPE> >(const STYPE* p) { \
566 svec<LANES,STYPE> ret; \
567 INC_STATS_NAME(STATS_LOAD_SLOW, 1, "load const"); \
568 for (int i = 0; i < LANES; ++i) { ret[i] = *p; }\
571 template <class RetVecType> static RetVecType svec_load_and_splat(STYPE* p); \
573 FORCEINLINE svec<LANES,STYPE> svec_load_and_splat<svec<LANES,STYPE> >(STYPE* p) { \
574 svec<LANES,STYPE> ret; \
575 INC_STATS_NAME(STATS_LOAD_SLOW, 1, "load const"); \
576 for (int i = 0; i < LANES; ++i) { ret[i] = *p; }\
// 4-lane gather helper: dereference each lane's pointer where the mask is
// set. NOTE(review): the declaration of 'r' and the function's return-type
// line are not shown here; this helper is hard-coded to 4 lanes.
584 template<
typename RetVec,
typename RetScalar,
typename PTRS,
typename MSK>
586 lGatherGeneral(PTRS ptrs, MSK mask) {
588 if(svec_extract(mask,0)) { r[0] = *((RetScalar*)svec_extract(ptrs, 0));}
589 if(svec_extract(mask,1)) { r[1] = *((RetScalar*)svec_extract(ptrs, 1));}
590 if(svec_extract(mask,2)) { r[2] = *((RetScalar*)svec_extract(ptrs, 2));}
591 if(svec_extract(mask,3)) { r[3] = *((RetScalar*)svec_extract(ptrs, 3));}
593 return RetVec(r[0],r[1],r[2],r[3]);
// Generic gather through a vector of pointers, masked per lane.
// NOTE(review): the trace label "Gather genera" is truncated ("general").
602 #define GATHER_GENERAL(STYPE, PSTYPE) \
604 FORCEINLINE svec<LANES,STYPE> svec_gather<svec<LANES,STYPE> >(svec<LANES,PSTYPE> ptrs, svec<LANES,bool> mask) { \
605 svec<LANES,STYPE> ret;\
606 for(int i = 0; i < LANES; ++i) {if(mask[i]){ret[i] = *(STYPE*)(ptrs[i]); } }\
607 INC_STATS_NAME(STATS_GATHER_SLOW, 1, "Gather genera"); \
// 4-lane specialization delegating to lGatherGeneral above.
617 #define GATHER_GENERAL_L4(STYPE, PSTYPE) \
619 FORCEINLINE svec<LANES,STYPE> svec_gather<svec<LANES,STYPE> >(svec<LANES,PSTYPE> ptrs, svec<LANES,bool> mask) { \
620 return lGatherGeneral<svec<LANES,STYPE>, STYPE, svec<LANES,PSTYPE>, svec<LANES,bool> >(ptrs, mask); \
// 4-lane gather from base pointer + scaled per-lane byte offsets.
// NOTE(review): the loads below are unconditional; any masking (e.g.
// neutralizing offsets of inactive lanes) would have to happen on lines
// not shown here — confirm before relying on mask semantics.
631 template<
typename RetVec,
typename RetScalar,
typename OFF,
typename MSK>
633 lGatherBaseOffsets(
unsigned char *p, uint32_t scale,
634 OFF offsets, MSK mask) {
639 r[0] = *(RetScalar *)(p + scale * svec_extract(offsets, 0));
640 r[1] = *(RetScalar *)(p + scale * svec_extract(offsets, 1));
641 r[2] = *(RetScalar *)(p + scale * svec_extract(offsets, 2));
642 r[3] = *(RetScalar *)(p + scale * svec_extract(offsets, 3));
644 return RetVec(r[0], r[1], r[2], r[3]);
// Generic base+offsets gather: lane i reads b + scale*offsets[i] when
// mask[i] is set (offsets are in bytes after scaling).
647 #define GATHER_BASE_OFFSETS(STYPE, OSTYPE) \
648 FORCEINLINE svec<LANES,STYPE> svec_gather_base_offsets(STYPE* b, uint32_t scale, svec<LANES,OSTYPE> offsets, svec<LANES,bool> mask) { \
649 svec<LANES,STYPE> ret;\
650 for(int i = 0; i < LANES; ++i) {if(mask[i]){ret[i] = *(STYPE*)((uint8_t*)b + scale * offsets[i]);} }\
651 INC_STATS_NAME(STATS_GATHER_SLOW,1, "Gather offset with select"); \
// 4-lane specialization delegating to lGatherBaseOffsets above.
658 #define GATHER_BASE_OFFSETS_L4(STYPE, OSTYPE) \
659 FORCEINLINE svec<LANES,STYPE> svec_gather_base_offsets(STYPE* b, uint32_t scale, svec<LANES,OSTYPE> offsets, svec<LANES,bool> mask) { \
660 return lGatherBaseOffsets<svec<LANES,STYPE>, STYPE, svec<LANES,OSTYPE>, svec<LANES,bool> >((uint8_t*)b, scale, offsets, mask); \
// Strided gather: lane i reads b[off + i*stride] (loop form advances the
// base pointer by 's' per iteration; 'off' handling is on lines not shown).
666 #define GATHER_STRIDE(STYPE, OSTYPE) \
667 template <class RetVecType> static RetVecType svec_gather_stride(STYPE* b, OSTYPE o, OSTYPE s); \
669 FORCEINLINE svec<LANES,STYPE> svec_gather_stride<svec<LANES,STYPE> >(STYPE* b, OSTYPE o, OSTYPE s) { \
670 svec<LANES,STYPE> ret; \
672 for(int i = 0; i < LANES; ++i, b+=s) { \
675 INC_STATS_NAME(STATS_GATHER_SLOW,1, "Gather Steps"); \
// Strided gather, unrolled for 4 lanes. Offset/stride are widened to
// int64_t before address arithmetic. NOTE(review): 'stride2' is declared
// as OSTYPE, which narrows back from int64_t for 32-bit OSTYPE.
682 #define GATHER_STRIDE_L4(STYPE, OSTYPE) \
683 template <class RetVecType> static RetVecType svec_gather_stride(STYPE* b, OSTYPE o, OSTYPE s); \
685 FORCEINLINE svec<LANES,STYPE> svec_gather_stride<svec<LANES,STYPE> >(STYPE* b, OSTYPE o, OSTYPE s) { \
686 int64_t off = (int64_t)o; int64_t stride = (int64_t)s;\
687 OSTYPE stride2 = stride * 2; \
688 STYPE v0 = *(b + off); \
689 STYPE v1 = *(b + off + stride); \
690 STYPE v2 = *(b + off + stride2); \
691 STYPE v3 = *(b + off + stride2 + stride); \
692 return svec<LANES,STYPE>(v0, v1, v2, v3); \
// Strided scatter: lane i writes val[i] to b[off + i*stride] (loop form).
705 #define SCATTER_STRIDE(STYPE, OSTYPE) \
706 FORCEINLINE void svec_scatter_stride(STYPE* b, OSTYPE o, OSTYPE s, svec<LANES,STYPE> val) { \
708 for(int i = 0; i < LANES; ++i, b+=s) { \
709 *b = svec_extract(val, i); \
711 INC_STATS_NAME(STATS_SCATTER_SLOW,1, "scatter stride general svec<LANES,"#STYPE">"); \
// Strided scatter, unrolled for 4 lanes (same stride2 narrowing caveat).
715 #define SCATTER_STRIDE_L4(STYPE, OSTYPE) \
716 FORCEINLINE void svec_scatter_stride(STYPE* b, OSTYPE o, OSTYPE s, svec<LANES,STYPE> val) { \
717 int64_t off = (int64_t)o; int64_t stride = (int64_t)s;\
718 OSTYPE stride2 = stride * 2; \
719 *(b + off) = svec_extract(val, 0); \
720 *(b + off + stride) = svec_extract(val, 1); \
721 *(b + off + stride2) = svec_extract(val, 2); \
722 *(b + off + stride2 + stride) = svec_extract(val, 3); \
// Generic scatter through a vector of pointers, masked per lane.
736 #define SCATTER_GENERAL(STYPE, PSTYPE) \
737 static FORCEINLINE void svec_scatter(svec<LANES,PSTYPE> ptrs, svec<LANES,STYPE> val, svec<LANES,bool> mask) { \
738 for(int i = 0; i < LANES; ++i) { if(mask[i]){ *((STYPE*)ptrs[i]) = val[i];} } \
739 INC_STATS_NAME(STATS_SCATTER_SLOW,1, "scatter general svec<LANES,"#STYPE">"); \
// 4-lane scatter helper: write each active lane through its pointer.
746 template<
typename STYPE,
typename PTRTYPE,
typename VTYPE,
typename MTYPE>
747 static FORCEINLINE void lScatterGeneral(PTRTYPE ptrs,
748 VTYPE val, MTYPE mask) {
749 if(svec_extract(mask,0)) { *((STYPE*)svec_extract(ptrs, 0)) = val[0]; }
750 if(svec_extract(mask,1)) { *((STYPE*)svec_extract(ptrs, 1)) = val[1]; }
751 if(svec_extract(mask,2)) { *((STYPE*)svec_extract(ptrs, 2)) = val[2]; }
752 if(svec_extract(mask,3)) { *((STYPE*)svec_extract(ptrs, 3)) = val[3]; }
// 4-lane specialization delegating to lScatterGeneral above.
756 #define SCATTER_GENERAL_L4(STYPE, PSTYPE) \
757 static FORCEINLINE void svec_scatter(svec<LANES,PSTYPE> ptrs, svec<LANES,STYPE> val, svec<LANES,bool> mask) { \
758 lScatterGeneral<STYPE, svec<LANES,PSTYPE>, svec<LANES,STYPE>, svec<LANES,bool> >(ptrs, val, mask); \
// Generic base+offsets scatter: lane i writes b + scale*offsets[i] when
// mask[i] is set.
765 #define SCATTER_BASE_OFFSETS(STYPE, OSTYPE) \
766 FORCEINLINE void svec_scatter_base_offsets(STYPE* b, uint32_t scale, svec<LANES,OSTYPE> offsets, svec<LANES,STYPE> val, svec<LANES,bool> mask) { \
767 for(int i=0;i<LANES;++i){if(mask[i]){*(STYPE*)((uint8_t*)b + scale * offsets[i]) = val[i];}}\
768 INC_STATS_NAME(STATS_SCATTER_SLOW,1,"scatter offset svec<LANES,"#STYPE">"); \
// 4-lane base+offsets scatter helper.
// NOTE(review): local 'base' is never used afterwards in the visible code.
774 template<
typename STYPE,
typename OTYPE,
typename VTYPE,
typename MTYPE>
775 static FORCEINLINE void lScatterBaseOffsets(
unsigned char *b,
776 uint32_t scale, OTYPE offsets,
777 VTYPE val, MTYPE mask) {
778 unsigned char *base = b;
779 if(svec_extract(mask,0)) { *(STYPE*)(b + scale * svec_extract(offsets, 0)) = val[0]; }
780 if(svec_extract(mask,1)) { *(STYPE*)(b + scale * svec_extract(offsets, 1)) = val[1]; }
781 if(svec_extract(mask,2)) { *(STYPE*)(b + scale * svec_extract(offsets, 2)) = val[2]; }
782 if(svec_extract(mask,3)) { *(STYPE*)(b + scale * svec_extract(offsets, 3)) = val[3]; }
// 4-lane specialization delegating to lScatterBaseOffsets above.
789 #define SCATTER_BASE_OFFSETS_L4(STYPE, OSTYPE) \
790 FORCEINLINE void svec_scatter_base_offsets(STYPE* b, uint32_t scale, svec<LANES,OSTYPE> offsets, svec<LANES,STYPE> val, svec<LANES,bool> mask) { \
791 lScatterBaseOffsets<STYPE, svec<LANES,OSTYPE>, svec<LANES,STYPE>, svec<LANES,bool> >((uint8_t*)b, scale, offsets, val, mask); \
// Masked load/store expressed as gather/scatter with identity offsets
// 0..3 (4 lanes) or 0..7 (8 lanes) scaled by sizeof(STYPE).
797 #define MASKED_LOAD_STORE_L4(STYPE) \
798 static FORCEINLINE svec<LANES,STYPE> svec_masked_load(svec<LANES,STYPE> *p, svec<LANES,bool> mask) { \
799 return svec_gather_base_offsets((STYPE*)p, sizeof(STYPE), svec<LANES,int32_t>(0,1,2,3), mask); \
801 static FORCEINLINE void svec_masked_store(svec<LANES,STYPE> *p, svec<LANES,STYPE> v, svec<LANES,bool> mask) { \
802 svec_scatter_base_offsets((STYPE*)p, sizeof(STYPE), svec<LANES,int32_t>(0,1,2,3), v, mask); \
805 #define MASKED_LOAD_STORE_L8(STYPE) \
806 static FORCEINLINE svec<LANES,STYPE> svec_masked_load(svec<LANES,STYPE> *p, svec<LANES,bool> mask) { \
807 return svec_gather_base_offsets((STYPE*)p, sizeof(STYPE), svec<LANES,int32_t>(0,1,2,3,4,5,6,7), mask); \
809 static FORCEINLINE void svec_masked_store(svec<LANES,STYPE> *p, svec<LANES,STYPE> v, svec<LANES,bool> mask) { \
810 svec_scatter_base_offsets((STYPE*)p, sizeof(STYPE), svec<LANES,int32_t>(0,1,2,3,4,5,6,7), v, mask); \
// Tail of a scalar absolute-value helper (its signature is on a line not
// shown here); returns the magnitude of 'a'.
830 return a >= 0 ? a : -a;
// Element-wise unary op: apply OP to every lane (generic loop form).
833 #define UNARY_OP(STYPE, NAME, OP) \
834 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> v) { \
835 INC_STATS_NAME(STATS_UNARY_SLOW, 1, #OP); \
836 svec<LANES,STYPE> ret; \
837 for (int i = 0; i < LANES; ++i) { ret[i] = OP(v[i]); } \
// Element-wise unary op, unrolled for 4 lanes.
841 #define UNARY_OP_L4(STYPE, NAME, OP) \
842 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> v) { \
843 INC_STATS_NAME(STATS_UNARY_SLOW, 1, #OP); \
844 return svec<LANES,STYPE>(OP(svec_extract(v, 0)),\
845 OP(svec_extract(v, 1)),\
846 OP(svec_extract(v, 2)),\
847 OP(svec_extract(v, 3)));\
// Element-wise binary operator (a OP b), both operands the same type.
853 #define BINARY_OP(STYPE, NAME, OP) \
854 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
855 INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
856 svec<LANES,STYPE> ret; \
857 for (int i = 0; i < LANES; ++i) { ret[i] = a[i] OP b[i]; } \
// Element-wise binary operator with a second operand of a different
// element type (e.g. shifts by an unsigned-element vector).
861 #define BINARY_OP2(STYPE, STYPE2, NAME, OP) \
862 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE2> b) { \
863 INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
864 svec<LANES,STYPE> ret; \
865 for (int i = 0; i < LANES; ++i) { ret[i] = a[i] OP b[i]; } \
// Element-wise binary op expressed as a function call FUNC(a[i], b[i]).
869 #define BINARY_OP_FUNC(STYPE, NAME, FUNC) \
870 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
871 INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
872 svec<LANES,STYPE> ret; \
873 for (int i = 0; i < LANES; ++i) { ret[i] = FUNC(a[i], b[i]); } \
// 4-lane unrolled variants of the three binary-op forms above.
880 #define BINARY_OP_L4(STYPE, NAME, OP) \
881 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
882 INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
883 svec<LANES,STYPE> ret(svec_extract(a, 0) OP svec_extract(b, 0),\
884 svec_extract(a, 1) OP svec_extract(b, 1),\
885 svec_extract(a, 2) OP svec_extract(b, 2),\
886 svec_extract(a, 3) OP svec_extract(b, 3));\
893 #define BINARY_OP2_L4(STYPE, STYPE2, NAME, OP) \
894 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE2> b) { \
895 INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
896 svec<LANES,STYPE> ret(svec_extract(a, 0) OP svec_extract(b, 0),\
897 svec_extract(a, 1) OP svec_extract(b, 1),\
898 svec_extract(a, 2) OP svec_extract(b, 2),\
899 svec_extract(a, 3) OP svec_extract(b, 3));\
904 #define BINARY_OP_FUNC_L4(STYPE, NAME, FUNC) \
905 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
906 INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
907 svec<LANES,STYPE> ret(FUNC(svec_extract(a, 0), svec_extract(b, 0)),\
908 FUNC(svec_extract(a, 1), svec_extract(b, 1)),\
909 FUNC(svec_extract(a, 2), svec_extract(b, 2)),\
910 FUNC(svec_extract(a, 3), svec_extract(b, 3))); \
// Vector OP scalar forms: the scalar is applied to every lane.
917 #define BINARY_OP_SCALAR_L4(STYPE, STYPE2, NAME, OP) \
918 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, STYPE2 s) { \
919 INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
920 svec<LANES,STYPE> ret(svec_extract(a, 0) OP s,\
921 svec_extract(a, 1) OP s,\
922 svec_extract(a, 2) OP s,\
923 svec_extract(a, 3) OP s);\
930 #define BINARY_OP_SCALAR(STYPE, NAME, OP) \
931 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, STYPE s) { \
932 INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
933 svec<LANES,STYPE> ret; \
934 for (int i = 0; i < LANES; ++i) { ret[i] = a[i] OP s; } \
// Shift-by-scalar form: the shift count has its own type (SHTTYPE).
940 #define BINARY_SHT_SCALAR(STYPE, SHTTYPE, NAME, OP) \
941 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, SHTTYPE s) { \
942 INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
943 svec<LANES,STYPE> ret; \
944 for (int i = 0; i < LANES; ++i) { ret[i] = a[i] OP s; } \
// Scalar OP vector form (scalar on the left, for non-commutative ops).
950 #define BINARY_SCALAR_OP(STYPE, NAME, OP) \
951 static FORCEINLINE svec<LANES,STYPE> NAME(STYPE s, svec<LANES,STYPE> a) { \
952 INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
953 svec<LANES,STYPE> ret; \
954 for (int i = 0; i < LANES; ++i) { ret[i] = s OP a[i]; }\
// Fused-style ternary ops (generic loop form):
//   madd  = a*b + c,  msub = a*b - c,  nmsub = -(a*b - c).
958 #define TERNERY(STYPE) \
962 FORCEINLINE svec<LANES,STYPE> svec_madd(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
963 svec<LANES,STYPE> res; \
964 for(int i = 0; i < LANES; ++i) { res[i] = a[i]*b[i]+c[i]; } \
970 FORCEINLINE svec<LANES,STYPE> svec_msub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
971 svec<LANES,STYPE> res; \
972 for(int i = 0; i < LANES; ++i) { res[i] = a[i]*b[i]-c[i]; } \
978 FORCEINLINE svec<LANES,STYPE> svec_nmsub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
979 svec<LANES,STYPE> res; \
980 for(int i = 0; i < LANES; ++i) { res[i] = -(a[i]*b[i]-c[i]); } \
// Same three ternary ops, unrolled for 4 lanes.
984 #define TERNERY_L4(STYPE) \
988 FORCEINLINE svec<LANES,STYPE> svec_madd(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
989 svec<LANES,STYPE> ret(svec_extract(a, 0) * svec_extract(b, 0) + svec_extract(c, 0),\
990 svec_extract(a, 1) * svec_extract(b, 1) + svec_extract(c, 1),\
991 svec_extract(a, 2) * svec_extract(b, 2) + svec_extract(c, 2),\
992 svec_extract(a, 3) * svec_extract(b, 3) + svec_extract(c, 3));\
998 FORCEINLINE svec<LANES,STYPE> svec_msub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
999 svec<LANES,STYPE> ret(svec_extract(a, 0) * svec_extract(b, 0) - svec_extract(c, 0),\
1000 svec_extract(a, 1) * svec_extract(b, 1) - svec_extract(c, 1),\
1001 svec_extract(a, 2) * svec_extract(b, 2) - svec_extract(c, 2),\
1002 svec_extract(a, 3) * svec_extract(b, 3) - svec_extract(c, 3));\
1008 FORCEINLINE svec<LANES,STYPE> svec_nmsub(svec<LANES,STYPE> a, svec<LANES,STYPE> b, svec<LANES,STYPE> c) { \
1009 svec<LANES,STYPE> ret(- (svec_extract(a, 0) * svec_extract(b, 0) - svec_extract(c, 0)),\
1010 - (svec_extract(a, 1) * svec_extract(b, 1) - svec_extract(c, 1)),\
1011 - (svec_extract(a, 2) * svec_extract(b, 2) - svec_extract(c, 2)),\
1012 - (svec_extract(a, 3) * svec_extract(b, 3) - svec_extract(c, 3)));\
// Scalar combiners used by the reduction macros below.
1021 template<
class T>
static FORCEINLINE T add(T a, T b) {
1024 template<
class T>
static FORCEINLINE T max(T a, T b) {
1025 return a > b ? a : b;
1027 template<
class T>
static FORCEINLINE T min(T a, T b) {
1028 return a < b ? a : b;
// Horizontal reduction: fold FUNC across all lanes (loop form; the
// accumulator's initialization from a[0] is on a line not shown).
1031 #define BINARY_OP_REDUCE_FUNC(STYPE, NAME, FUNC) \
1032 static FORCEINLINE STYPE NAME(svec<LANES,STYPE> a) { \
1033 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "reduce"); \
1035 for(int i = 1; i < LANES; ++i) { r = FUNC(r, a[i]); } \
// Horizontal reduction, unrolled for 4 lanes.
1039 #define BINARY_OP_REDUCE_FUNC_L4(STYPE, NAME, FUNC) \
1040 static FORCEINLINE STYPE NAME(svec<LANES,STYPE> a) { \
1041 INC_STATS_NAME(STATS_OTHER_SLOW, 1, "reduce"); \
1042 return FUNC(FUNC(FUNC(a[0], a[1]), a[2]), a[3]); \
// Lane-wise comparison producing a bool vector (generic loop form).
// NOTE(review): this form counts under STATS_BINARY_SLOW while the L4
// form below uses STATS_COMPARE_SLOW — inconsistent attribution.
1049 #define CMP_OP(STYPE, NAME, OP) \
1050 static FORCEINLINE svec<LANES,bool> svec_##NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
1051 INC_STATS_NAME(STATS_BINARY_SLOW, 1, #NAME); \
1052 svec<LANES,bool> ret; \
1053 for (int i = 0; i < LANES; ++i) { ret[i] = a[i] OP b[i]; } \
// Lane-wise comparison, unrolled for 4 lanes.
1057 #define CMP_OP_L4(STYPE, NAME, OP) \
1058 static FORCEINLINE svec<LANES,bool> svec_##NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
1059 INC_STATS_NAME(STATS_COMPARE_SLOW, 1, #NAME); \
1060 uint32_t r0 = (a[0] OP b[0]); \
1061 uint32_t r1 = (a[1] OP b[1]); \
1062 uint32_t r2 = (a[2] OP b[2]); \
1063 uint32_t r3 = (a[3] OP b[3]); \
1064 return svec<LANES,bool>(r0,r1,r2,r3); \
// Masked comparison: result of the unmasked compare AND'ed with the mask.
1070 #define CMP_MASKED_OP(STYPE, NAME, OP) \
1075 FORCEINLINE svec<LANES,bool> svec_masked_##NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b, \
1076 svec<LANES,bool> mask) { \
1077 return svec_and(svec_##NAME(a,b) , mask); \
// Convenience bundles instantiating all six comparisons at once.
1083 #define CMP_ALL_NOMASK_OP(STYPE) \
1084 CMP_OP(STYPE, equal, ==) \
1085 CMP_OP(STYPE, not_equal, !=) \
1086 CMP_OP(STYPE, less_than, <) \
1087 CMP_OP(STYPE, less_equal, <=) \
1088 CMP_OP(STYPE, greater_than, >) \
1089 CMP_OP(STYPE, greater_equal, >=)
1091 #define CMP_ALL_NOMASK_OP_L4(STYPE) \
1092 CMP_OP_L4(STYPE, equal, ==) \
1093 CMP_OP_L4(STYPE, not_equal, !=) \
1094 CMP_OP_L4(STYPE, less_than, <) \
1095 CMP_OP_L4(STYPE, less_equal, <=) \
1096 CMP_OP_L4(STYPE, greater_than, >) \
1097 CMP_OP_L4(STYPE, greater_equal, >=)
1099 #define CMP_ALL_MASKED_OP(STYPE) \
1100 CMP_MASKED_OP(STYPE, equal, ==) \
1101 CMP_MASKED_OP(STYPE, not_equal, !=) \
1102 CMP_MASKED_OP(STYPE, less_than, <) \
1103 CMP_MASKED_OP(STYPE, less_equal, <=) \
1104 CMP_MASKED_OP(STYPE, greater_than, >) \
1105 CMP_MASKED_OP(STYPE, greater_equal, >=)
1107 #define CMP_ALL_OP(STYPE) \
1108 CMP_ALL_NOMASK_OP(STYPE) \
1109 CMP_ALL_MASKED_OP(STYPE)
// Value conversion between element types, lane by lane, dispatched on the
// requested destination vector type (generic loop form).
1112 #define CAST(SFROM, STO) \
1113 template <class T> static T svec_cast(svec<LANES,SFROM> val); \
1117 template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
1118 INC_STATS_NAME(STATS_CAST_SLOW, 1, "svec<LANES,"#SFROM">-svec<LANES,"#STO">"); \
1119 svec<LANES,STO> ret; \
1120 for (int i = 0; i < LANES; ++i) { ret[i] = (STO)val[i]; } \
// Value conversion, unrolled for 4 lanes.
1124 #define CAST_L4(SFROM, STO) \
1125 template <class T> static T svec_cast(svec<LANES,SFROM> val); \
1129 template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
1130 INC_STATS_NAME(STATS_CAST_SLOW, 1, "svec<LANES,"#SFROM">-svec<LANES,"#STO">"); \
1131 return svec<LANES,STO>((STO)val[0],(STO)val[1],(STO)val[2],(STO)val[3]); \
// Bit-pattern reinterpretation between same-width types, done per lane
// through a BitcastUnion (union declared elsewhere in this header).
1143 #define CAST_BITS(SFROM, FROM_F, STO, TO_F) \
1144 template <class T> static T svec_cast_bits(svec<LANES,SFROM> val); \
1145 template <> FORCEINLINE svec<LANES,STO> svec_cast_bits<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
1146 INC_STATS_NAME(STATS_CAST_SLOW, 1, "svec<LANES,"#SFROM">-svec<LANES,"#STO">"); \
1147 BitcastUnion u[LANES]; \
1148 svec<LANES,STO> ret; \
1149 for(int i = 0; i < LANES; ++i) {u[i].FROM_F = val[i]; ret[i] = u[i].TO_F;} \
// Out-of-class definitions of operator[]. The mutable overload is counted
// as an "insert" because returning a reference permits writes.
1160 #define SUBSCRIPT_FUNC_IMPL(STYPE) \
1161 FORCEINLINE STYPE& svec<LANES,STYPE>::operator[](int index) { \
1162 INC_STATS_NAME(STATS_INSERT, 1, "insert "#STYPE); \
1165 const FORCEINLINE STYPE svec<LANES,STYPE>::operator[](int index) const { \
// Comparison operators forward to the free svec_* comparison functions.
1175 #define VEC_CMP_IMPL(STYPE) \
1181 FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator==(svec<LANES,STYPE> a) { return svec_equal(*this, a); } \
1187 FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator!=(svec<LANES,STYPE> a) { return svec_not_equal(*this, a); } \
1193 FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator<(svec<LANES,STYPE> a) { return svec_less_than(*this, a); } \
1199 FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator<=(svec<LANES,STYPE> a) { return svec_less_equal(*this, a); } \
1205 FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator>(svec<LANES,STYPE> a) { return svec_greater_than(*this, a); } \
1211 FORCEINLINE svec<LANES,bool> svec<LANES,STYPE>::operator>=(svec<LANES,STYPE> a) { return svec_greater_equal(*this, a); }
// Negation and horizontal reductions forward to the free functions.
1213 #define VEC_UNARY_IMPL(STYPE) \
1217 FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator-() {return svec_neg(*this); } \
1221 FORCEINLINE STYPE svec<LANES,STYPE>::reduce_add() {return svec_reduce_add(*this); } \
1225 FORCEINLINE STYPE svec<LANES,STYPE>::reduce_max() {return svec_reduce_max(*this); } \
1229 FORCEINLINE STYPE svec<LANES,STYPE>::reduce_min() {return svec_reduce_min(*this); }
// Arithmetic operators — member forms plus free scalar-on-the-left forms —
// all forwarding to the free svec_* implementations.
1232 #define VEC_BIN_IMPL(STYPE) \
1236 FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator+(svec<LANES,STYPE> a) { return svec_add(*this, a); } \
1240 FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator+(STYPE s) { return svec_add_scalar(*this, s); } \
1244 FORCEINLINE svec<LANES,STYPE> operator+(STYPE s, svec<LANES,STYPE> a) {return svec_scalar_add(s, a);} \
1248 FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator-(svec<LANES,STYPE> a) { return svec_sub(*this, a); } \
1252 FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator-(STYPE s) { return svec_sub_scalar(*this, s); } \
1256 FORCEINLINE svec<LANES,STYPE> operator-(STYPE s, svec<LANES,STYPE> a) {return svec_scalar_sub(s, a);} \
1260 FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator*(svec<LANES,STYPE> a) { return svec_mul(*this, a); } \
1264 FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator*(STYPE s) { return svec_mul_scalar(*this, s) ;} \
1268 FORCEINLINE svec<LANES,STYPE> operator*(STYPE s, svec<LANES,STYPE> a) {return svec_scalar_mul(s, a);} \
1272 FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator/(svec<LANES,STYPE> a) { return svec_div(*this, a); } \
1276 FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator/(STYPE s) { return svec_div_scalar(*this, s) ;} \
1280 FORCEINLINE svec<LANES,STYPE> operator/(STYPE s, svec<LANES,STYPE> a) {return svec_scalar_div(s, a);} \
// Basic (unmasked) load/store member definitions.
1285 #define MVEC_CLASS_METHOD_IMPL(STYPE) \
1292 FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::load(svec<LANES,STYPE>* p){ return svec_load(p); } \
1298 FORCEINLINE void svec<LANES,STYPE>::store(svec<LANES,STYPE>* p){ svec_store(p, *this); }
/**
 * VEC_CLASS_METHOD_IMPL(STYPE)
 * Expands to the full set of out-of-line member definitions shared by
 * every vector element type:
 *  - plain and masked load/store,
 *  - the unary and binary operator sets (via VEC_UNARY_IMPL/VEC_BIN_IMPL),
 *  - const-load and load-and-splat,
 *  - gather/scatter by pointer vector, by base + scale * (32- or 64-bit)
 *    offsets, and by constant stride,
 *  - lane permutations (broadcast, rotate, shuffle) and abs().
 * Every definition forwards to the corresponding svec_* intrinsic
 * wrapper; the explicit template argument svec<LANES,STYPE> is needed
 * where the wrapper cannot deduce the return vector type from its
 * arguments (load_const, load_and_splat, gather, gather_stride).
 */
#define VEC_CLASS_METHOD_IMPL(STYPE) \
  MVEC_CLASS_METHOD_IMPL(STYPE); \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::masked_load(svec<LANES,STYPE>* p, svec<LANES,bool> mask) { return svec_masked_load(p, mask); } \
  FORCEINLINE void svec<LANES,STYPE>::masked_store(svec<LANES,STYPE>* p, svec<LANES,bool> mask) { svec_masked_store(p, *this, mask); } \
  VEC_UNARY_IMPL(STYPE); \
  VEC_BIN_IMPL(STYPE); \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::load_const(const STYPE* p) { return svec_load_const<svec<LANES,STYPE> >(p); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::load_and_splat(STYPE* p) { return svec_load_and_splat<svec<LANES,STYPE> >(p); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::gather(svec<LANES,void*> ptrs, svec<LANES,bool> mask) { return svec_gather<svec<LANES,STYPE> >(ptrs, mask); } \
  FORCEINLINE void svec<LANES,STYPE>::scatter(svec<LANES,void*> ptrs, svec<LANES,bool> mask) { svec_scatter(ptrs, *this, mask); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::gather_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int32_t> offsets, svec<LANES,bool> mask) { \
    return svec_gather_base_offsets(b, scale, offsets, mask); \
  } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::gather_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int64_t> offsets, svec<LANES,bool> mask) { \
    return svec_gather_base_offsets(b, scale, offsets, mask); \
  } \
  FORCEINLINE void svec<LANES,STYPE>::scatter_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int32_t> offsets, svec<LANES,bool> mask) { \
    svec_scatter_base_offsets(b, scale, offsets, *this, mask); \
  } \
  FORCEINLINE void svec<LANES,STYPE>::scatter_base_offsets(STYPE* b, uint32_t scale, svec<LANES,int64_t> offsets, svec<LANES,bool> mask) { \
    svec_scatter_base_offsets(b, scale, offsets, *this, mask); \
  } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::gather_stride(STYPE* b, int32_t off, int32_t stride) { \
    return svec_gather_stride<svec<LANES,STYPE> >(b, off, stride); \
  } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::gather_stride(STYPE* b, int64_t off, int64_t stride) { \
    return svec_gather_stride<svec<LANES,STYPE> >(b, off, stride); \
  } \
  FORCEINLINE void svec<LANES,STYPE>::scatter_stride(STYPE* b, int32_t off, int32_t stride) { \
    svec_scatter_stride(b, off, stride, *this); \
  } \
  FORCEINLINE void svec<LANES,STYPE>::scatter_stride(STYPE* b, int64_t off, int64_t stride) { \
    svec_scatter_stride(b, off, stride, *this); \
  } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::broadcast(int32_t index) { return svec_broadcast(*this, index); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::rotate(int32_t index) { return svec_rotate(*this, index); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::shuffle(svec<LANES,int32_t> index) { return svec_shuffle(*this, index); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::abs() { return svec_abs(*this); }
/**
 * VEC_INT_CLASS_METHOD_IMPL(STYPE, STYPE2)
 * Expands to the integer-only member operator definitions of
 * svec<LANES,STYPE>:
 *  - bitwise |, &, ^ (vector with vector),
 *  - shifts << and >> by a per-lane shift-count vector of element type
 *    STYPE2 (presumably the unsigned counterpart of STYPE — confirm at
 *    the expansion sites) or by a single int32_t count,
 *  - remainder % by vector and by scalar.
 * All forward to the corresponding svec_* intrinsic wrappers.
 */
#define VEC_INT_CLASS_METHOD_IMPL(STYPE, STYPE2) \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator|(svec<LANES,STYPE> a) { return svec_or(*this, a); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator&(svec<LANES,STYPE> a) { return svec_and(*this, a); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator^(svec<LANES,STYPE> a) { return svec_xor(*this, a); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator<<(svec<LANES,STYPE2> a) { return svec_shl(*this, a); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator<<(int32_t s) { return svec_shl(*this, s); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator>>(svec<LANES,STYPE2> a) { return svec_shr(*this, a); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator>>(int32_t s) { return svec_shr(*this, s); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator%(svec<LANES,STYPE> a) { return svec_rem(*this, a); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::operator%(STYPE s) { return svec_rem(*this, s); }
/**
 * VEC_FLOAT_CLASS_METHOD_IMPL(STYPE)
 * Expands to the floating-point-only member definitions of
 * svec<LANES,STYPE>: rounding (round/floor/ceil), sqrt and its
 * reciprocal forms (rcp/rsqrt), and the transcendental functions
 * exp/log/pow. Each forwards to the corresponding svec_* intrinsic
 * wrapper for the concrete LANES/STYPE pair.
 */
#define VEC_FLOAT_CLASS_METHOD_IMPL(STYPE) \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::round() { return svec_round(*this); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::floor() { return svec_floor(*this); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::ceil() { return svec_ceil(*this); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::sqrt() { return svec_sqrt(*this); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::rcp() { return svec_rcp(*this); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::rsqrt() { return svec_rsqrt(*this); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::exp() { return svec_exp(*this); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::log() { return svec_log(*this); } \
  FORCEINLINE svec<LANES,STYPE> svec<LANES,STYPE>::pow(svec<LANES,STYPE> a) { return svec_pow(*this, a); }
uint32_t svec1_u32
Definition: gsimd_utility.h:188
void stdout_scalar< uint8_t >(std::ostream &out, uint8_t v)
Definition: gsimd_utility.h:226
Definition: gsimd_utility.h:93
double svec1_d
Definition: gsimd_utility.h:193
int32_t i32
Definition: gsimd_utility.h:1135
void stdout_scalar(std::ostream &out, T v)
Definition: gsimd_utility.h:218
uint64_t u64
Definition: gsimd_utility.h:1139
svec< 4, bool > svec_select(svec< 4, bool > mask, svec< 4, bool > a, svec< 4, bool > b)
construct c by selecting elements from two input vectors according to the mask
Definition: power_vsx4.h:1126
void type
Definition: gsimd_utility.h:94
float svec1_f
Definition: gsimd_utility.h:192
int8_t svec1_i8
Definition: gsimd_utility.h:185
const bool check_lanes< 16 >(int n)
Definition: gsimd_utility.h:236
#define INC_STATS_NAME(stat, inc, opname)
Definition: gsimd_utility.h:156
int32_t svec1_i32
Definition: gsimd_utility.h:189
const bool check_lanes< 2 >(int n)
Definition: gsimd_utility.h:233
const bool check_lanes< 8 >(int n)
Definition: gsimd_utility.h:235
#define DEFINE_TYPE_NAME(type, name)
Definition: gsimd_utility.h:202
int64_t svec1_i64
Definition: gsimd_utility.h:191
const bool check_lanes(int n)
float f
Definition: gsimd_utility.h:1137
uint16_t svec1_u16
Definition: gsimd_utility.h:186
void stdout_scalar< int8_t >(std::ostream &out, int8_t v)
Definition: gsimd_utility.h:222
Definition: gsimd_utility.h:1134
uint8_t svec1_u8
Definition: gsimd_utility.h:184
const char * iu_get_type_name()
int16_t svec1_i16
Definition: gsimd_utility.h:187
uint32_t u32
Definition: gsimd_utility.h:1136
const bool check_lanes< 4 >(int n)
Definition: gsimd_utility.h:234
double d
Definition: gsimd_utility.h:1140
#define FORCEINLINE
Definition: gsimd_utility.h:175
uint64_t svec1_u64
Definition: gsimd_utility.h:190
int64_t i64
Definition: gsimd_utility.h:1138