105 #ifndef POWER_VSX4_H_
106 #define POWER_VSX4_H_
115 #include "platform_intrinsics.h"
127 template <
int Lanes,
class T>
136 struct svec<4,int8_t>;
138 struct svec<4,uint8_t>;
140 struct svec<4,int16_t>;
142 struct svec<4,uint16_t>;
144 struct svec<4,int32_t>;
146 struct svec<4,uint32_t>;
148 struct svec<4,int64_t>;
150 struct svec<4,uint64_t>;
152 struct svec<4,float>;
154 struct svec<4,double>;
156 struct svec<4,void*>;
184 __vector
unsigned int v;
205 __vector
unsigned int t = { a ? -1 : 0, b ? -1 : 0, c ? -1 : 0, d ? -1 : 0 };
215 if(__builtin_constant_p(a)){
216 v = (a!=0) ? vec_splat_s32(-1) : vec_splat_s32(0);
219 __vector
unsigned int t = { a ? -1 : 0, a ? -1 : 0, a ? -1 : 0, a ? -1 : 0 };
234 __vector
signed char v;
252 __vector
signed char t = {a,b,c,d,0,0,0,0,
261 if(__builtin_constant_p(a) && (a <= 15) && (a >= -16)){
265 __vector
signed char t = {a,a,a,a,0,0,0,0,
287 __vector
unsigned char v;
304 __vector
unsigned char t = {a,b,c,d,0,0,0,0,
314 if(__builtin_constant_p(a) && (a <= 15)){
318 __vector
unsigned char t = {a,a,a,a,0,0,0,0,
339 __vector
signed short v;
356 __vector
signed short t = {a,b,c,d, 0,0,0,0};
365 if(__builtin_constant_p(a) && (a <= 15) && (a >= -16)){
366 v = vec_splat_s16(a);
369 __vector
signed short t = {a,a,a,a, 0,0,0,0};
390 __vector
unsigned short v;
407 __vector
unsigned short t = {a,b,c,d, 0,0,0,0};
416 if(__builtin_constant_p(a) && (a <= 15)){
417 v = vec_splat_u16(a);
420 __vector
unsigned short t = {a,a,a,a, 0,0,0,0};
441 __vector
signed int v;
458 __vector
signed int t = {a,b,c,d};
467 if(__builtin_constant_p(a)){
468 if((a <= 15) && (a >= -16)) {
469 v = vec_splat_s32(a);
472 __vector
signed int t = {a,a,a,a};
480 __vector
signed int register x = vec_vsx_ld(0, p);
481 v = vec_splat_p7(x, 0);
501 __vector
unsigned int v;
518 __vector
unsigned int t = {a,b,c,d};
527 if(__builtin_constant_p(a)){
529 v = vec_splat_u32(a);
532 __vector
unsigned int t = {a,a,a,a};
540 __vector
unsigned int register x = vec_vsx_ld(0, p);
541 v = vec_splat_p7((__vector
signed)x, 0);
561 __vector
signed long long v[2];
580 __vector
signed long long t1 = {a,b};
581 __vector
signed long long t2 = {c,d};
591 if(__builtin_constant_p(a)){
593 if ((a >= -16l) && (a <= 15l)) {
594 const int iv = (int)a;
595 __vector
signed int x = {iv,iv,iv,iv};
596 __vector
signed long long t = vec_unpackh_p8(x);
601 __vector
signed long long r1 = (__vector
signed long long)vec_splat_s32(0);
604 __vector
long long x = {a,a};
609 __vector
unsigned long long r = vec_smear_i64_p8(a);
613 __vector
signed long long r = vec_smear_i64_p7((
long long*)p);
634 __vector
unsigned long long v[2];
653 __vector
unsigned long long t1 = {a,b};
654 __vector
unsigned long long t2 = {c,d};
// Compile-time-constant fast paths for broadcasting the scalar `a` into both
// halves (v[0] and v[1]) of the 4 x 64-bit vector. The constructor header and
// the preprocessor guards around these branches are not visible in this view.
664 if(__builtin_constant_p(a)){
// Small-constant path: build a 4 x int vector and widen it with vec_unpackh_p8.
// NOTE(review): `a >= 0ul` looks always-true if `a` is unsigned (as the
// unsigned long long storage suggests), leaving only the upper bound effective — confirm.
666 if ((a >= 0ul) && (a <= 31ul)) {
// NOTE(review): `(int)v` converts the member `v`, not the argument `a`. The
// signed 64-bit counterpart of this code uses `(int)a`; this looks like a
// typo that should read `(int)a` — confirm and fix.
667 const int iv = (int)v;
668 __vector
signed int x = {iv,iv,iv,iv};
669 __vector
unsigned long long t = vec_unpackh_p8(x);
// All-zero vector path (the condition guarding it is not visible here).
674 __vector
unsigned long long r1 = (__vector
unsigned long long)vec_splat_u32(0);
// NOTE(review): this parses as `(v[0] = v[1] = r1), r1;` — the trailing
// `, r1` is a comma-operator operand with no effect.
675 v[0] = v[1] = r1, r1;
677 __vector
unsigned long long x = {a,a};
682 __vector
unsigned long long r = vec_smear_i64_p8(a);
686 __vector
unsigned long long r = vec_smear_i64_p7((
long long*)p);
724 __vector
float t = {a,b,c,d};
733 if(__builtin_constant_p(a)){
735 v = (__vector float) vec_splat_s32(0);
738 p = 1.0; iv = (int)(p*a);
739 if (( (((
float)iv)/p) == a ) && (iv >= -16) && (iv <= 15)) {
740 v = vec_ctf(vec_splat_s32(iv),0);
742 p = 2.0; iv = (int)(p*a);
743 if (( (((
float)iv)/p) == a ) && (iv >= -16) && (iv <= 15)) {
744 v = vec_ctf(vec_splat_s32(iv),1);
746 p = 4.0; iv = (int)(p*a);
747 if (( (((
float)iv)/p) == a ) && (iv >= -16) && (iv <= 15)) {
748 v = vec_ctf(vec_splat_s32(iv),2);
751 __vector
float t = {a,a,a,a};
762 __vector
float register x = vec_vsx_ld(0, p);
763 v = vec_splat_p7(x, 0);
783 __vector
double v[2];
802 __vector
double t1 = {a,b};
803 __vector
double t2 = {c,d};
813 if(__builtin_constant_p(a)){
815 __vector
double r1 = (__vector double)vec_splat_s32(0);
818 __vector
double t = vec_smear_p7(a);
822 __vector
double t = vec_smear_p7(a);
851 #define INSERT_EXTRACT_OPT(STYPE) \
852 static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
853 return vec_extract(v.v, index); \
855 static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
856 (*v).v = vec_insert(val, v->v, index); \
859 #define INSERT_EXTRACT_OPT64(STYPE) \
860 static FORCEINLINE STYPE svec_extract(svec<LANES,STYPE> v, int index) { \
861 return vec_extract(v.v[index >> 1], index%2); \
863 static FORCEINLINE void svec_insert(svec<LANES,STYPE> *v, int index, STYPE val) { \
864 (*v).v[index >> 1] = vec_insert(val, v->v[index>>1], index%2); \
// Reads lane `index` of a 4-lane boolean mask with vec_extract and returns it
// as a uint32_t.
// NOTE(review): svec_insert stores set lanes as -1 (all bits set), so a set
// lane presumably reads back as 0xFFFFFFFF rather than 1 — confirm with callers.
867 static FORCEINLINE uint32_t svec_extract(svec<4,bool> v,
int index) {
868 return vec_extract(v.v, index);
// Overwrites lane `index` of the boolean mask pointed to by `v`.
// Any non-zero `val` is normalized to -1 (all bits set); zero is stored as 0.
870 static FORCEINLINE void svec_insert(svec<4,bool> *v,
int index, uint32_t val) {
871 (*v).v = vec_insert(val ? -1 : 0, (*v).v, index);
// Loads a 4-lane boolean mask from memory by reading *p as a single
// __vector unsigned int.
// NOTE(review): a direct vector dereference presumably requires `p` to be
// aligned for a vector type — confirm with callers.
917 static FORCEINLINE svec<4,bool> svec_load(
const svec<4,bool> *p) {
918 return *((__vector
unsigned int *)p);
// Stores a 4-lane boolean mask to memory by writing v.v through `p` viewed as
// a __vector unsigned int pointer.
// NOTE(review): like the matching load, this presumably assumes `p` is
// vector-aligned — confirm.
927 static FORCEINLINE void svec_store(svec<4,bool> *p, svec<4,bool> v) {
928 *((__vector
unsigned int*)p) = v.v;
// Loads a svec<4,int8_t> from `p` using vec_vsx_ld at byte offset 0.
// NOTE(review): the pointer is cast to `signed int*`, whereas the matching
// svec_store casts to `signed char*`. The load therefore yields a
// signed-int vector that must convert to the int8 vector type — presumably
// relying on lax vector conversions. Confirm this is intentional.
937 static FORCEINLINE svec<4,int8_t> svec_load(
const svec<4,int8_t> *p) {
938 return vec_vsx_ld(0, (
signed int*)p);
// Stores v.v to `p` with vec_vsx_st at byte offset 0, addressing the
// destination as signed char.
947 static FORCEINLINE void svec_store(svec<4,int8_t> *p, svec<4,int8_t> v) {
948 vec_vsx_st(v.v, 0, (
signed char*)p);
// Loads a svec<4,uint8_t> from `p` using vec_vsx_ld at byte offset 0.
// NOTE(review): the pointer is cast to `signed int*`, whereas the matching
// svec_store casts to `unsigned char*`. The loaded signed-int vector must
// convert to the uint8 vector type — presumably via lax vector conversions.
// Confirm this is intentional.
957 static FORCEINLINE svec<4,uint8_t> svec_load(
const svec<4,uint8_t> *p) {
958 return vec_vsx_ld(0, (
signed int*)p);
// Stores v.v to `p` with vec_vsx_st at byte offset 0, addressing the
// destination as unsigned char.
967 static FORCEINLINE void svec_store(svec<4,uint8_t> *p, svec<4,uint8_t> v) {
968 vec_vsx_st(v.v, 0, (
unsigned char*)p);
985 static FORCEINLINE svec<4,int32_t> svec_load(
const svec<4,int32_t> *p) {
986 return *((__vector
signed int *)p);
995 static FORCEINLINE void svec_store(svec<4,int32_t> *p, svec<4,int32_t> v) {
996 *((__vector
signed int*)p) = v.v;
1005 static FORCEINLINE svec<4,uint32_t> svec_load(
const svec<4,uint32_t> *p) {
1006 return *((__vector
unsigned int *)p);
1015 static FORCEINLINE void svec_store(svec<4,uint32_t> *p, svec<4,uint32_t> v) {
1016 *((__vector
unsigned int*)p) = v.v;
// Loads a svec<4,int64_t> from `p`. The storage is read as two consecutive
// __vector signed long long values (index 0 and index 1), which are then
// combined into the result.
1025 static FORCEINLINE svec<4,int64_t> svec_load(
const svec<4,int64_t> *p) {
1026 __vector
signed long long v0 = *(((__vector
signed long long *)p)+0);
1027 __vector
signed long long v1 = *(((__vector
signed long long *)p)+1);
1028 return svec<4,int64_t>(v0,v1);
// Stores a svec<4,int64_t> to `p` by writing its two vector halves, v.v[0] and
// v.v[1], to consecutive __vector signed long long slots.
1037 static FORCEINLINE void svec_store(svec<4,int64_t> *p, svec<4,int64_t> v) {
1038 *(((__vector
signed long long *)p)+0) = v.v[0];
1039 *(((__vector
signed long long *)p)+1) = v.v[1];
1048 static FORCEINLINE svec<4,uint64_t> svec_load(
const svec<4,uint64_t> *p) {
1049 __vector
unsigned long long v0 = *(((__vector
unsigned long long *)p)+0);
1050 __vector
unsigned long long v1 = *(((__vector
unsigned long long *)p)+1);
1051 return svec<4,uint64_t>(v0,v1);
1059 static FORCEINLINE void svec_store(svec<4,uint64_t> *p, svec<4,uint64_t> v) {
1060 *(((__vector
unsigned long long *)p)+0) = v.v[0];
1061 *(((__vector
unsigned long long *)p)+1) = v.v[1];
1070 static FORCEINLINE svec<4,float> svec_load(
const svec<4,float> *p) {
1071 return *((__vector
float *)p);
1081 static FORCEINLINE void svec_store(svec<4,float> *p, svec<4,float> v) {
1082 *((__vector
float*)p) = v.v;
// Loads a svec<4,double> from `p` as two __vector double values, using
// vec_vsx_ld on the first and second vector-sized slots.
// Other lines of this function are not visible in this view.
1092 static FORCEINLINE svec<4,double> svec_load(
const svec<4,double> *p) {
// First vector-sized slot.
1095 __vector
double v0 = vec_vsx_ld(0, ((__vector
double *)p));
// Second slot, one __vector double past the first.
1096 __vector
double v1 = vec_vsx_ld(0, ((__vector
double *)p)+1);
1099 return svec<4,double>(v0,v1);
// Stores a svec<4,double> to `p` by writing v.v[0] and v.v[1] with vec_vsx_st
// into two consecutive __vector double slots.
// Other lines of this function are not visible in this view.
1108 static FORCEINLINE void svec_store(svec<4,double> *p, svec<4,double> v) {
1111 vec_vsx_st(v.v[0], 0, (__vector
double *)p);
1112 vec_vsx_st(v.v[1], 0, (__vector
double *)p + 1);
1127 return vec_sel(b.
v, a.
v, mask.
v);
1135 __vector
unsigned int tsi=vec_splat_s32(0);
1136 __vector
unsigned char t = vec_pack(vec_pack(mask.
v,tsi),(vector
unsigned short)tsi);
1137 return vec_sel(b.v, a.v, t);
1145 __vector
unsigned int tsi=vec_splat_u32(0);
1146 __vector
unsigned char t = vec_pack(vec_pack(mask.
v,tsi),(vector
unsigned short)tsi);
1147 return vec_sel(b.v, a.v, t);
1156 int16_t v0 = mask[0] ? a[0] : b[0];
1157 int16_t v1 = mask[1] ? a[1] : b[1];
1158 int16_t v2 = mask[2] ? a[2] : b[2];
1159 int16_t v3 = mask[3] ? a[3] : b[3];
1169 uint16_t v0 = mask[0] ? a[0] : b[0];
1170 uint16_t v1 = mask[1] ? a[1] : b[1];
1171 uint16_t v2 = mask[2] ? a[2] : b[2];
1172 uint16_t v3 = mask[3] ? a[3] : b[3];
1181 return vec_sel(b.
v, a.
v, mask.
v);
1189 return vec_sel(b.
v, a.
v, mask.
v);
1199 __vector
signed long long t1 = vec_sel(b.
v[0],a.
v[0],vec_unpackh_p8(mask.
v));
1200 __vector
signed long long t2 = vec_sel(b.
v[1],a.
v[1],vec_unpackl_p8(mask.
v));
1205 int64_t v0 = mask[0] ? a[0] : b[0];
1206 int64_t v1 = mask[1] ? a[1] : b[1];
1207 int64_t v2 = mask[2] ? a[2] : b[2];
1208 int64_t v3 = mask[3] ? a[3] : b[3];
1220 __vector
unsigned long long t1 = vec_sel(b.
v[0],a.
v[0],vec_unpackh_p8(mask.
v));
1221 __vector
unsigned long long t2 = vec_sel(b.
v[1],a.
v[1],vec_unpackl_p8(mask.
v));
1226 uint64_t v0 = mask[0] ? a[0] : b[0];
1227 uint64_t v1 = mask[1] ? a[1] : b[1];
1228 uint64_t v2 = mask[2] ? a[2] : b[2];
1229 uint64_t v3 = mask[3] ? a[3] : b[3];
1239 return vec_sel(b.
v, a.
v, mask.
v);
1248 __vector
double t1 = vec_sel(b.
v[0],a.
v[0],vec_unpackh_p8(mask.
v));
1249 __vector
double t2 = vec_sel(b.
v[1],a.
v[1],vec_unpackl_p8(mask.
v));
1254 double v0 = mask[0] ? a[0] : b[0];
1255 double v1 = mask[1] ? a[1] : b[1];
1256 double v2 = mask[2] ? a[2] : b[2];
1257 double v3 = mask[3] ? a[3] : b[3];
1280 #define BROADCAST_OPT32(STYPE) \
1281 static FORCEINLINE svec<LANES,STYPE> svec_broadcast(svec<LANES,STYPE> v, const int index) { \
1282 if(__builtin_constant_p(index) && index >=0 && index < 4){ return svec<LANES,STYPE>(vec_splat_p7(v.v, index)); } \
1283 else { STYPE bval = v[index]; return svec<LANES,STYPE>(bval, bval, bval, bval); } \
1286 #define BROADCAST_OPT64(STYPE) \
1287 static FORCEINLINE svec<LANES,STYPE> svec_broadcast(svec<LANES,STYPE> v, const int index) { \
1288 if(__builtin_constant_p(index) && index >=0 && index < 4){ \
1289 __vector STYPE r = vec_splat_p7(v.v[index >> 1], index %2); \
1290 return svec<LANES,STYPE>(r, r); } \
1291 else { STYPE bval = v[index]; return svec<LANES,STYPE>(bval, bval, bval, bval); } \
1333 template <
class RetVecType>
static RetVecType svec_load_const(
const int8_t* p);
1339 template <
class RetVecType>
static RetVecType svec_load_const(
const uint8_t* p);
1345 template <
class RetVecType>
static RetVecType svec_load_const(
const int16_t* p);
1351 template <
class RetVecType>
static RetVecType svec_load_const(
const uint16_t* p);
1357 template <
class RetVecType>
static RetVecType svec_load_const(
const int32_t* p);
1363 template <
class RetVecType>
static RetVecType svec_load_const(
const uint32_t* p);
1369 template <
class RetVecType>
static RetVecType svec_load_const(
const int64_t* p);
1372 __vector
signed long long t= vec_smear_const_i64_p7((
const long long *)p);
1376 template <
class RetVecType>
static RetVecType svec_load_const(
const uint64_t* p);
1379 __vector
unsigned long long t= vec_smear_const_i64_p7((
const long long *)p);
1383 template <
class RetVecType>
static RetVecType svec_load_const(
const float* p);
1387 return vec_splat(*(__vector
float*)p, 0);
1390 template <
class RetVecType>
static RetVecType svec_load_const(
const double* p);
1393 __vector
double t= vec_smear_const_double_p7(p);
1399 template <
class RetVecType>
static RetVecType svec_load_and_splat(int8_t* p);
1407 template <
class RetVecType>
static RetVecType svec_load_and_splat(uint8_t* p);
1415 template <
class RetVecType>
static RetVecType svec_load_and_splat(int16_t* p);
1423 template <
class RetVecType>
static RetVecType svec_load_and_splat(uint16_t* p);
1431 template <
class RetVecType>
static RetVecType svec_load_and_splat(int32_t* p);
1435 return vec_smear_i32_p8(p);
1437 __vector
signed int register x = vec_vsx_ld(0, p);
1442 template <
class RetVecType>
static RetVecType svec_load_and_splat(uint32_t* p);
1446 return vec_smear_i32_p8(p);
1448 __vector
unsigned int register x = vec_vsx_ld(0, p);
1453 template <
class RetVecType>
static RetVecType svec_load_and_splat(int64_t* p);
1456 __vector
signed long long r = vec_smear_i64_p7((
signed long long*)p);
1460 template <
class RetVecType>
static RetVecType svec_load_and_splat(uint64_t* p);
1463 __vector
unsigned long long r = vec_smear_i64_p7((
unsigned long long*)p);
1467 template <
class RetVecType>
static RetVecType svec_load_and_splat(
float* p);
1471 return vec_smear_float_p8(p);
1473 __vector
float register x = vec_vsx_ld(0, p);
1478 template <
class RetVecType>
static RetVecType svec_load_and_splat(
double* p);
1481 __vector
double t= vec_smear_double_p7(p);
1501 struct svec<4,void*> :
public svec<4,uint64_t>{
1506 FORCEINLINE svec(
void* p0,
void* p1,
void* p2,
void* p3):
1507 svec<4,uint64_t>((uint64_t)(p0),(uint64_t)(p1),(uint64_t)(p2),(uint64_t)(p3)){}
1517 svec<4,uint32_t>((uint32_t)(p0),(uint32_t)(p1),(uint32_t)(p2),(uint32_t)(p3)){}
1521 #ifndef DOXYGEN_SHOULD_SKIP_THIS //not want generate svec_gather*/svec_scatter methods
// Primary template declarations for svec_gather. RetVecType selects the
// result vector type; `ptrs` carries one address per lane (as 32-bit or 64-bit
// integers) and `mask` is the per-lane boolean mask.
// The explicit specializations visible later for the 64-bit form forward to
// lGatherGeneral.
1523 template <
class RetVecType>
static RetVecType svec_gather(svec<4,uint32_t> ptrs, svec<4,bool> mask);
1524 template <
class RetVecType>
static RetVecType svec_gather(svec<4,uint64_t> ptrs, svec<4,bool> mask);
1541 FORCEINLINE svec<4,int32_t> svec_gather<svec<4,int32_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
1542 typedef svec<4,int32_t> RetVec;
1543 return lGatherGeneral<RetVec,int32_t,svec<4,uint64_t>,svec<4,bool> >(ptrs,mask);
1550 FORCEINLINE svec<4,uint32_t> svec_gather<svec<4,uint32_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
1551 typedef svec<4,uint32_t> RetVec;
1552 return lGatherGeneral<RetVec,uint32_t,svec<4,uint64_t>,svec<4,bool> >(ptrs,mask);
1562 FORCEINLINE svec<4,int64_t> svec_gather<svec<4,int64_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
1563 typedef svec<4,int64_t> RetVec;
1564 return lGatherGeneral<RetVec,int64_t, svec<4,uint64_t>,svec<4,bool> >(ptrs,mask);
1571 FORCEINLINE svec<4,uint64_t> svec_gather<svec<4,uint64_t> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
1572 typedef svec<4,uint64_t> RetVec;
1573 return lGatherGeneral<RetVec,uint64_t, svec<4,uint64_t>,svec<4,bool> >(ptrs,mask);
1581 FORCEINLINE svec<4,float> svec_gather<svec<4,float> >(svec<4,uint64_t> ptrs, svec<4,bool> mask) {
1582 typedef svec<4,float> RetVec;
1583 return lGatherGeneral<RetVec,float,svec<4,uint64_t>,svec<4,bool> >(ptrs,mask);
// Gathers four RetScalar elements from base pointer `p` at per-lane byte
// displacements of scale * offset, using vec_gather_p8.
// The return-type line of this definition is not visible in this view.
// NOTE(review): `mask` is not referenced in the visible lines, so all four
// lanes appear to be gathered unconditionally — confirm.
1596 template<
typename RetVec,
typename RetScalar,
typename OFF,
typename MSK>
1598 lGatherBaseOffsets32_32P8(
unsigned char *p, uint32_t scale,
1599 OFF offsets, MSK mask) {
// Pull the offsets vector out as two 64-bit doublewords.
1607 uint64_t doff1 = vec_extract_l(offsets.v);
1608 uint64_t doff2 = vec_extract_r(offsets.v);
// Split each doubleword: the upper 32 bits give lanes 0 and 2, the lower
// 32 bits give lanes 1 and 3.
// NOTE(review): offsets are held as uint32_t and `scale*oN` is a 32-bit
// unsigned product, so a negative signed offset (callers pass
// svec<4,int32_t>) would wrap into a large positive displacement rather than
// index backwards — confirm callers never pass negative offsets.
1610 uint32_t o1=(uint32_t) doff1;
1611 uint32_t o0=(uint32_t)(doff1 >> 32);
1612 uint32_t o3=(uint32_t) doff2;
1613 uint32_t o2=(uint32_t)(doff2 >> 32);
// Optional self-check comparing the extracted lanes with offsets[i]; only
// part of the condition is visible here.
1614 #ifdef CORRECTNESS_CHECK
1615 if(o0 != offsets[0] ||
1619 printf(
"Error while extracting for gather\n");
// One element address per lane, in lane order 0..3.
1622 return vec_gather_p8((RetScalar*)(p + (scale*o0)),
1623 (RetScalar*)(p+(scale*o1)),
1624 (RetScalar*)(p+(scale*o2)),
1625 (RetScalar*)(p+(scale*o3)) );
1629 template<
typename RetVec,
typename RetScalar,
typename OFF,
typename MSK>
1631 lGatherBaseOffsets32_64P8(
unsigned char *p, uint32_t scale,
1632 OFF offsets, MSK mask) {
1640 uint64_t doff1 = vec_extract_l(offsets.v);
1641 uint64_t doff2 = vec_extract_r(offsets.v);
1643 uint32_t o1=(uint32_t) doff1;
1644 uint32_t o0=(uint32_t)(doff1 >> 32);
1645 uint32_t o3=(uint32_t) doff2;
1646 uint32_t o2=(uint32_t)(doff2 >> 32);
1647 #ifdef CORRECTNESS_CHECK
1648 if(o0 != offsets[0] ||
1652 printf(
"Error while extracting for gather\n");
1655 return RetVec(vec_gather_p8((RetScalar*)(p + (scale*o0)),
1656 (RetScalar*)(p+(scale*o1))) ,
1657 vec_gather_p8((RetScalar*)(p+(scale*o2)),
1658 (RetScalar*)(p+(scale*o3))) );
1663 template<
typename RetVec,
typename RetScalar,
typename OFF,
typename MSK>
1665 lGatherBaseOffsets64_32P8(
unsigned char *p, uint32_t scale,
1666 OFF offsets, MSK mask) {
1674 uint64_t o0 = vec_extract_l(offsets.v[0]);
1675 uint64_t o1 = vec_extract_r(offsets.v[0]);
1676 uint64_t o2 = vec_extract_l(offsets.v[1]);
1677 uint64_t o3 = vec_extract_r(offsets.v[1]);
1679 #ifdef CORRECTNESS_CHECK
1680 if(o0 != offsets[0] ||
1684 printf(
"Error while extracting for gather\n");
1687 return vec_gather_p8((RetScalar*)(p+(scale*o0)),
1688 (RetScalar*)(p+(scale*o1)),
1689 (RetScalar*)(p+(scale*o2)),
1690 (RetScalar*)(p+(scale*o3)) );
1694 template<
typename RetVec,
typename RetScalar,
typename OFF,
typename MSK>
1696 lGatherBaseOffsets64_64P8(
unsigned char *p, uint32_t scale,
1697 OFF offsets, MSK mask) {
1705 uint64_t o0 = vec_extract_l(offsets.v[0]);
1706 uint64_t o1 = vec_extract_r(offsets.v[0]);
1707 uint64_t o2 = vec_extract_l(offsets.v[1]);
1708 uint64_t o3 = vec_extract_r(offsets.v[1]);
1710 #ifdef CORRECTNESS_CHECK
1711 if(o0 != offsets[0] ||
1715 printf(
"Error while extracting for gather\n");
1718 return RetVec(vec_gather_p8((RetScalar*)(p + (scale*o0)),
1719 (RetScalar*)(p+(scale*o1))) ,
1720 vec_gather_p8((RetScalar*)(p+(scale*o2)),
1721 (RetScalar*)(p+(scale*o3))) );
1725 #endif //endif __POWER8
1739 svec_gather_base_offsets(int32_t *b, uint32_t scale, svec<4,int32_t> offsets, svec<4,bool> mask){
1741 return lGatherBaseOffsets32_32P8<svec<4,int32_t>,int32_t,svec<4,int32_t>,svec<4,bool> >((uint8_t*)b,scale,offsets,mask);
1743 return lGatherBaseOffsets<svec<4,int32_t>, int32_t, svec<4,int32_t>,svec<4,bool> >((uint8_t*)b,scale,offsets,mask);
1749 svec_gather_base_offsets(int32_t* b, uint32_t scale, svec<4,int64_t> offsets, svec<4,bool> mask){
1750 uint8_t *p = (uint8_t*)b;
1751 typedef svec<4,int32_t> RetVec;
1753 RetVec r1=lGatherBaseOffsets64_32P8<svec<4,int32_t>,int32_t,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
1756 return lGatherBaseOffsets<svec<4,int32_t>, int32_t,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
1762 svec_gather_base_offsets(uint32_t *b, uint32_t scale, svec<4,int32_t> offsets, svec<4,bool> mask){
1764 return lGatherBaseOffsets32_32P8<svec<4,uint32_t>,uint32_t,svec<4,int32_t>,svec<4,bool> >((uint8_t*)b,scale,offsets,mask);
1766 return lGatherBaseOffsets<svec<4,uint32_t>, uint32_t, svec<4,int32_t>,svec<4,bool> >((uint8_t*)b,scale,offsets,mask);
1772 svec_gather_base_offsets(uint32_t* b, uint32_t scale, svec<4,int64_t> offsets, svec<4,bool> mask){
1773 uint8_t *p = (uint8_t*)b;
1774 typedef svec<4,uint32_t> RetVec;
1776 RetVec r1=lGatherBaseOffsets64_32P8<svec<4,uint32_t>,uint32_t,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
1779 return lGatherBaseOffsets<svec<4,uint32_t>, uint32_t,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
1785 svec_gather_base_offsets(int64_t *b, uint32_t scale, svec<4,int32_t> offsets,svec<4,bool> mask){
1786 uint8_t *p = (uint8_t *)b;
1787 typedef svec<4,int64_t> RetVec;
1789 svec<4,int64_t> r2 = lGatherBaseOffsets32_64P8<RetVec,int64_t,svec<4,int32_t>,svec<4,bool> >(p,scale,offsets,mask);
1792 return lGatherBaseOffsets<RetVec, int64_t, svec<4,int32_t>,svec<4,bool> >((uint8_t*)p,scale,offsets,mask);
1800 svec_gather_base_offsets(uint64_t *b, uint32_t scale, svec<4,int32_t> offsets,svec<4,bool> mask){
1801 uint8_t *p = (uint8_t *)b;
1802 typedef svec<4,uint64_t> RetVec;
1804 svec<4,uint64_t> r2 = lGatherBaseOffsets32_64P8<RetVec,uint64_t,svec<4,int32_t>,svec<4,bool> >(p,scale,offsets,mask);
1807 return lGatherBaseOffsets<svec<4,uint64_t>,uint64_t, svec<4,int32_t>,svec<4,bool> >((uint8_t*)p,scale,offsets,mask);
1815 svec_gather_base_offsets(
float *b, uint32_t scale, svec<4,int32_t> offsets, svec<4,bool> mask){
1816 uint8_t *p = (uint8_t*)b;
1818 return lGatherBaseOffsets32_32P8<svec<4,float>,float,svec<4,int32_t>,svec<4,bool> >(p,scale,offsets,mask);
1820 return lGatherBaseOffsets<svec<4,float>,float, svec<4,int32_t>,svec<4,bool> >((uint8_t*)p,scale,offsets,mask);
1826 svec_gather_base_offsets(
float* b, uint32_t scale, svec<4,int64_t> offsets, svec<4,bool> mask){
1827 uint8_t *p = (uint8_t*)b;
1829 typedef svec<4,float> RetVec;
1830 RetVec r1=lGatherBaseOffsets64_32P8<RetVec,float,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
1833 return lGatherBaseOffsets<svec<4,float>,float,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
1840 svec_gather_base_offsets(
double* b, uint32_t scale, svec<4,int32_t> offsets, svec<4,bool> mask){
1841 typedef svec<4,double> RetVec;
1842 uint8_t* p = (uint8_t*)b;
1844 svec<4,double> r2 = lGatherBaseOffsets32_64P8<RetVec,double,svec<4,int32_t>,svec<4,bool> >(p,scale,offsets,mask);
1847 return lGatherBaseOffsets<svec<4,double>,double,svec<4,int32_t>,svec<4,bool> >(p,scale,offsets,mask);
1853 svec_gather_base_offsets(
double* b, uint32_t scale, svec<4,int64_t> offsets, svec<4,bool> mask){
1854 uint8_t *p = (uint8_t*)b;
1855 typedef svec<4,double> RetVec;
1857 RetVec r1=lGatherBaseOffsets64_64P8<RetVec,double,svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
1860 return lGatherBaseOffsets<svec<4,double>, double, svec<4,int64_t>,svec<4,bool> >(p,scale,offsets,mask);
// Scatters the four 32-bit-sized lanes of `val` to the four 64-bit addresses
// held in `ptrs`, using the vec_scatter_step_* helpers.
// Lines between the mask extraction and the stores are not visible in this
// view.
1866 template<
typename STYPE,
typename PTRTYPE,
typename VTYPE>
1867 static FORCEINLINE void lScatter64_32(PTRTYPE ptrs,
1868 VTYPE val, svec<4,bool> mask) {
// Per-lane destination addresses: two from each 128-bit half of ptrs.
1870 uint64_t p0 = vec_extract_l(ptrs.v[0]);
1871 uint64_t p1 = vec_extract_r(ptrs.v[0]);
1872 uint64_t p2 = vec_extract_l(ptrs.v[1]);
1873 uint64_t p3 = vec_extract_r(ptrs.v[1]);
// Pull the mask out as two doublewords, then split it into four 32-bit lane
// flags.
1876 uint64_t doff1 = vec_extract_l(mask.v);
1877 uint64_t doff2 = vec_extract_r(mask.v);
// Upper 32 bits give lanes 0 and 2; lower 32 bits give lanes 1 and 3.
1879 uint32_t m1=(uint32_t) doff1;
1880 uint32_t m0=(uint32_t)(doff1 >> 32);
1881 uint32_t m3=(uint32_t) doff2;
1882 uint32_t m2=(uint32_t)(doff2 >> 32);
// One store per lane. The step suffixes used are 12, 0, 4, 8 for lanes 0..3.
// NOTE(review): m0..m3 are not referenced in the visible lines; presumably
// each store is guarded by its lane flag on lines not shown — confirm.
1902 vec_scatter_step_12((STYPE*)p0, val.v);
1904 vec_scatter_step_0((STYPE*)p1, val.v);
1906 vec_scatter_step_4((STYPE*)p2, val.v);
1908 vec_scatter_step_8((STYPE*)p3, val.v);
1960 static FORCEINLINE void svec_scatter(svec<4,uint64_t> ptrs, svec<4,int32_t> val, svec<4,bool> mask) {
1962 lScatter64_32<int32_t, svec<4,uint64_t>, svec<4,int32_t> >(ptrs,val,mask);
1964 lScatterGeneral<int32_t, svec<4,uint64_t>, svec<4,int32_t>, svec<4,bool> >(ptrs,val,mask);
1971 static FORCEINLINE void svec_scatter(svec<4,uint64_t> ptrs, svec<4,uint32_t> val, svec<4,bool> mask) {
1973 lScatter64_32<uint32_t, svec<4,uint64_t>, svec<4,uint32_t> >(ptrs,val,mask);
1975 lScatterGeneral<uint32_t, svec<4,uint64_t>, svec<4,uint32_t>, svec<4,bool> >(ptrs,val,mask);
1986 static FORCEINLINE void svec_scatter (svec<4,uint64_t> ptrs,svec<4,float> val,svec<4,bool> mask) {
1988 lScatter64_32<float, svec<4,uint64_t>, svec<4,float> >(ptrs,val,mask);
1990 lScatterGeneral<float, svec<4,uint64_t>, svec<4,float>, svec<4,bool> >(ptrs,val,mask);
1998 template<
typename STYPE,
typename OTYPE,
typename VTYPE>
1999 static FORCEINLINE void lScatterBaseOffsets32_32(
unsigned char *b,
2000 uint32_t scale, OTYPE offsets,
2001 VTYPE val, svec<4,bool> mask) {
2003 unsigned char *base = b;
2005 uint64_t doff1 = vec_extract_l(offsets.v);
2006 uint64_t doff2 = vec_extract_r(offsets.v);
2008 uint32_t o1=(uint32_t) doff1;
2009 uint32_t o0=(uint32_t)(doff1 >> 32);
2010 uint32_t o3=(uint32_t) doff2;
2011 uint32_t o2=(uint32_t)(doff2 >> 32);
2014 doff1 = vec_extract_l(mask.v);
2015 doff2 = vec_extract_r(mask.v);
2017 uint32_t m1=(uint32_t) doff1;
2018 uint32_t m0=(uint32_t)(doff1 >> 32);
2019 uint32_t m3=(uint32_t) doff2;
2020 uint32_t m2=(uint32_t)(doff2 >> 32);
2039 STYPE *ptr0 = (STYPE *)(base + scale * o0);
2040 STYPE *ptr1 = (STYPE *)(base + scale * o1);
2041 STYPE *ptr2 = (STYPE *)(base + scale * o2);
2042 STYPE *ptr3 = (STYPE *)(base + scale * o3);
2045 vec_scatter_step_12(ptr0, val.v);
2047 vec_scatter_step_0(ptr1, val.v);
2049 vec_scatter_step_4(ptr2, val.v);
2051 vec_scatter_step_8(ptr3, val.v);
2055 template<
typename STYPE,
typename OTYPE,
typename VTYPE>
2056 static FORCEINLINE void lScatterBaseOffsets64_32(
unsigned char *b,
2057 uint32_t scale, OTYPE offsets,
2058 VTYPE val, svec<4,bool> mask) {
2060 unsigned char *base = b;
2062 uint64_t o0 = vec_extract_l(offsets.v[0]);
2063 uint64_t o1 = vec_extract_r(offsets.v[0]);
2064 uint64_t o2 = vec_extract_l(offsets.v[1]);
2065 uint64_t o3 = vec_extract_r(offsets.v[1]);
2068 uint64_t doff1 = vec_extract_l(mask.v);
2069 uint64_t doff2 = vec_extract_r(mask.v);
2071 uint32_t m1=(uint32_t) doff1;
2072 uint32_t m0=(uint32_t)(doff1 >> 32);
2073 uint32_t m3=(uint32_t) doff2;
2074 uint32_t m2=(uint32_t)(doff2 >> 32);
2093 STYPE *ptr0 = (STYPE *)(base + scale * o0);
2094 STYPE *ptr1 = (STYPE *)(base + scale * o1);
2095 STYPE *ptr2 = (STYPE *)(base + scale * o2);
2096 STYPE *ptr3 = (STYPE *)(base + scale * o3);
2099 vec_scatter_step_12(ptr0, val.v);
2101 vec_scatter_step_0(ptr1, val.v);
2103 vec_scatter_step_4(ptr2, val.v);
2105 vec_scatter_step_8(ptr3, val.v);
2122 svec_scatter_base_offsets(int32_t* p, uint32_t scale, svec<4,int32_t> offsets,
2123 svec<4,int32_t> val, svec<4,bool> mask){
2124 uint8_t* b = (uint8_t*) p;
2126 lScatterBaseOffsets32_32<int32_t, svec<4,int32_t>, svec<4,int32_t> >(b,scale,offsets,val,mask);
2128 lScatterBaseOffsets<int32_t, svec<4,int32_t>, svec<4,int32_t> >(b,scale,offsets,val,mask);
2135 svec_scatter_base_offsets(int32_t* p, uint32_t scale, svec<4,int64_t> offsets,
2136 svec<4,int32_t> val, svec<4,bool> mask){
2137 uint8_t* b = (uint8_t*) p;
2139 lScatterBaseOffsets64_32<int32_t, svec<4,int64_t>, svec<4,int32_t> >(b,scale,offsets,val,mask);
2141 lScatterBaseOffsets<int32_t,svec<4,int64_t>, svec<4,int32_t> >(b,scale,offsets,val,mask);
2147 svec_scatter_base_offsets(uint32_t* p, uint32_t scale, svec<4,int32_t> offsets,
2148 svec<4,uint32_t> val, svec<4,bool> mask){
2149 uint8_t* b = (uint8_t*) p;
2151 lScatterBaseOffsets32_32<uint32_t, svec<4,int32_t>, svec<4,uint32_t> >(b,scale,offsets,val,mask);
2153 lScatterBaseOffsets<uint32_t, svec<4,int32_t>, svec<4,uint32_t> >(b,scale,offsets,val,mask);
2159 svec_scatter_base_offsets(uint32_t* p, uint32_t scale, svec<4,int64_t> offsets,
2160 svec<4,uint32_t> val, svec<4,bool> mask){
2161 uint8_t* b = (uint8_t*) p;
2163 lScatterBaseOffsets64_32<uint32_t, svec<4,int64_t>, svec<4,uint32_t> >(b,scale,offsets,val,mask);
2165 lScatterBaseOffsets<uint32_t,svec<4,int64_t>, svec<4,uint32_t> >(b,scale,offsets,val,mask);
2176 svec_scatter_base_offsets(
float* p, uint32_t scale, svec<4,int32_t> offsets,
2177 svec<4,float> val,svec<4,bool> mask){
2178 uint8_t* b = (uint8_t*)p;
2180 lScatterBaseOffsets32_32<float, svec<4,int32_t>, svec<4,float> >(b,scale,offsets,val,mask);
2182 lScatterBaseOffsets<float, svec<4,int32_t>, svec<4,float> >(b,scale,offsets,val,mask);
2188 svec_scatter_base_offsets(
float* p,uint32_t scale, svec<4,int64_t> offsets,
2189 svec<4,float> val, svec<4,bool> mask){
2190 uint8_t* b = (uint8_t*)p;
2192 lScatterBaseOffsets64_32<float, svec<4,int64_t>, svec<4,float> >(b,scale,offsets,val,mask);
2194 lScatterBaseOffsets<float, svec<4,int64_t>, svec<4,float> >(b,scale,offsets,val,mask);
2203 svec_scatter_base_offsets(
double* p, uint32_t scale, svec<4,int64_t> offsets,
2204 svec<4,double> val, svec<4,bool> mask){
2205 uint8_t* b = (uint8_t*)p;
2206 lScatterBaseOffsets<double, svec<4,int64_t>, svec<4,double> >(b,scale,offsets,val,mask);
2230 #endif //DOXYGEN_SHOULD_SKIP_THIS
// Returns true when at least one lane of `mask` is non-zero: vec_any_ne
// compares mask.v against an all-zero vector.
2261 static FORCEINLINE bool svec_any_true(
const svec<4,bool>& mask) {
2262 return vec_any_ne(mask.v, vec_splat_u32(0));
// Returns true when every lane of `mask` is non-zero: vec_all_ne compares
// mask.v against an all-zero vector.
2270 static FORCEINLINE bool svec_all_true(
const svec<4,bool>& mask) {
2271 return vec_all_ne(mask.v, vec_splat_u32(0));
// Returns true when every lane of `mask` is zero: vec_all_eq compares mask.v
// against an all-zero vector.
2280 static FORCEINLINE bool svec_none_true(
const svec<4,bool>& mask) {
2281 return vec_all_eq(mask.v, vec_splat_u32(0));
2289 static FORCEINLINE svec<4,bool> svec_and(svec<4,bool> a, svec<4,bool> b) {
2297 static FORCEINLINE svec<4,bool> svec_or(svec<4,bool> a, svec<4,bool> b) {
2304 static FORCEINLINE svec<4,bool> svec_xor(svec<4,bool> a, svec<4,bool> b) {
2311 static FORCEINLINE svec<4,bool> svec_not(svec<4,bool> a) {
2324 #define UNARY_OP_OPT(STYPE, NAME, OP)\
2325 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a) { \
2332 #define UNARY_OP_OPT64(STYPE, NAME, OP)\
2333 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a) { \
2334 return svec<LANES,STYPE>(OP(a.v[0]), OP(a.v[1])); \
// Per-lane reciprocal of a 4 x float vector: the vec_re estimate followed by
// one refinement step.
2360 static FORCEINLINE svec<4,float> svec_rcp(svec<4,float> v) {
// Initial reciprocal estimate.
2362 __vector
float estimate = vec_re( v.v );
// Refinement: vec_nmsub(e, v, 1) yields 1 - e*v, so
// r = e + e*(1 - e*v), a single Newton-Raphson iteration for 1/v.
2364 __vector
float r = vec_madd( vec_nmsub(estimate, v.v, (__vector
float){1.0,1.0,1.0,1.0} ), estimate, estimate);
2365 return svec<4,float>(r);
// Per-lane reciprocal square root of a 4 x float vector: the vec_rsqrte
// estimate followed by one refinement step.
2370 static FORCEINLINE svec<4,float> svec_rsqrt(svec<4,float> v) {
// Constant vectors used by the fused multiply-adds below.
2373 __vector
float zero = (__vector float){0,0,0,0};
2374 __vector
float oneHalf = (__vector float){0.5,0.5,0.5,0.5};
2375 __vector
float one = (__vector float){1.0,1.0,1.0,1.0};
// Initial estimate e of 1/sqrt(v).
2376 __vector
float estimate = vec_rsqrte( v.v );
// e*e and e/2, each computed as a multiply-add with a zero addend.
2378 __vector
float estimateSquared = vec_madd( estimate, estimate, zero );
2379 __vector
float halfEstimate = vec_madd( estimate, oneHalf, zero );
// r = e + (e/2)*(1 - v*e*e): a single Newton-Raphson iteration.
2380 __vector
float r = vec_madd( vec_nmsub( v.v, estimateSquared, one ), halfEstimate, estimate );
2381 return svec<4,float>(r);
// Per-lane square root computed as v * rsqrt(v), via a multiply-add with a
// zero addend.
// NOTE(review): for a lane equal to 0 the reciprocal square root is
// presumably infinite, which would make 0 * inf produce NaN rather than 0 —
// confirm whether zero inputs need special handling.
2387 static FORCEINLINE svec<4,float> svec_sqrt(svec<4,float> v) {
2388 __vector
float r = vec_madd( v.v, svec_rsqrt(v).v, (__vector
float){0,0,0,0} );
2389 return svec<4,float>(r);
// Per-lane exponential estimate, returned directly from vec_expte.
// NOTE(review): vec_expte is the AltiVec 2**x estimate. svec_log rescales its
// base-2 estimate by log(2), but no corresponding scaling of the input is
// visible here, so this appears to compute 2**x rather than e**x — confirm the
// intended base.
2395 static FORCEINLINE svec<4,float> svec_exp(svec<4,float> v) {
2396 return vec_expte(v.v);
// Per-lane logarithm: the vec_loge estimate (a base-2 logarithm estimate in
// AltiVec) multiplied by log(2), which rescales it to a natural logarithm.
// The `*` between svec<4,float> and the scalar is an operator defined outside
// this view.
2402 static FORCEINLINE svec<4,float> svec_log(svec<4,float> v) {
2403 return svec<4,float>(vec_loge(v.v)) * log(2);
2408 static FORCEINLINE svec<4,uint8_t> svec_abs(svec<4,uint8_t> v) {
return v;}
2410 static FORCEINLINE svec<4,uint16_t> svec_abs(svec<4,uint16_t> v) {
return v;}
2412 static FORCEINLINE svec<4,uint32_t> svec_abs(svec<4,uint32_t> v) {
return v;}
2414 static FORCEINLINE svec<4,uint64_t> svec_abs(svec<4,uint64_t> v) {
return v;}
2427 #define BINARY_OP_OPT(STYPE, NAME, OP) \
2428 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
2429 return svec<LANES,STYPE>(a.v OP b.v); \
2432 #define BINARY_OP_OPT64(STYPE, NAME, OP) \
2433 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE> b) { \
2434 return svec<LANES,STYPE>(a.v[0] OP b.v[0], a.v[1] OP b.v[1]); \
2437 #define BINARY_OP_OPT_FUNC(STYPE, STYPE2, NAME, FUNC) \
2438 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE2> b) { \
2439 return svec<LANES,STYPE>(FUNC(a.v, b.v)); \
2442 #define BINARY_OP_OPT_FUNC64(STYPE, STYPE2, NAME, FUNC) \
2443 static FORCEINLINE svec<LANES,STYPE> NAME(svec<LANES,STYPE> a, svec<LANES,STYPE2> b) { \
2444 return svec<LANES,STYPE>(FUNC(a.v[0], b.v[0]), FUNC(a.v[1], b.v[1])); \
2452 static FORCEINLINE svec<4,int8_t> svec_add (svec<4,int8_t> a, svec<4,int8_t> b) {
2453 return vec_add(a.v,b.v);
2456 static FORCEINLINE svec<4,uint8_t> svec_add(svec<4,uint8_t> a, svec<4,uint8_t> b) {
2457 return vec_add(a.v,b.v);
2460 static FORCEINLINE svec<4,int16_t> svec_add (svec<4,int16_t> a, svec<4,int16_t> b) {
2461 return vec_add(a.v,b.v);
2464 static FORCEINLINE svec<4,uint16_t> svec_add(svec<4,uint16_t> a, svec<4,uint16_t> b) {
2465 return vec_add(a.v,b.v);
2468 static FORCEINLINE svec<4,int32_t> svec_add (svec<4,int32_t> a, svec<4,int32_t> b) {
2469 return vec_add(a.v,b.v);
2472 static FORCEINLINE svec<4,uint32_t> svec_add(svec<4,uint32_t> a, svec<4,uint32_t> b) {
2473 return vec_add(a.v,b.v);
2476 static FORCEINLINE svec<4,int64_t> svec_add (svec<4,int64_t> a, svec<4,int64_t> b) {
2478 return svec<4,int64_t>(vec_add_p8(a.v[0],b.v[0]),vec_add_p8(a.v[1],b.v[1]) );
2480 return svec<4,int64_t>(a.v[0] + b.v[0], a.v[1] + b.v[1]);
2484 static FORCEINLINE svec<4,uint64_t> svec_add(svec<4,uint64_t> a, svec<4,uint64_t> b) {
2486 return svec<4,uint64_t>(vec_add_p8(a.v[0],b.v[0]),vec_add_p8(a.v[1],b.v[1]) );
2488 return svec<4,uint64_t>(a.v[0] + b.v[0], a.v[1] + b.v[1]);
2492 static FORCEINLINE svec<4,float> svec_add (svec<4,float> a, svec<4,float> b) {
2493 return vec_add(a.v,b.v);
2496 static FORCEINLINE svec<4,double> svec_add(svec<4,double> a, svec<4,double> b) {
2497 return svec<4,double>(a.v[0] + b.v[0], a.v[1] + b.v[1]);
2501 static FORCEINLINE svec<4,int8_t> svec_sub (svec<4,int8_t> a, svec<4,int8_t> b) {
2502 return vec_sub(a.v,b.v);
2505 static FORCEINLINE svec<4,uint8_t> svec_sub(svec<4,uint8_t> a, svec<4,uint8_t> b) {
2506 return vec_sub(a.v,b.v);
2509 static FORCEINLINE svec<4,int16_t> svec_sub (svec<4,int16_t> a, svec<4,int16_t> b) {
2510 return vec_sub(a.v,b.v);
2513 static FORCEINLINE svec<4,uint16_t> svec_sub(svec<4,uint16_t> a, svec<4,uint16_t> b) {
2514 return vec_sub(a.v,b.v);
2517 static FORCEINLINE svec<4,int32_t> svec_sub (svec<4,int32_t> a, svec<4,int32_t> b) {
2518 return vec_sub(a.v,b.v);
2521 static FORCEINLINE svec<4,uint32_t> svec_sub(svec<4,uint32_t> a, svec<4,uint32_t> b) {
2522 return vec_sub(a.v,b.v);
2525 static FORCEINLINE svec<4,int64_t> svec_sub (svec<4,int64_t> a, svec<4,int64_t> b) {
2527 return svec<4,int64_t>(vec_sub_p8(a.v[0],b.v[0]),vec_sub_p8(a.v[1],b.v[1]) );
2529 return svec<4,int64_t>(a.v[0] - b.v[0], a.v[1] - b.v[1]);
2533 static FORCEINLINE svec<4,uint64_t> svec_sub(svec<4,uint64_t> a, svec<4,uint64_t> b) {
2535 return svec<4,uint64_t>(vec_sub_p8(a.v[0],b.v[0]),vec_sub_p8(a.v[1],b.v[1]) );
2537 return svec<4,uint64_t>(a.v[0] - b.v[0], a.v[1] - b.v[1]);
2541 static FORCEINLINE svec<4,float> svec_sub (svec<4,float> a, svec<4,float> b) {
2542 return vec_sub(a.v,b.v);
2545 static FORCEINLINE svec<4,double> svec_sub(svec<4,double> a, svec<4,double> b) {
2546 return svec<4,double>(a.v[0] - b.v[0], a.v[1] - b.v[1]);
2552 static FORCEINLINE svec<4,int8_t> svec_mul (svec<4,int8_t> a, svec<4,int8_t> b) {
2556 static FORCEINLINE svec<4,uint8_t> svec_mul(svec<4,uint8_t> a, svec<4,uint8_t> b) {
2560 static FORCEINLINE svec<4,int16_t> svec_mul (svec<4,int16_t> a, svec<4,int16_t> b) {
2564 static FORCEINLINE svec<4,uint16_t> svec_mul(svec<4,uint16_t> a, svec<4,uint16_t> b) {
// Per-lane 32-bit integer multiply for svec<4,int32_t>.
// Several alternative implementations are visible below; the preprocessor
// conditions that select among them are not shown in this view.
2568 static FORCEINLINE svec<4,int32_t> svec_mul (svec<4,int32_t> a, svec<4,int32_t> b) {
// Path using vec_mul_p8 on the operands reinterpreted as unsigned int; the
// result is cast back to signed int.
2570 return ((__vector
signed int)vec_mul_p8((vector
unsigned int)a.v,(vector
unsigned int)b.v));
// Alternative using a single vec_mulo on the operands viewed as signed shorts.
2573 return vec_mulo((__vector
signed short)a.v, (__vector
signed short)(b.v));
// Fallback built from 16-bit multiplies.
2576 __vector
unsigned int bSwapped, BD, AD_plus_BC;
// Shift/rotate count of 16. Splat immediates are limited to -16..15, so -16 is
// used; presumably vec_rl/vec_sl consume only the low five bits (= 16).
2577 __vector
unsigned int sixteen = vec_splat_u32(-16 );
2578 __vector
unsigned int zero = vec_splat_u32(0);
// Swap the 16-bit halves of every 32-bit lane of b.
2579 bSwapped = vec_rl( b.v, sixteen );
// Product of one pair of 16-bit halves per lane (presumably the low halves,
// as the name BD suggests).
2581 BD = vec_mulo( (__vector
unsigned short) a.v, (__vector
unsigned short) b.v );
// Sum of the two cross products per lane: a's halves times b's swapped halves.
2582 AD_plus_BC = vec_msum( (__vector
unsigned short) a.v, (__vector
unsigned short) bSwapped, zero );
// Move the cross terms into the upper half before combining.
2585 AD_plus_BC = vec_sl( AD_plus_BC, sixteen );
// (cross << 16) + BD per lane.
2588 return vec_add( AD_plus_BC, BD );
/**
 * @brief Multiplies two svec<4,uint32_t> values.
 *
 * NOTE(review): two alternative return paths are listed (vec_mul_p8 and a
 * hand-built product from 16-bit pieces). Presumably a preprocessor
 * conditional that is not visible in this listing selects between them - confirm.
 *
 * The vec_mul_p8 result is cast to `__vector unsigned int`, the storage type
 * of svec<4,uint32_t>, so the return does not rely on an implicit
 * signed-to-unsigned vector conversion.
 *
 * The hand-built path works on unsigned-short views of the operands:
 * vec_mulo yields BD, vec_msum of a against b rotated by `sixteen` yields
 * AD_plus_BC, which is shifted left by `sixteen` and added to BD.
 */
2592 static FORCEINLINE svec<4,uint32_t> svec_mul(svec<4,uint32_t> a, svec<4,uint32_t> b) {
2594 return ((__vector
unsigned int)vec_mul_p8((vector
unsigned int)a.v,(vector
unsigned int)b.v));
2598 __vector
unsigned int bSwapped, BD, AD_plus_BC;
// The count is splatted from the literal -16; presumably the rotate/shift
// intrinsics use only the low bits of each count so it acts as 16 - TODO confirm.
2599 __vector
unsigned int sixteen = vec_splat_u32(-16 );
2600 __vector
unsigned int zero = vec_splat_u32(0);
2601 bSwapped = vec_rl( b.v, sixteen );
2603 BD = vec_mulo( (__vector
unsigned short) a.v, (__vector
unsigned short) b.v );
2604 AD_plus_BC = vec_msum( (__vector
unsigned short) a.v, (__vector
unsigned short) bSwapped, zero );
2607 AD_plus_BC = vec_sl( AD_plus_BC, sixteen );
2610 return vec_add( AD_plus_BC, BD );
/** @brief Multiplies two svec<4,int64_t> values by multiplying the matching halves (v[0], v[1]) of the storage. */
2614 static FORCEINLINE svec<4,int64_t> svec_mul (svec<4,int64_t> a, svec<4,int64_t> b) {
2615 return svec<4,int64_t>(a.v[0] * b.v[0], a.v[1] * b.v[1]);
/** @brief Multiplies two svec<4,uint64_t> values by multiplying the matching halves (v[0], v[1]) of the storage. */
2618 static FORCEINLINE svec<4,uint64_t> svec_mul(svec<4,uint64_t> a, svec<4,uint64_t> b) {
2619 return svec<4,uint64_t>(a.v[0] * b.v[0], a.v[1] * b.v[1]);
/** @brief Multiplies two svec<4,float> values by applying vec_mul to the operands' vector storage. */
2622 static FORCEINLINE svec<4,float> svec_mul (svec<4,float> a, svec<4,float> b) {
2623 return vec_mul(a.v,b.v);
/** @brief Multiplies two svec<4,double> values by multiplying the matching halves (v[0], v[1]) of the storage. */
2626 static FORCEINLINE svec<4,double> svec_mul(svec<4,double> a, svec<4,double> b) {
2627 return svec<4,double>(a.v[0] * b.v[0], a.v[1] * b.v[1]);
/**
 * @brief Generates the vector/scalar arithmetic wrappers for element type STYPE.
 *
 * For each of add, sub, mul and div it defines svec_<op>_scalar(a, s) and
 * svec_scalar_<op>(s, a). Each wrapper constructs svec<LANES,STYPE>(s) from
 * the scalar and forwards to the vector-vector svec_<op>, keeping the
 * operand order given in the wrapper's name.
 */
2677 #define BIN_VEC_SCAL(STYPE) \
2678 static FORCEINLINE svec<LANES,STYPE> svec_add_scalar(svec<LANES,STYPE> a, STYPE s) { \
2679 return svec_add(a, svec<LANES,STYPE>(s)); \
2681 static FORCEINLINE svec<LANES,STYPE> svec_scalar_add(STYPE s, svec<LANES,STYPE> a) { \
2682 return svec_add(svec<LANES,STYPE>(s), a); \
2684 static FORCEINLINE svec<LANES,STYPE> svec_sub_scalar(svec<LANES,STYPE> a, STYPE s) { \
2685 return svec_sub(a, svec<LANES,STYPE>(s)); \
2687 static FORCEINLINE svec<LANES,STYPE> svec_scalar_sub(STYPE s, svec<LANES,STYPE> a) { \
2688 return svec_sub(svec<LANES,STYPE>(s), a); \
2690 static FORCEINLINE svec<LANES,STYPE> svec_mul_scalar(svec<LANES,STYPE> a, STYPE s) { \
2691 return svec_mul(a, svec<LANES,STYPE>(s)); \
2693 static FORCEINLINE svec<LANES,STYPE> svec_scalar_mul(STYPE s, svec<LANES,STYPE> a) { \
2694 return svec_mul(svec<LANES,STYPE>(s), a); \
2696 static FORCEINLINE svec<LANES,STYPE> svec_div_scalar(svec<LANES,STYPE> a, STYPE s) { \
2697 return svec_div(a, svec<LANES,STYPE>(s)); \
2699 static FORCEINLINE svec<LANES,STYPE> svec_scalar_div(STYPE s, svec<LANES,STYPE> a) { \
2700 return svec_div(svec<LANES,STYPE>(s), a); \
/** @brief Shifts each of the four int64_t lanes of a left by the matching lane of b, one lane at a time via operator[]. */
2724 static FORCEINLINE svec<4,int64_t> svec_shl(svec<4,int64_t> a, svec<4,uint64_t> b) {
2726 return svec<4,int64_t>(a[0] << b[0], a[1] << b[1], a[2] << b[2], a[3] << b[3]);
/** @brief Shifts each of the four uint64_t lanes of a left by the matching lane of b, one lane at a time via operator[]. */
2730 static FORCEINLINE svec<4,uint64_t> svec_shl(svec<4,uint64_t> a, svec<4,uint64_t> b) {
2732 return svec<4,uint64_t>(a[0] << b[0], a[1] << b[1], a[2] << b[2], a[3] << b[3]);
/** @brief Shifts each of the four int64_t lanes of a right by the matching lane of b, one lane at a time via operator[]. */
2743 static FORCEINLINE svec<4,int64_t> svec_shr(svec<4,int64_t> a, svec<4,uint64_t> b) {
2745 return svec<4,int64_t>(a[0] >> b[0], a[1] >> b[1], a[2] >> b[2], a[3] >> b[3]);
/** @brief Shifts each of the four uint64_t lanes of a right by the matching lane of b, one lane at a time via operator[]. */
2749 static FORCEINLINE svec<4,uint64_t> svec_shr(svec<4,uint64_t> a, svec<4,uint64_t> b) {
2751 return svec<4,uint64_t>(a[0] >> b[0], a[1] >> b[1], a[2] >> b[2], a[3] >> b[3]);
2811 return vec_madd(a.
v, b.
v, c.
v);
2823 return vec_msub(a.
v, b.
v, c.
v);
2835 return vec_nmsub(a.
v, b.
v, c.
v);
/**
 * @brief Instantiates svec_reduce_add, svec_reduce_max and svec_reduce_min for STYPE.
 *
 * Each is produced by BINARY_OP_REDUCE_FUNC_L4 with the add<STYPE>,
 * max<STYPE> and min<STYPE> functions respectively.
 */
2871 #define MAX_MIN_REDUCE_METHODS(STYPE) \
2872 BINARY_OP_REDUCE_FUNC_L4(STYPE, svec_reduce_add, add<STYPE>); \
2873 BINARY_OP_REDUCE_FUNC_L4(STYPE, svec_reduce_max, max<STYPE>); \
2874 BINARY_OP_REDUCE_FUNC_L4(STYPE, svec_reduce_min, min<STYPE>); \
2891 svec_reduce_add(v0),
2892 svec_reduce_add(v1),
2893 svec_reduce_add(v2),
2901 __vector
double sv0 = v0.
v[0] + v0.
v[1];
2902 __vector
double sv1 = v1.
v[0] + v1.
v[1];
2903 __vector
double sv2 = v2.
v[0] + v2.
v[1];
2904 __vector
double sv3 = v3.
v[0] + v3.
v[1];
2906 __vector
double h0 = vec_mergeh(sv0, sv1);
2907 __vector
double l0 = vec_mergel(sv0, sv1);
2908 __vector
double h1 = vec_mergeh(sv2, sv3);
2909 __vector
double l1 = vec_mergel(sv2, sv3);
2912 __vector
double s0 = h0 + l0;
2913 __vector
double s1 = h1 + l1;
/** @brief Compares two svec<4,bool> masks with vec_cmpeq and returns the result as a __vector unsigned int mask. */
2925 static FORCEINLINE svec<4,bool> svec_equal(svec<4,bool> a, svec<4,bool> b) {
2926 return (__vector
unsigned int)(vec_cmpeq(a.v, b.v));
/** @brief Returns the bitwise complement of the vec_cmpeq result for two svec<4,bool> masks. */
2935 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,bool> a, svec<4,bool> b) {
2936 return ~(__vector
unsigned int)(vec_cmpeq(a.v, b.v));
/**
 * @brief Tests svec<4,int8_t> values for equality.
 *
 * vec_cmpeq produces a byte-wide boolean vector; two successive vec_unpackh
 * calls widen it before it is cast to the __vector unsigned int mask storage.
 */
2940 static FORCEINLINE svec<4,bool> svec_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
2941 __vector
bool char t = vec_cmpeq(a.v,b.v);
2942 return (__vector
unsigned int)vec_unpackh(vec_unpackh(t));
/** @brief Returns the complement of svec_equal(a, b) for svec<4,int8_t> operands. */
2945 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int8_t> a, svec<4,int8_t> b) {
2946 return ~ svec_equal(a, b);
2956 __vector
bool char t = vec_cmpeq(a.v,b.v);
2957 return (__vector
unsigned int)vec_unpackh(vec_unpackh(t));
/** @brief Returns the complement of svec_equal(a, b) for svec<4,uint8_t> operands. */
2960 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint8_t> a, svec<4,uint8_t> b) {
2961 return ~ svec_equal(a, b);
2984 return (__vector
unsigned int)vec_cmpeq(a.
v,b.
v);
/** @brief Returns the bitwise complement of the vec_cmpeq result for two svec<4,int32_t> values. */
2987 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2988 return ~(__vector
unsigned int)vec_cmpeq(a.v,b.v);
/** @brief Tests a < b for svec<4,int32_t> with vec_cmplt and returns the result as a mask. */
2991 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,int32_t> a, svec<4,int32_t> b) {
2992 return (__vector
unsigned int)vec_cmplt(a.v,b.v);
/** @brief Tests a <= b for svec<4,int32_t> by OR-ing the svec_less_than and svec_equal masks. */
2995 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
2996 return svec_less_than(a, b) | svec_equal(a, b);
/** @brief Tests a > b for svec<4,int32_t> with vec_cmpgt and returns the result as a mask. */
2999 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,int32_t> a, svec<4,int32_t> b) {
3000 return (__vector
unsigned int)vec_cmpgt(a.v,b.v);
/** @brief Tests a >= b for svec<4,int32_t> by OR-ing the svec_greater_than and svec_equal masks. */
3003 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,int32_t> a, svec<4,int32_t> b) {
3004 return svec_greater_than(a, b) | svec_equal(a, b);
3010 return (__vector
unsigned int)vec_cmpeq(a.
v,b.
v);
/** @brief Returns the bitwise complement of the vec_cmpeq result for two svec<4,uint32_t> values. */
3013 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint32_t> a, svec<4,uint32_t> b) {
3014 return ~(__vector
unsigned int)vec_cmpeq(a.v,b.v);
/** @brief Tests a < b for svec<4,uint32_t> with vec_cmplt and returns the result as a mask. */
3017 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,uint32_t> a, svec<4,uint32_t> b) {
3018 return (__vector
unsigned int)vec_cmplt(a.v,b.v);
/** @brief Tests a <= b for svec<4,uint32_t> by OR-ing the svec_less_than and svec_equal masks. */
3021 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,uint32_t> a, svec<4,uint32_t> b) {
3022 return svec_less_than(a, b) | svec_equal(a, b);
/** @brief Tests a > b for svec<4,uint32_t> with vec_cmpgt and returns the result as a mask. */
3025 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,uint32_t> a, svec<4,uint32_t> b) {
3026 return (__vector
unsigned int)vec_cmpgt(a.v,b.v);
/** @brief Tests a >= b for svec<4,uint32_t> by OR-ing the svec_greater_than and svec_equal masks. */
3029 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,uint32_t> a, svec<4,uint32_t> b) {
3030 return svec_greater_than(a, b) | svec_equal(a, b);
3041 __vector
signed long long tr1 = vec_cmpeq_p8(a.
v[0], b.
v[0]);
3042 __vector
signed long long tr2 = vec_cmpeq_p8(a.
v[1], b.
v[1]);
3047 unsigned int r0 = a[0] == b[0];
3048 unsigned int r1 = a[1] == b[1];
3049 unsigned int r2 = a[2] == b[2];
3050 unsigned int r3 = a[3] == b[3];
/** @brief Returns the complement of svec_equal(a, b) for svec<4,int64_t> operands. */
3056 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,int64_t> a, svec<4,int64_t> b) {
3057 return ~ svec_equal(a, b);
3068 __vector
signed long long tr1 = vec_cmpeq_p8(a.
v[0], b.
v[0]);
3069 __vector
signed long long tr2 = vec_cmpeq_p8(a.
v[1], b.
v[1]);
3074 unsigned int r0 = a[0] == b[0];
3075 unsigned int r1 = a[1] == b[1];
3076 unsigned int r2 = a[2] == b[2];
3077 unsigned int r3 = a[3] == b[3];
/** @brief Returns the complement of svec_equal(a, b) for svec<4,uint64_t> operands. */
3083 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,uint64_t> a, svec<4,uint64_t> b) {
3084 return ~ svec_equal(a, b);
3098 return (__vector
unsigned int)vec_cmpeq(a.
v,b.
v);
/** @brief Returns the bitwise complement of the vec_cmpeq result for two svec<4,float> values. */
3101 static FORCEINLINE svec<4,bool> svec_not_equal(svec<4,float> a, svec<4,float> b) {
3102 return ~(__vector
unsigned int)vec_cmpeq(a.v,b.v);
/** @brief Tests a < b for svec<4,float> with vec_cmplt and returns the result as a mask. */
3105 static FORCEINLINE svec<4,bool> svec_less_than(svec<4,float> a, svec<4,float> b) {
3106 return (__vector
unsigned int)vec_cmplt(a.v,b.v);
/** @brief Tests a <= b for svec<4,float> with vec_cmple and returns the result as a mask. */
3109 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,float> a, svec<4,float> b) {
3110 return (__vector
unsigned int)vec_cmple(a.v,b.v);
/** @brief Tests a > b for svec<4,float> with vec_cmpgt and returns the result as a mask. */
3113 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,float> a, svec<4,float> b) {
3114 return (__vector
unsigned int)vec_cmpgt(a.v,b.v);
/** @brief Tests a >= b for svec<4,float> with vec_cmpge and returns the result as a mask. */
3117 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,float> a, svec<4,float> b) {
3118 return (__vector
unsigned int)vec_cmpge(a.v,b.v);
// Instantiate the double "equal" and "not_equal" comparisons through the
// CMP_OP macro, passing the == and != operators respectively.
3127 CMP_OP(
double, equal, ==);
3128 CMP_OP(
double, not_equal, !=);
3132 __vector
signed long long tr1 = (__vector
signed long long)vec_cmplt(a.
v[0], b.
v[0]);
3133 __vector
signed long long tr2 = (__vector
signed long long)vec_cmplt(a.
v[1], b.
v[1]);
3134 return vec_pack_p8(tr1,tr2);
3137 unsigned int r0 = a[0] < b[0];
3138 unsigned int r1 = a[1] < b[1];
3139 unsigned int r2 = a[2] < b[2];
3140 unsigned int r3 = a[3] < b[3];
/** @brief Tests a <= b for svec<4,double> by OR-ing the svec_less_than and svec_equal masks. */
3145 static FORCEINLINE svec<4,bool> svec_less_equal(svec<4,double> a, svec<4,double> b) {
3146 return svec_less_than(a, b) | svec_equal(a, b);
/**
 * @brief Tests a > b for svec<4,double>.
 *
 * NOTE(review): two alternative implementations are listed. Presumably a
 * preprocessor conditional that is not visible in this listing selects
 * between them - confirm.
 *  - Vector path: vec_cmpgt on each storage half, with the two results
 *    combined by vec_pack_p8.
 *  - Scalar path: compares the four lanes one by one via operator[] and
 *    builds the mask from the four results.
 */
3150 static FORCEINLINE svec<4,bool> svec_greater_than(svec<4,double> a, svec<4,double> b) {
3152 __vector
signed long long tr1 = (__vector
signed long long)vec_cmpgt(a.v[0], b.v[0]);
3153 __vector
signed long long tr2 = (__vector
signed long long)vec_cmpgt(a.v[1], b.v[1]);
3154 return vec_pack_p8(tr1,tr2);
3157 unsigned int r0 = a[0] > b[0];
3158 unsigned int r1 = a[1] > b[1];
3159 unsigned int r2 = a[2] > b[2];
3160 unsigned int r3 = a[3] > b[3];
3161 return svec<4,bool>(r0,r1,r2,r3);
/** @brief Tests a >= b for svec<4,double> by OR-ing the svec_greater_than and svec_equal masks. */
3165 static FORCEINLINE svec<4,bool> svec_greater_equal(svec<4,double> a, svec<4,double> b) {
3166 return svec_greater_than(a, b) | svec_equal(a, b);
/**
 * @brief Declares svec_cast for source type SFROM and specializes it for target type STO.
 *
 * The specialization constructs svec<LANES,STO> directly from the source's
 * val.v, with no explicit conversion call.
 */
3176 #define CAST_OPT(SFROM, STO) \
3177 template <class T> static T svec_cast(svec<LANES,SFROM> val); \
3181 template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
3182 return svec<LANES,STO>((val.v)); \
/**
 * @brief Variant of the direct svec_cast specialization for types whose storage is a two-element array.
 *
 * The specialization constructs svec<LANES,STO> from val.v[0] and val.v[1],
 * with no explicit conversion call.
 */
3188 #define CAST_OPT64(SFROM, STO) \
3189 template <class T> static T svec_cast(svec<LANES,SFROM> val); \
3193 template <> FORCEINLINE svec<LANES,STO> svec_cast<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
3194 return svec<LANES,STO>((val.v[0]),(val.v[1])); \
3227 return vec_unpackh(val.v);
3230 template <
class T>
static T svec_cast(svec<4,int8_t> val);
3235 __vector uint16_t v = vec_unpackh(val.v);
3239 template <
class T>
static T svec_cast(svec<4,int8_t> val);
3244 return vec_unpackh(vec_unpackh(val.v));
3247 template <
class T>
static T svec_cast(svec<4,int8_t> val);
3252 __vector uint32_t v = vec_unpackh(vec_unpackh(val.v));
3270 __vector int16_t v = vec_unpackh((__vector int8_t)val.v);
3271 __vector int16_t mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0};
3275 template <
class T>
static T svec_cast(svec<4,uint8_t> val);
3280 __vector uint16_t v = vec_unpackh((__vector int8_t)val.v);
3281 __vector uint16_t mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0};
3285 template <
class T>
static T svec_cast(svec<4,uint8_t> val);
3290 __vector int32_t v = vec_unpackh(vec_unpackh((__vector int8_t)val.v));
3291 __vector int32_t mask = {0xFF, 0xFF, 0xFF, 0xFF};
3295 template <
class T>
static T svec_cast(svec<4,uint8_t> val);
3300 __vector uint32_t v = vec_unpackh(vec_unpackh((__vector int8_t)val.v));
3301 __vector uint32_t mask = {0xFF, 0xFF, 0xFF, 0xFF};
3321 return vec_unpackh(val.v);
3324 template <
class T>
static T svec_cast(svec<4,int16_t> val);
3329 __vector uint32_t v = vec_unpackh(val.v);
3349 __vector int32_t v = vec_unpackh((__vector int16_t)val.v);
3350 __vector int32_t mask = {0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF};
3354 template <
class T>
static T svec_cast(svec<4,uint16_t> val);
3359 __vector uint32_t v = vec_unpackh((__vector int16_t)val.v);
3360 __vector uint32_t mask = {0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF};
3383 return svec<4,int64_t>(vec_unpackh_p8((__vector
unsigned int)val.v),vec_unpackl_p8((__vector
unsigned int)val.v));
3386 return svec<4,int64_t>((int64_t)val[0], (int64_t)val[1], (int64_t)val[2], (int64_t)val[3]);
3390 template <
class T>
static T svec_cast(svec<4,int32_t> val);
3396 return svec<4,uint64_t>(vec_unpackh_p8((__vector
unsigned int)val.v),vec_unpackl_p8((__vector
unsigned int)val.v));
3399 return svec<4,uint64_t>((uint64_t)val[0], (uint64_t)val[1], (uint64_t)val[2], (uint64_t)val[3]);
3403 template <
class T>
static T svec_cast(svec<4,int32_t> val);
3408 return vec_ctf(val.v,0);
3427 return svec<4,int64_t>(vec_unpackh_p8((__vector
unsigned int)val.v),vec_unpackl_p8((__vector
unsigned int)val.v));
3430 return svec<4,int64_t>((int64_t)val[0], (int64_t)val[1], (int64_t)val[2], (int64_t)val[3]);
3434 template <
class T>
static T svec_cast(svec<4,uint32_t> val);
3440 return svec<4,uint64_t>(vec_unpackh_p8((__vector
unsigned int)val.v),vec_unpackl_p8((__vector
unsigned int)val.v));
3443 return svec<4,uint64_t>((uint64_t)val[0], (uint64_t)val[1], (uint64_t)val[2], (uint64_t)val[3]);
3462 return (__vector
signed int)vec_pack_p8(val.v[0],val.v[1]);
3465 return svec<4,int32_t>((int32_t)val[0], (int32_t)val[1], (int32_t)val[2], (int32_t)val[3]);
3469 template <
class T>
static T svec_cast(svec<4,int64_t> val);
3475 return (__vector
unsigned int)vec_pack_p8(val.v[0],val.v[1]);
3478 return svec<4,uint32_t>((uint32_t)val[0], (uint32_t)val[1], (uint32_t)val[2], (uint32_t)val[3]);
3499 return (__vector
signed int)vec_pack_p8(val.v[0],val.v[1]);
3502 return svec<4,int32_t>((int32_t)val[0], (int32_t)val[1], (int32_t)val[2], (int32_t)val[3]);
3506 template <
class T>
static T svec_cast(svec<4,uint64_t> val);
3512 return (__vector
unsigned int)vec_pack_p8(val.v[0],val.v[1]);
3515 return svec<4,uint32_t>((uint32_t)val[0], (uint32_t)val[1], (uint32_t)val[2], (uint32_t)val[3]);
3531 __vector
signed int tsi=vec_splat_s32(0);
3532 return vec_pack(vec_pack(vec_cts(val.v, 0), tsi), (__vector
signed short)tsi);
3535 template <
class T>
static T svec_cast(svec<4,float> val);
3540 __vector
unsigned int tsi=vec_splat_s32(0);
3541 return vec_pack(vec_pack(vec_ctu(val.v, 0), tsi), (__vector
unsigned short)tsi);
3545 template <
class T>
static T svec_cast(svec<4,float> val);
3550 __vector
signed int tsi=vec_splat_s32(0);
3551 return vec_pack(vec_cts(val.v, 0), tsi);
3554 template <
class T>
static T svec_cast(svec<4,float> val);
3559 __vector
unsigned int tsi=vec_splat_s32(0);
3560 return vec_pack(vec_ctu(val.v, 0), tsi);
3563 template <
class T>
static T svec_cast(svec<4,float> val);
3568 return vec_cts(val.v, 0);
3571 template <
class T>
static T svec_cast(svec<4,float> val);
3576 return vec_ctu(val.v, 0);
/**
 * @brief Declares svec_cast_bits for source type SFROM and specializes it for target type STO.
 *
 * The specialization casts val.v to __vector STO and wraps the result in
 * svec<LANES,STO>.
 */
3621 #define CAST_BITS_OPT(SFROM, STO) \
3622 template <class T> static T svec_cast_bits(svec<LANES,SFROM> val); \
3626 template <> FORCEINLINE svec<LANES,STO> svec_cast_bits<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
3627 return svec<LANES,STO>((__vector STO)(val.v)); \
/**
 * @brief Variant of the svec_cast_bits specialization for types whose storage is a two-element array.
 *
 * The specialization casts val.v[0] and val.v[1] to __vector STO and passes
 * both to the svec<LANES,STO> constructor.
 */
3633 #define CAST_BITS_OPT64(SFROM, STO) \
3634 template <class T> static T svec_cast_bits(svec<LANES,SFROM> val); \
3638 template <> FORCEINLINE svec<LANES,STO> svec_cast_bits<svec<LANES,STO> >(svec<LANES,SFROM> val) { \
3639 return svec<LANES,STO>((__vector STO)(val.v[0]), (__vector STO)(val.v[1])); \
/**
 * @brief Defines both operator[] overloads of svec<LANES,STYPE>.
 *
 * The non-const overload records a STATS_INSERT event through INC_STATS_NAME
 * and returns a reference obtained by viewing the storage `v` as an array of
 * STYPE. The const overload returns the lane by value via svec_extract.
 */
3663 #define SUBSCRIPT_FUNC_IMPL_VSX(STYPE) \
3664 FORCEINLINE STYPE& svec<LANES,STYPE>::operator[](int index) { \
3665 INC_STATS_NAME(STATS_INSERT, 1, "insert "#STYPE); \
3666 return ((STYPE *)&v)[index]; \
3668 const FORCEINLINE STYPE svec<LANES,STYPE>::operator[](int index) const { \
3669 return svec_extract(*this, index); \
3673 svec_insert(m_self, m_index, value);
3676 svec_insert(m_self, m_index, helper.operator uint32_t());
3679 return svec_extract(*m_self, m_index);
3682 return svec_extract(*
this, index);
3706 res = ((mask[0]>>31) & 0x1) |
3707 ((mask[1]>>30) & 0x2) |
3708 ((mask[2]>>29) & 0x4) |
3709 ((mask[3]>>28) & 0x8);
3784 return svec_equal(*
this, a);
3793 return svec_not_equal(*
this, a);
#define COUT_FUNC_BOOL_DECL()
Definition: gsimd_utility.h:266
#define CAST_OPT64(SFROM, STO)
Cast performed by directly changing the __vector type.
Definition: power_vsx4.h:3188
__vector signed short v
Definition: power_vsx4.h:339
svec()
Default constructor.
Definition: power_vsx4.h:712
svec()
Default constructor.
Definition: power_vsx4.h:190
#define CAST_OPT(SFROM, STO)
Cast performed by directly changing the __vector type.
Definition: power_vsx4.h:3176
#define GATHER_STRIDE_L4(STYPE, OSTYPE)
macros for fast impl of gather base step
Definition: gsimd_utility.h:682
__vector unsigned int v
Definition: power_vsx4.h:501
svec(int8_t a, int8_t b, int8_t c, int8_t d)
Constructor.
Definition: power_vsx4.h:251
svec(__vector unsigned long long a, __vector unsigned long long b)
For internal use only. Construct svec<4,uint64_t> with two __vector unsigned long long values...
Definition: power_vsx4.h:644
svec(__vector double a, __vector double b)
For internal use only. Construct svec<4,double> with two __vector double values.
Definition: power_vsx4.h:793
#define VEC_INT_CLASS_METHOD_DECL(STYPE, USTYPE)
Macro declaring the methods for integer vectors only. Note: the shift operators accept only an unsigned vector ...
Definition: gsimd_utility.h:379
#define BINARY_OP_OPT(STYPE, NAME, OP)
macros based on __vector type's operator overload
Definition: power_vsx4.h:2427
#define TERNERY_L4(STYPE)
Definition: gsimd_utility.h:984
#define CMP_ALL_MASKED_OP(STYPE)
Definition: gsimd_utility.h:1099
__vector unsigned int v
use __vector unsigned int v for storage
Definition: power_vsx4.h:184
#define SCATTER_BASE_OFFSETS_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:789
svec(__vector signed char vv)
For internal use only.
Definition: power_vsx4.h:246
#define BINARY_OP_OPT64(STYPE, NAME, OP)
Definition: power_vsx4.h:2432
Definition: gsimd_utility.h:93
svec(double a)
Constructor.
Definition: power_vsx4.h:812
svec(__vector unsigned char vv)
For internal use only.
Definition: power_vsx4.h:298
svec()
Default constructor.
Definition: power_vsx4.h:788
#define CAST_BITS_OPT(SFROM, STO)
Cast performed by directly changing the __vector type.
Definition: power_vsx4.h:3621
#define GATHER_GENERAL_L4(STYPE, PSTYPE)
Slow implementation of the general gather. A template must be used to specify the return type ...
Definition: gsimd_utility.h:617
svec()
Default constructor.
Definition: power_vsx4.h:395
Data representation and operations on a vector of 4 boolean values. This is used in predicated vector...
Definition: power_vsx4.h:182
#define CMP_ALL_NOMASK_OP_L4(STYPE)
Definition: gsimd_utility.h:1091
svec(int64_t a)
Constructor.
Definition: power_vsx4.h:590
#define VEC_FLOAT_CLASS_METHOD_DECL(STYPE)
Definition: gsimd_utility.h:393
__vector double v[2]
Definition: power_vsx4.h:783
#define VEC_CLASS_METHOD_DECL(STYPE)
Macro for the methods of the non-mask types, i8 through double.
Definition: gsimd_utility.h:350
svec< 4, bool > svec_select(svec< 4, bool > mask, svec< 4, bool > a, svec< 4, bool > b)
construct c by selecting elements from two input vectors according to the mask
Definition: power_vsx4.h:1126
svec()
Default constructor.
Definition: power_vsx4.h:639
svec(__vector unsigned int vv)
For internal use only.
Definition: power_vsx4.h:197
#define SVEC_BOOL_CLASS_METHOD_DECL()
macros for svec<N,bool> class's class method
Definition: gsimd_utility.h:330
svec(__vector signed long long a, __vector signed long long b)
For internal use only. Construct svec<4,int64_t> with two __vector signed long long values...
Definition: power_vsx4.h:571
svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
Constructor.
Definition: power_vsx4.h:204
#define SHUFFLES_L4(STYPE)
macro for shuffle/shuffle2 methods implementation
Definition: gsimd_utility.h:537
svec(float a, float b, float c, float d)
Constructor.
Definition: power_vsx4.h:723
svec(__vector unsigned int vv)
For internal use only.
Definition: power_vsx4.h:512
#define BROADCAST_OPT32(STYPE)
Definition: power_vsx4.h:1280
#define INC_STATS_NAME(stat, inc, opname)
Definition: gsimd_utility.h:156
svec(__vector signed int vv)
For internal use only.
Definition: power_vsx4.h:452
#define COUT_FUNC_DECL(STYPE)
Definition: gsimd_utility.h:283
svec(uint32_t a)
Constructor.
Definition: power_vsx4.h:214
#define UNARY_OP_OPT(STYPE, NAME, OP)
Definition: power_vsx4.h:2324
svec(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
Constructor.
Definition: power_vsx4.h:652
#define UNARY_OP_L4(STYPE, NAME, OP)
Definition: gsimd_utility.h:841
svec(__vector float vv)
For internal use only.
Definition: power_vsx4.h:718
data representation and operations on a vector of 4 signed short.
Definition: power_vsx4.h:338
#define VEC_CMP_IMPL(STYPE)
Definition: gsimd_utility.h:1175
svec(void *p0, void *p1, void *p2, void *p3)
Constructor.
Definition: power_vsx4.h:1516
svec()
Default constructor.
Definition: power_vsx4.h:292
data representation and operations on a vector of 4 unsigned long long.
Definition: power_vsx4.h:633
svec(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
Constructor.
Definition: power_vsx4.h:303
#define SUBSCRIPT_FUNC_DECL(STYPE)
Macro to define an intrinsic-based subscript operator.
Definition: gsimd_utility.h:247
svec(int8_t a)
Constructor.
Definition: power_vsx4.h:260
svec(__vector unsigned short vv)
For internal use only.
Definition: power_vsx4.h:401
data representation and operations on a vector of 4 signed int.
Definition: power_vsx4.h:440
svec()
Default constructor.
Definition: power_vsx4.h:344
#define MVEC_CLASS_METHOD_IMPL(STYPE)
mask class's class method impl
Definition: gsimd_utility.h:1285
svec< 4, int32_t > svec_madd(svec< 4, int32_t > a, svec< 4, int32_t > b, svec< 4, int32_t > c)
vector multiply and add operation. return a * b + c.
Definition: power_vsx4.h:2802
#define INSERT_EXTRACT_OPT(STYPE)
Definition: power_vsx4.h:851
#define SUBSCRIPT_FUNC_BOOL_DECL(STYPE)
Definition: gsimd_utility.h:251
#define VEC_CLASS_METHOD_IMPL(STYPE)
Definition: gsimd_utility.h:1301
#define GATHER_BASE_OFFSETS_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:658
data representation and operations on a vector of 4 signed long long.
Definition: power_vsx4.h:560
#define UNARY_OP_OPT64(STYPE, NAME, OP)
macros for 64bit object, i64/u64/double
Definition: power_vsx4.h:2332
__vector unsigned long long v[2]
Definition: power_vsx4.h:634
data representation and operations on a vector of 4 unsigned short.
Definition: power_vsx4.h:389
#define MASKED_LOAD_STORE_L4(STYPE)
Definition: gsimd_utility.h:797
#define VEC_FLOAT_CLASS_METHOD_IMPL(STYPE)
Definition: gsimd_utility.h:1433
svec(uint32_t a)
Constructor.
Definition: power_vsx4.h:526
#define MAX_MIN_REDUCE_METHODS(STYPE)
Definition: power_vsx4.h:2871
#define SCATTER_STRIDE_L4(STYPE, OSTYPE)
Definition: gsimd_utility.h:715
svec(double a, double b, double c, double d)
Constructor.
Definition: power_vsx4.h:801
svec(uint64_t a)
Constructor.
Definition: power_vsx4.h:663
data representation and operations on a vector of 4 unsigned int.
Definition: power_vsx4.h:500
#define SUBSCRIPT_FUNC_IMPL_VSX(STYPE)
This macro uses VSX-specific intrinsics to do extract and insert.
Definition: power_vsx4.h:3663
__vector signed long long v[2]
Definition: power_vsx4.h:561
svec(float a)
Constructor.
Definition: power_vsx4.h:732
#define LOAD_STORE(STYPE)
Definition: gsimd_utility.h:419
svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
Constructor.
Definition: power_vsx4.h:517
svec(uint8_t a)
Constructor.
Definition: power_vsx4.h:313
__vector unsigned short v
Definition: power_vsx4.h:390
data representation and operations on a vector of 4 double.
Definition: power_vsx4.h:782
#define CAST_L4(SFROM, STO)
Definition: gsimd_utility.h:1124
#define BINARY_OP_L4(STYPE, NAME, OP)
Macro for the generic, slow implementation of a binary operation.
Definition: gsimd_utility.h:880
svec(int16_t a)
Constructor.
Definition: power_vsx4.h:364
svec< 4, int32_t > svec_nmsub(svec< 4, int32_t > a, svec< 4, int32_t > b, svec< 4, int32_t > c)
Vector negated multiply-subtract operation. Returns -(a * b - c).
Definition: power_vsx4.h:2802
svec()
Default constructor.
Definition: power_vsx4.h:240
svec(int a, int b, int c, int d)
Constructor.
Definition: power_vsx4.h:457
data representation and operations on a vector of 4 float.
Definition: power_vsx4.h:706
__vector signed char v
Definition: power_vsx4.h:234
svec< 4,float > svec_preduce_add(svec< 4, float > v0, svec< 4, float > v1, svec< 4, float > v2, svec< 4, float > v3)
Definition: power_vsx4.h:2888
svec(int32_t a)
Constructor.
Definition: power_vsx4.h:466
#define SCATTER_GENERAL_L4(STYPE, PSTYPE)
Definition: gsimd_utility.h:756
#define BINARY_OP_FUNC_L4(STYPE, NAME, FUNC)
Definition: gsimd_utility.h:904
#define ROTATE_L4(STYPE)
macro for rotate method implementation
Definition: gsimd_utility.h:507
svec(uint16_t a, uint16_t b, uint16_t c, uint16_t d)
Constructor.
Definition: power_vsx4.h:406
#define BINARY_OP_SCALAR_L4(STYPE, STYPE2, NAME, OP)
macros for binary: vector op scalar
Definition: gsimd_utility.h:917
#define COUT_FUNC_CHAR_DECL(STYPE)
Definition: gsimd_utility.h:275
#define CAST_BITS_OPT64(SFROM, STO)
Cast performed by directly changing the __vector type.
Definition: power_vsx4.h:3633
#define CMP_OP(STYPE, NAME, OP)
macros for binary: vector op scalar
Definition: gsimd_utility.h:1049
svec(int64_t a, int64_t b, int64_t c, int64_t d)
Constructor.
Definition: power_vsx4.h:579
#define BIN_VEC_SCAL(STYPE)
Definition: power_vsx4.h:2677
svec(__vector signed short vv)
For internal use only.
Definition: power_vsx4.h:350
svec()
Default constructor.
Definition: power_vsx4.h:446
#define BINARY_OP_OPT_FUNC(STYPE, STYPE2, NAME, FUNC)
Definition: power_vsx4.h:2437
svec()
Default constructor.
Definition: power_vsx4.h:506
#define CMP_OP_L4(STYPE, NAME, OP)
Definition: gsimd_utility.h:1057
svec< 4, int32_t > svec_msub(svec< 4, int32_t > a, svec< 4, int32_t > b, svec< 4, int32_t > c)
Vector multiply and subtract operation. Returns a * b - c.
Definition: power_vsx4.h:2802
#define INSERT_EXTRACT_OPT64(STYPE)
Definition: power_vsx4.h:859
Definition: power_vsx4.h:128
#define SELECT_BOOLCOND(STYPE)
macros for svec's select by bool scalar method implementation
Definition: gsimd_utility.h:459
svec()
Default constructor.
Definition: power_vsx4.h:566
#define VEC_INT_CLASS_METHOD_IMPL(STYPE, STYPE2)
Definition: gsimd_utility.h:1394
#define FORCEINLINE
Definition: gsimd_utility.h:175
#define BROADCAST_OPT64(STYPE)
Definition: power_vsx4.h:1286
__vector signed int v
Definition: power_vsx4.h:441
svec(uint16_t a)
Constructor.
Definition: power_vsx4.h:415
__vector float v
Definition: power_vsx4.h:707
svec(int16_t a, int16_t b, int16_t c, int16_t d)
Constructor.
Definition: power_vsx4.h:355
__vector unsigned char v
Definition: power_vsx4.h:287
#define BROADCAST_L4(STYPE)
Macro for the broadcast method implementation for 4 lanes. All broadcasts are slow implementations ...
Definition: gsimd_utility.h:485