Diffstat (limited to 'clang/lib')
-rw-r--r--  clang/lib/Headers/avx512fintrin.h | 339
 1 file changed, 221 insertions(+), 118 deletions(-)
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 63c49bb76db..0a3cf301b68 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -9845,186 +9845,289 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
#undef _mm512_reduce_operator_32bit
#undef _mm512_mask_reduce_operator_32bit
-#define _mm512_mask_reduce_operator(op) \
- __m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \
- __m512i __t2 = _mm512_##op(__V, __t1); \
- __m512i __t3 = (__m512i)__builtin_shufflevector((__v8di)__t2, (__v8di)__t2, 2, 3, 0, 1, 6, 7, 4, 5); \
- __m512i __t4 = _mm512_##op(__t2, __t3); \
- __m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \
- __v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \
- return __t6[0];
+// This uses a bisection method: at each step, the vector produced by the
+// previous step is split in half, and the operation is applied to the two
+// halves. This takes log2(n) steps, where n is the number of elements in
+// the vector. These macros use only intrinsics from the AVX512F feature.
+
+// Vec512 - A 512-bit vector.
+// IntrinName - One of the following: {max|min}_{epi64|epu64|pd}, for example:
+//              _mm512_max_epi64
+// T1 - 'i' for int or 'd' for double. [__m512{i|d}]
+// T2 - 'i' for int or 'f' for float. [__v8d{i|f}]
+
+#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, 1, 2, 3, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 4, 5, 6, 7, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, 1, -1, -1, -1, -1, -1, -1),\
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 2, 3, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 0, -1, -1, -1, -1, -1, -1, -1),\
+ (__m512##T1)__builtin_shufflevector( \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512, \
+ 1, -1, -1, -1, -1, -1, -1, -1)); \
+ return Vec512[0]; \
+ })
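// To make the bisection concrete, here is a minimal scalar sketch of the same
// idea (an editorial illustration, not part of this header; the helper name
// and the plain 8-element array are assumptions). Each step folds the upper
// half of the active region onto the lower half, so 8 elements need
// log2(8) = 3 steps:
static __inline__ long long __reduce_max_sketch(long long __v[8]) {
  for (int __w = 4; __w >= 1; __w /= 2)   // active width: 4 -> 2 -> 1
    for (int __i = 0; __i < __w; ++__i)   // fold upper half onto lower half
      __v[__i] = __v[__i] > __v[__i + __w] ? __v[__i] : __v[__i + __w];
  return __v[0];                          // element 0 now holds the maximum
}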
static __inline__ long long __DEFAULT_FN_ATTRS
_mm512_reduce_max_epi64(__m512i __V) {
- _mm512_mask_reduce_operator(max_epi64);
+ _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm512_reduce_max_epu64(__m512i __V) {
- _mm512_mask_reduce_operator(max_epu64);
+ _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
}
-static __inline__ long long __DEFAULT_FN_ATTRS
-_mm512_reduce_min_epi64(__m512i __V) {
- _mm512_mask_reduce_operator(min_epi64);
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_reduce_max_pd(__m512d __V) {
+ _mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epi64(__m512i __V) {
+ _mm512_reduce_maxMin_64bit(__V, min_epi64, i, i);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm512_reduce_min_epu64(__m512i __V) {
- _mm512_mask_reduce_operator(min_epu64);
+ _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
}
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_reduce_min_pd(__m512d __V) {
+ _mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
+}
+
+// Vec512 - A 512-bit vector.
+// Vec512Neutral - A 512-bit vector with every element set to the identity
+//                 element of the operation:
+//                 {max_epi, 0x8000000000000000}
+//                 {max_epu, 0x0000000000000000}
+//                 {max_pd,  0xFFF0000000000000}
+//                 {min_epi, 0x7FFFFFFFFFFFFFFF}
+//                 {min_epu, 0xFFFFFFFFFFFFFFFF}
+//                 {min_pd,  0x7FF0000000000000}
+//
+// IntrinName - One of the following: {max|min}_{epi64|epu64|pd}, for example:
+//              _mm512_max_epi64
+// T1 - 'i' for int or 'd' for double. [__m512{i|d}]
+// T2 - 'i' for int or 'f' for float. [__v8d{i|f}]
+// T3 - 'q' for quadword or 'pd' for packed double.
+//      [__builtin_ia32_select{q|pd}_512]
+// Mask - Intrinsic mask
+
+#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \
+ T2, T3, Mask) \
+ __extension__({ \
+ Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
+ (__mmask8)Mask, \
+ (__v8d##T2)Vec512, \
+ (__v8d##T2)Vec512Neutral); \
+ _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \
+ })
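// A minimal scalar sketch of the masking step (an editorial illustration, not
// part of this header; the helper name and plain array are assumptions).
// Lanes whose mask bit is clear are overwritten with the identity element,
// here INT64_MIN for a signed max, so they can never influence the result of
// the bisection that follows:
static __inline__ long long __mask_reduce_max_sketch(unsigned char __m,
                                                     long long __v[8]) {
  for (int __i = 0; __i < 8; ++__i)
    if (!((__m >> __i) & 1))
      __v[__i] = -0x7FFFFFFFFFFFFFFFLL - 1;  // identity element for max_epi64
  for (int __w = 4; __w >= 1; __w /= 2)      // same bisection as above
    for (int __i = 0; __i < __w; ++__i)
      __v[__i] = __v[__i] > __v[__i + __w] ? __v[__i] : __v[__i + __w];
  return __v[0];
}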
+
static __inline__ long long __DEFAULT_FN_ATTRS
_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
- _mm512_mask_reduce_operator(max_epi64);
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000),
+ max_epi64, i, i, q, __M);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
- __V = _mm512_maskz_mov_epi64(__M, __V);
- _mm512_mask_reduce_operator(max_epu64);
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000),
+ max_epu64, i, i, q, __M);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(-__builtin_inf()),
+ max_pd, d, f, pd, __M);
}
static __inline__ long long __DEFAULT_FN_ATTRS
_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
- _mm512_mask_reduce_operator(min_epi64);
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
+ min_epi64, i, i, q, __M);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);
- _mm512_mask_reduce_operator(min_epu64);
-}
-#undef _mm512_mask_reduce_operator
-
-#define _mm512_mask_reduce_operator(op) \
- __m256i __t1 = _mm512_extracti64x4_epi64(__V, 0); \
- __m256i __t2 = _mm512_extracti64x4_epi64(__V, 1); \
- __m256i __t3 = _mm256_##op(__t1, __t2); \
- __m128i __t4 = _mm256_extracti128_si256(__t3, 0); \
- __m128i __t5 = _mm256_extracti128_si256(__t3, 1); \
- __m128i __t6 = _mm_##op(__t4, __t5); \
- __m128i __t7 = (__m128i)__builtin_shufflevector((__v4si)__t6, (__v4si)__t6, 2, 3, 0, 1); \
- __m128i __t8 = _mm_##op(__t6, __t7); \
- __m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \
- __v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \
- return __t10[0];
-
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm512_reduce_max_epi32(__m512i __V) {
- _mm512_mask_reduce_operator(max_epi32);
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
+ min_epu64, i, i, q, __M);
}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm512_reduce_max_epu32(__m512i __V) {
- _mm512_mask_reduce_operator(max_epu32);
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
+ _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()),
+ min_pd, d, f, pd, __M);
}
+#undef _mm512_reduce_maxMin_64bit
+#undef _mm512_mask_reduce_maxMin_64bit
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm512_reduce_min_epi32(__m512i __V) {
- _mm512_mask_reduce_operator(min_epi32);
+// Vec512 - A 512-bit vector.
+// IntrinName - One of the following: {max|min}_{epi32|epu32|ps}, for example:
+//              _mm512_max_epi32
+// T1 - 'i' for int or empty for float. [__m512{i|}]
+// T2 - 'i' for int or 'f' for float. [__v16s{i|f}]
+
+#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, 2, 3, 4, 5, 6, 7, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, 2, 3, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 4, 5, 6, 7, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, 1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 2, 3, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ Vec512 = _mm512_##IntrinName( \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 0, -1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1), \
+ (__m512##T1)__builtin_shufflevector( \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512, \
+ 1, -1, -1, -1, -1, -1, -1, -1, \
+ -1, -1, -1, -1, -1, -1, -1, -1)); \
+ return Vec512[0]; \
+ })
+
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i __V) {
+ _mm512_reduce_maxMin_32bit(__V, max_epi32, i, i);
}
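// A short usage sketch (an editorial illustration; the function and parameter
// names are placeholders, and user code would include <immintrin.h> and build
// with AVX512F enabled, e.g. -mavx512f). It loads 16 ints and reduces them to
// their maximum with the intrinsic defined above:
static __inline__ int __DEFAULT_FN_ATTRS __reduce_max_epi32_usage(const int *__p) {
  __m512i __v = _mm512_loadu_si512((const void *)__p);
  return _mm512_reduce_max_epi32(__v);
}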
static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm512_reduce_min_epu32(__m512i __V) {
- _mm512_mask_reduce_operator(min_epu32);
+_mm512_reduce_max_epu32(__m512i __V) {
+ _mm512_reduce_maxMin_32bit(__V, max_epu32, i, i);
}
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
- _mm512_mask_reduce_operator(max_epi32);
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 __V) {
+ _mm512_reduce_maxMin_32bit(__V, max_ps, , f);
}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
- __V = _mm512_maskz_mov_epi32(__M, __V);
- _mm512_mask_reduce_operator(max_epu32);
+static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i __V) {
+ _mm512_reduce_maxMin_32bit(__V, min_epi32, i, i);
}
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
- _mm512_mask_reduce_operator(min_epi32);
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_reduce_min_epu32(__m512i __V) {
+ _mm512_reduce_maxMin_32bit(__V, min_epu32, i, i);
}
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);
- _mm512_mask_reduce_operator(min_epu32);
+static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 __V) {
+ _mm512_reduce_maxMin_32bit(__V, min_ps, , f);
}
-#define _mm512_mask_reduce_operator(op) \
- __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
- __m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \
- __m256d __t3 = _mm256_##op(__t1, __t2); \
- __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
- __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
- __m128d __t6 = _mm_##op(__t4, __t5); \
- __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
- __m128d __t8 = _mm_##op(__t6, __t7); \
- return __t8[0];
+// Vec512 - A 512-bit vector.
+// Vec512Neutral - A 512-bit vector with every element set to the identity
+//                 element of the operation:
+//                 {max_epi, 0x80000000}
+//                 {max_epu, 0x00000000}
+//                 {max_ps,  0xFF800000}
+//                 {min_epi, 0x7FFFFFFF}
+//                 {min_epu, 0xFFFFFFFF}
+//                 {min_ps,  0x7F800000}
+//
+// IntrinName - One of the following: {max|min}_{epi32|epu32|ps}, for example:
+//              _mm512_max_epi32
+// T1 - 'i' for int or empty for float. [__m512{i|}]
+// T2 - 'i' for int or 'f' for float. [__v16s{i|f}]
+// T3 - 'd' for doubleword or 'ps' for packed single.
+//      [__builtin_ia32_select{d|ps}_512]
+// Mask - Intrinsic mask
-static __inline__ double __DEFAULT_FN_ATTRS
-_mm512_reduce_max_pd(__m512d __V) {
- _mm512_mask_reduce_operator(max_pd);
-}
+#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \
+ T2, T3, Mask) \
+ __extension__({ \
+ Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
+ (__mmask16)Mask, \
+ (__v16s##T2)Vec512, \
+ (__v16s##T2)Vec512Neutral); \
+ _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \
+ })
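// A minimal sketch of why these bit patterns act as identity elements (an
// editorial illustration; the helper name is an assumption). Reinterpreted as
// a float, 0xFF800000 is -infinity and 0x7F800000 is +infinity, so a
// masked-off lane can never win a max or a min, respectively:
static __inline__ float __bits_to_float_sketch(unsigned int __bits) {
  float __f;
  __builtin_memcpy(&__f, &__bits, sizeof(__f));  // bit-exact reinterpretation
  return __f;
}
// __bits_to_float_sketch(0xFF800000u) == -__builtin_inff()
// __bits_to_float_sketch(0x7F800000u) == __builtin_inff()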
-static __inline__ double __DEFAULT_FN_ATTRS
-_mm512_reduce_min_pd(__m512d __V) {
- _mm512_mask_reduce_operator(min_pd);
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32,
+ i, i, d, __M);
}
-static __inline__ double __DEFAULT_FN_ATTRS
-_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
- __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
- _mm512_mask_reduce_operator(max_pd);
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32,
+ i, i, d, __M);
}
-static __inline__ double __DEFAULT_FN_ATTRS
-_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
- __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
- _mm512_mask_reduce_operator(min_pd);
-}
-#undef _mm512_mask_reduce_operator
-
-#define _mm512_mask_reduce_operator(op) \
- __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \
- __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \
- __m256 __t3 = _mm256_##op(__t1, __t2); \
- __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
- __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
- __m128 __t6 = _mm_##op(__t4, __t5); \
- __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
- __m128 __t8 = _mm_##op(__t6, __t7); \
- __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
- __m128 __t10 = _mm_##op(__t8, __t9); \
- return __t10[0];
-
static __inline__ float __DEFAULT_FN_ATTRS
-_mm512_reduce_max_ps(__m512 __V) {
- _mm512_mask_reduce_operator(max_ps);
+_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(-__builtin_inff()),
+ max_ps, , f, ps, __M);
}
-static __inline__ float __DEFAULT_FN_ATTRS
-_mm512_reduce_min_ps(__m512 __V) {
- _mm512_mask_reduce_operator(min_ps);
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32,
+ i, i, d, __M);
}
-static __inline__ float __DEFAULT_FN_ATTRS
-_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
- __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
- _mm512_mask_reduce_operator(max_ps);
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32,
+ i, i, d, __M);
}
static __inline__ float __DEFAULT_FN_ATTRS
_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
- __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
- _mm512_mask_reduce_operator(min_ps);
+ _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()),
+ min_ps, , f, ps, __M);
}
-#undef _mm512_mask_reduce_operator
+#undef _mm512_reduce_maxMin_32bit
+#undef _mm512_mask_reduce_maxMin_32bit
#undef __DEFAULT_FN_ATTRS