diff options
| author | Craig Topper <craig.topper@gmail.com> | 2017-01-18 02:17:10 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@gmail.com> | 2017-01-18 02:17:10 +0000 |
| commit | 367c86ddbec5d9da7c4390f0df72adac544f0d1d (patch) | |
| tree | 8928dfe6e2bf0c76ba87bd35d54364fb02d0aabb /clang/lib | |
| parent | f411071d636d83f37ee98ab6a7c36c4c48c8c622 (diff) | |
| download | bcm5719-llvm-367c86ddbec5d9da7c4390f0df72adac544f0d1d.tar.gz bcm5719-llvm-367c86ddbec5d9da7c4390f0df72adac544f0d1d.zip | |
[AVX-512] Replace subvector broadcast builtins with shufflevectors and selects.
Verified that the backend codegens this equally well.
llvm-svn: 292329
Diffstat (limited to 'clang/lib')
| -rw-r--r-- | clang/lib/Headers/avx512dqintrin.h | 95 | ||||
| -rw-r--r-- | clang/lib/Headers/avx512fintrin.h | 102 | ||||
| -rw-r--r-- | clang/lib/Headers/avx512vldqintrin.h | 42 | ||||
| -rw-r--r-- | clang/lib/Headers/avx512vlintrin.h | 45 |
4 files changed, 133 insertions, 151 deletions
diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h index ae44b98a949..4fd1add7735 100644 --- a/clang/lib/Headers/avx512dqintrin.h +++ b/clang/lib/Headers/avx512dqintrin.h @@ -995,51 +995,50 @@ _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_broadcast_f32x8 (__m256 __A) +_mm512_broadcast_f32x8(__m256 __A) { - return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, - _mm512_undefined_ps(), - (__mmask16) -1); + return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A, + 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A) +_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) { - return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, - (__v16sf)__O, - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask8)__M, + (__v16sf)_mm512_broadcast_f32x8(__A), + (__v16sf)__O); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A) +_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) { - return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, - (__v16sf)_mm512_setzero_ps (), - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask8)__M, + (__v16sf)_mm512_broadcast_f32x8(__A), + (__v16sf)_mm512_setzero_ps()); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_broadcast_f64x2 (__m128d __A) +_mm512_broadcast_f64x2(__m128d __A) { - return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A, - (__v8df)_mm512_undefined_pd(), - (__mmask8) -1); + return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A) +_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) { - return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A, - (__v8df) - __O, __M); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x2(__A), + (__v8df)__O); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) +_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) { - return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A, - (__v8df)_mm512_setzero_ps (), - __M); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x2(__A), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1067,52 +1066,50 @@ _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcast_i32x8 (__m256i __A) +_mm512_broadcast_i32x8(__m256i __A) { - return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A, - (__v16si)_mm512_setzero_si512(), - (__mmask16) -1); + return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A, + 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A) +_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) { - return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A, - (__v16si)__O, - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M, + (__v16si)_mm512_broadcast_i32x8(__A), + (__v16si)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A) +_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) { - return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M, + (__v16si)_mm512_broadcast_i32x8(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcast_i64x2 (__m128i __A) +_mm512_broadcast_i64x2(__m128i __A) { - return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A, - (__v8di)_mm512_setzero_si512(), - (__mmask8) -1); + return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A) +_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A, - (__v8di) - __O, __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x2(__A), + (__v8di)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) +_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A, - (__v8di)_mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x2(__A), + (__v8di)_mm512_setzero_si512()); } #define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \ diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index e6a7217c896..7d9a52fdf23 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -7278,107 +7278,97 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) (__mmask8)(U), (int)(R)); }) static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_broadcast_f32x4 (__m128 __A) +_mm512_broadcast_f32x4(__m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, + 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A) +_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, - (__v16sf) __O, - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x4(__A), + (__v16sf)__O); } static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A) +_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, - (__v16sf) - _mm512_setzero_ps (), - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x4(__A), + (__v16sf)_mm512_setzero_ps()); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_broadcast_f64x4 (__m256d __A) +_mm512_broadcast_f64x4(__m256d __A) { - return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A) +_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) { - return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, - (__v8df) __O, - __M); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x4(__A), + (__v8df)__O); } static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A) +_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) { - return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, - (__v8df) - _mm512_setzero_pd (), - __M); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x4(__A), + (__v8df)_mm512_setzero_pd()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcast_i32x4 (__m128i __A) +_mm512_broadcast_i32x4(__m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, - (__v16si) - _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, + 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A) +_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, - (__v16si) __O, - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x4(__A), + (__v16si)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A) +_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x4(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcast_i64x4 (__m256i __A) +_mm512_broadcast_i64x4(__m256i __A) { - return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, - (__v8di) - _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A) +_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) { - return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, - (__v8di) __O, - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x4(__A), + (__v8di)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A) +_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) { - return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x4(__A), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512d __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h index cd9da437056..aecd7df34d0 100644 --- a/clang/lib/Headers/avx512vldqintrin.h +++ b/clang/lib/Headers/avx512vldqintrin.h @@ -1000,27 +1000,26 @@ _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_broadcast_f64x2 (__m128d __A) +_mm256_broadcast_f64x2(__m128d __A) { - return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A, - (__v4df)_mm256_undefined_pd(), - (__mmask8) -1); + return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, + 0, 1, 0, 1); } static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A) +_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) { - return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A, - (__v4df) __O, - __M); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, + (__v4df)_mm256_broadcast_f64x2(__A), + (__v4df)__O); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) { - return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A, - (__v4df) _mm256_setzero_ps (), - __M); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, + (__v4df)_mm256_broadcast_f64x2(__A), + (__v4df)_mm256_setzero_pd()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -1072,27 +1071,26 @@ _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_broadcast_i64x2 (__m128i __A) +_mm256_broadcast_i64x2(__m128i __A) { - return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A, - (__v4di)_mm256_undefined_si256(), - (__mmask8) -1); + return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, + 0, 1, 0, 1); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A) +_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A, - (__v4di) __O, - __M); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_broadcast_i64x2(__A), + (__v4di)__O); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A, - (__v4di) _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_broadcast_i64x2(__A), + (__v4di)_mm256_setzero_si256()); } #define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \ diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index f3744da6ab8..99bb050de4d 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -7189,52 +7189,49 @@ _mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A) } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_broadcast_f32x4 (__m128 __A) +_mm256_broadcast_f32x4(__m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, - (__v8sf)_mm256_undefined_pd (), - (__mmask8) -1); + return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A) +_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, - (__v8sf) __O, - __M); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x4(__A), + (__v8sf)__O); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, - (__v8sf) _mm256_setzero_ps (), - __M); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x4(__A), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_broadcast_i32x4 (__m128i __A) +_mm256_broadcast_i32x4(__m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A, - (__v8si)_mm256_undefined_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, + 0, 1, 2, 3, 0, 1, 2, 3); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A) +_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A, - (__v8si) - __O, __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x4(__A), + (__v8si)__O); } static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A) +_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) - __A, - (__v8si) _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x4(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256d __DEFAULT_FN_ATTRS |

