diff options
author    | Craig Topper <craig.topper@intel.com> | 2018-06-01 18:26:35 +0000
committer | Craig Topper <craig.topper@intel.com> | 2018-06-01 18:26:35 +0000
commit    | d521d16ba435741a878c9be7208b3330c69fc3d9 (patch)
tree      | 7fec330ae32b328ce88dc5cfea96fb78ff37ca16
parent    | 4cb54e96f9856ce2956995292cfbbda3e67d5771 (diff)
download  | bcm5719-llvm-d521d16ba435741a878c9be7208b3330c69fc3d9.tar.gz
          | bcm5719-llvm-d521d16ba435741a878c9be7208b3330c69fc3d9.zip
[X86] Rewrite avx512vbmi unmasked and maskz macro intrinsics to be wrappers around their __builtin function with appropriate arguments rather than just passing arguments to the masked intrinsic.
This is more consistent with all of our other avx512 macro intrinsics.
It also fixes a bad cast where an argument was cast to __mmask8 when it should have been a __mmask16.
llvm-svn: 333778
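To see why that cast matters: the epi16 forms of the 256-bit shift macros operate on 16 lanes and need every bit of a __mmask16, but (__mmask8)(-1) truncates to 0xFF before widening back, silently deselecting lanes 8-15. A minimal sketch of the truncation (plain C; the mask typedefs here are stand-ins for the real __mmask8/__mmask16, not taken from the headers):

#include <stdint.h>
#include <stdio.h>

typedef uint8_t  mask8_t;   /* stand-in for __mmask8  */
typedef uint16_t mask16_t;  /* stand-in for __mmask16 */

int main(void) {
  mask16_t intended = (mask16_t)-1;            /* 0xFFFF: all 16 lanes selected */
  mask16_t widened  = (mask16_t)(mask8_t)-1;   /* the old (__mmask8)(-1) path: 0x00FF */
  printf("intended 0x%04X, after bad cast 0x%04X\n",
         (unsigned)intended, (unsigned)widened); /* mask bits for lanes 8-15 lost */
  return 0;
}

With (__mmask16)-1, the unmasked macros once again select all 16 lanes instead of leaving lanes 8-15 to the undefined passthrough operand.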
-rw-r--r-- | clang/lib/Headers/avx512vbmi2intrin.h   |  72
-rw-r--r-- | clang/lib/Headers/avx512vlvbmi2intrin.h | 144
2 files changed, 180 insertions(+), 36 deletions(-)
diff --git a/clang/lib/Headers/avx512vbmi2intrin.h b/clang/lib/Headers/avx512vbmi2intrin.h
index c19350ebfa8..e0ada4c16f3 100644
--- a/clang/lib/Headers/avx512vbmi2intrin.h
+++ b/clang/lib/Headers/avx512vbmi2intrin.h
@@ -150,10 +150,18 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm512_maskz_shldi_epi64(U, A, B, I) \
-  _mm512_mask_shldi_epi64(_mm512_setzero_si512(), (U), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v8di)_mm512_setzero_si512(), \
+                                          (__mmask8)(U))
 
 #define _mm512_shldi_epi64(A, B, I) \
-  _mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v8di)_mm512_undefined_epi32(), \
+                                          (__mmask8)-1)
 
 #define _mm512_mask_shldi_epi32(S, U, A, B, I) \
   (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(__m512i)(A), \
@@ -163,10 +171,18 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
                                          (__mmask16)(U))
 
 #define _mm512_maskz_shldi_epi32(U, A, B, I) \
-  _mm512_mask_shldi_epi32(_mm512_setzero_si512(), (U), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v16si)_mm512_setzero_si512(), \
+                                          (__mmask16)(U))
 
 #define _mm512_shldi_epi32(A, B, I) \
-  _mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v16si)_mm512_undefined_epi32(), \
+                                          (__mmask16)-1)
 
 #define _mm512_mask_shldi_epi16(S, U, A, B, I) \
   (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(__m512i)(A), \
@@ -176,10 +192,18 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
                                          (__mmask32)(U))
 
 #define _mm512_maskz_shldi_epi16(U, A, B, I) \
-  _mm512_mask_shldi_epi16(_mm512_setzero_si512(), (U), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(__m512i)(A), \
+                                          (__v32hi)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v32hi)_mm512_setzero_si512(), \
+                                          (__mmask32)(U))
 
 #define _mm512_shldi_epi16(A, B, I) \
-  _mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(__m512i)(A), \
+                                          (__v32hi)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v32hi)_mm512_undefined_epi32(), \
+                                          (__mmask32)-1)
 
 #define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
   (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(__m512i)(A), \
@@ -189,10 +213,18 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm512_maskz_shrdi_epi64(U, A, B, I) \
-  _mm512_mask_shrdi_epi64(_mm512_setzero_si512(), (U), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v8di)_mm512_setzero_si512(), \
+                                          (__mmask8)(U))
 
 #define _mm512_shrdi_epi64(A, B, I) \
-  _mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(__m512i)(A), \
+                                          (__v8di)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v8di)_mm512_undefined_epi32(), \
+                                          (__mmask8)-1)
 
 #define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
   (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(__m512i)(A), \
@@ -202,10 +234,18 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
                                          (__mmask16)(U))
 
 #define _mm512_maskz_shrdi_epi32(U, A, B, I) \
-  _mm512_mask_shrdi_epi32(_mm512_setzero_si512(), (U), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v16si)_mm512_setzero_si512(), \
+                                          (__mmask16)(U))
 
 #define _mm512_shrdi_epi32(A, B, I) \
-  _mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(__m512i)(A), \
+                                          (__v16si)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v16si)_mm512_undefined_epi32(), \
+                                          (__mmask16)-1)
 
 #define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
   (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(__m512i)(A), \
@@ -215,10 +255,18 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
                                          (__mmask32)(U))
 
 #define _mm512_maskz_shrdi_epi16(U, A, B, I) \
-  _mm512_mask_shrdi_epi16(_mm512_setzero_si512(), (U), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(__m512i)(A), \
+                                          (__v32hi)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v32hi)_mm512_setzero_si512(), \
+                                          (__mmask32)(U))
 
 #define _mm512_shrdi_epi16(A, B, I) \
-  _mm512_mask_shrdi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(__m512i)(A), \
+                                          (__v32hi)(__m512i)(B), \
+                                          (int)(I), \
+                                          (__v32hi)_mm512_undefined_epi32(), \
+                                          (__mmask32)-1)
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
diff --git a/clang/lib/Headers/avx512vlvbmi2intrin.h b/clang/lib/Headers/avx512vlvbmi2intrin.h
index 94fac5117ff..6dde1e9643e 100644
--- a/clang/lib/Headers/avx512vlvbmi2intrin.h
+++ b/clang/lib/Headers/avx512vlvbmi2intrin.h
@@ -259,10 +259,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm256_maskz_shldi_epi64(U, A, B, I) \
-  _mm256_mask_shldi_epi64(_mm256_setzero_si256(), (U), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(__m256i)(A), \
+                                          (__v4di)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v4di)_mm256_setzero_si256(), \
+                                          (__mmask8)(U))
 
 #define _mm256_shldi_epi64(A, B, I) \
-  _mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(__m256i)(A), \
+                                          (__v4di)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v4di)_mm256_undefined_si256(), \
+                                          (__mmask8)-1)
 
 #define _mm_mask_shldi_epi64(S, U, A, B, I) \
   (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(__m128i)(A), \
@@ -272,10 +280,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm_maskz_shldi_epi64(U, A, B, I) \
-  _mm_mask_shldi_epi64(_mm_setzero_si128(), (U), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(__m128i)(A), \
+                                          (__v2di)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v2di)_mm_setzero_si128(), \
+                                          (__mmask8)(U))
 
 #define _mm_shldi_epi64(A, B, I) \
-  _mm_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(__m128i)(A), \
+                                          (__v2di)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v2di)_mm_undefined_si128(), \
+                                          (__mmask8)-1)
 
 #define _mm256_mask_shldi_epi32(S, U, A, B, I) \
   (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(__m256i)(A), \
@@ -285,10 +301,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm256_maskz_shldi_epi32(U, A, B, I) \
-  _mm256_mask_shldi_epi32(_mm256_setzero_si256(), (U), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(__m256i)(A), \
+                                          (__v8si)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v8si)_mm256_setzero_si256(), \
+                                          (__mmask8)(U))
 
 #define _mm256_shldi_epi32(A, B, I) \
-  _mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(__m256i)(A), \
+                                          (__v8si)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v8si)_mm256_undefined_si256(), \
+                                          (__mmask8)-1)
 
 #define _mm_mask_shldi_epi32(S, U, A, B, I) \
   (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(__m128i)(A), \
@@ -298,10 +322,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm_maskz_shldi_epi32(U, A, B, I) \
-  _mm_mask_shldi_epi32(_mm_setzero_si128(), (U), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(__m128i)(A), \
+                                          (__v4si)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v4si)_mm_setzero_si128(), \
+                                          (__mmask8)(U))
 
 #define _mm_shldi_epi32(A, B, I) \
-  _mm_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(__m128i)(A), \
+                                          (__v4si)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v4si)_mm_undefined_si128(), \
+                                          (__mmask8)-1)
 
 #define _mm256_mask_shldi_epi16(S, U, A, B, I) \
   (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(__m256i)(A), \
@@ -311,10 +343,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask16)(U))
 
 #define _mm256_maskz_shldi_epi16(U, A, B, I) \
-  _mm256_mask_shldi_epi16(_mm256_setzero_si256(), (U), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(__m256i)(A), \
+                                          (__v16hi)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v16hi)_mm256_setzero_si256(), \
+                                          (__mmask16)(U))
 
 #define _mm256_shldi_epi16(A, B, I) \
-  _mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(__m256i)(A), \
+                                          (__v16hi)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v16hi)_mm256_undefined_si256(), \
+                                          (__mmask16)-1)
 
 #define _mm_mask_shldi_epi16(S, U, A, B, I) \
   (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(__m128i)(A), \
@@ -324,10 +364,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm_maskz_shldi_epi16(U, A, B, I) \
-  _mm_mask_shldi_epi16(_mm_setzero_si128(), (U), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(__m128i)(A), \
+                                          (__v8hi)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v8hi)_mm_setzero_si128(), \
+                                          (__mmask8)(U))
 
 #define _mm_shldi_epi16(A, B, I) \
-  _mm_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(__m128i)(A), \
+                                          (__v8hi)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v8hi)_mm_undefined_si128(), \
+                                          (__mmask8)-1)
 
 #define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
   (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(__m256i)(A), \
@@ -337,10 +385,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm256_maskz_shrdi_epi64(U, A, B, I) \
-  _mm256_mask_shrdi_epi64(_mm256_setzero_si256(), (U), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(__m256i)(A), \
+                                          (__v4di)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v4di)_mm256_setzero_si256(), \
+                                          (__mmask8)(U))
 
 #define _mm256_shrdi_epi64(A, B, I) \
-  _mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(__m256i)(A), \
+                                          (__v4di)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v4di)_mm256_undefined_si256(), \
+                                          (__mmask8)-1)
 
 #define _mm_mask_shrdi_epi64(S, U, A, B, I) \
   (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(__m128i)(A), \
@@ -350,10 +406,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm_maskz_shrdi_epi64(U, A, B, I) \
-  _mm_mask_shrdi_epi64(_mm_setzero_si128(), (U), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(__m128i)(A), \
+                                          (__v2di)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v2di)_mm_setzero_si128(), \
+                                          (__mmask8)(U))
 
 #define _mm_shrdi_epi64(A, B, I) \
-  _mm_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(__m128i)(A), \
+                                          (__v2di)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v2di)_mm_undefined_si128(), \
+                                          (__mmask8)-1)
 
 #define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
   (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(__m256i)(A), \
@@ -363,10 +427,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm256_maskz_shrdi_epi32(U, A, B, I) \
-  _mm256_mask_shrdi_epi32(_mm256_setzero_si256(), (U), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(__m256i)(A), \
+                                          (__v8si)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v8si)_mm256_setzero_si256(), \
+                                          (__mmask8)(U))
 
 #define _mm256_shrdi_epi32(A, B, I) \
-  _mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(__m256i)(A), \
+                                          (__v8si)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v8si)_mm256_undefined_si256(), \
+                                          (__mmask8)-1)
 
 #define _mm_mask_shrdi_epi32(S, U, A, B, I) \
   (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(__m128i)(A), \
@@ -376,10 +448,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm_maskz_shrdi_epi32(U, A, B, I) \
-  _mm_mask_shrdi_epi32(_mm_setzero_si128(), (U), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(__m128i)(A), \
+                                          (__v4si)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v4si)_mm_setzero_si128(), \
+                                          (__mmask8)(U))
 
 #define _mm_shrdi_epi32(A, B, I) \
-  _mm_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(__m128i)(A), \
+                                          (__v4si)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v4si)_mm_undefined_si128(), \
+                                          (__mmask8)-1)
 
 #define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
   (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(__m256i)(A), \
@@ -389,10 +469,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask16)(U))
 
 #define _mm256_maskz_shrdi_epi16(U, A, B, I) \
-  _mm256_mask_shrdi_epi16(_mm256_setzero_si256(), (U), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(__m256i)(A), \
+                                          (__v16hi)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v16hi)_mm256_setzero_si256(), \
+                                          (__mmask16)(U))
 
 #define _mm256_shrdi_epi16(A, B, I) \
-  _mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(__m256i)(A), \
+                                          (__v16hi)(__m256i)(B), \
+                                          (int)(I), \
+                                          (__v16hi)_mm256_undefined_si256(), \
+                                          (__mmask16)-1)
 
 #define _mm_mask_shrdi_epi16(S, U, A, B, I) \
   (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(__m128i)(A), \
@@ -402,10 +490,18 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
                                          (__mmask8)(U))
 
 #define _mm_maskz_shrdi_epi16(U, A, B, I) \
-  _mm_mask_shrdi_epi16(_mm_setzero_si128(), (U), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(__m128i)(A), \
+                                          (__v8hi)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v8hi)_mm_setzero_si128(), \
+                                          (__mmask8)(U))
 
 #define _mm_shrdi_epi16(A, B, I) \
-  _mm_mask_shrdi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+  (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(__m128i)(A), \
+                                          (__v8hi)(__m128i)(B), \
+                                          (int)(I), \
+                                          (__v8hi)_mm_undefined_si128(), \
+                                          (__mmask8)-1)
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U,
                         __m256i __A, __m256i __B)
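As a usage illustration of one of the rewritten macros, a minimal hedged sketch (assumes an AVX512VL+VBMI2 target and these headers, compiled with something like clang -O2 -mavx512vl -mavx512vbmi2; per the VPSHLDW definition, each 16-bit lane keeps the high half of (A:B) << imm):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m256i a = _mm256_set1_epi16(0x00F0);   /* high half of each concatenation */
  __m256i b = _mm256_set1_epi16(0x000F);   /* low half */
  /* (0x00F0:0x000F << 4) = 0x0F0000F0; high 16 bits = 0x0F00 in every lane. */
  __m256i r = _mm256_shldi_epi16(a, b, 4);

  unsigned short out[16];
  _mm256_storeu_si256((__m256i *)out, r);
  /* With the (__mmask16)-1 fix, lane 15 is computed rather than inherited
     from the undefined passthrough vector. */
  printf("lane 0 = 0x%04X, lane 15 = 0x%04X\n",
         (unsigned)out[0], (unsigned)out[15]);
  return 0;
}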