author    Craig Topper <craig.topper@intel.com>    2018-06-08 03:24:47 +0000
committer Craig Topper <craig.topper@intel.com>    2018-06-08 03:24:47 +0000
commit    3428beeb2f7753d98572eb04ccda4fb59b0b0af4 (patch)
tree      37a2ba731c313982eeb55517709d815266f1f545 /clang/lib
parent    010edd37f82270a71129a6cea30d01722c0ead35 (diff)
[X86] Add subvector insert and extract builtins to enable target feature checking and immediate range checking.
Test changes are due to differences in how we generate undef elements now. We also changed the types used for extractf128_si256/insertf128_si256 to match the signature of the builtin that previously existed, which this patch resurrects. This also matches gcc.

llvm-svn: 334261
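The user-visible effect of the two new checks, as a hedged sketch (the intrinsic names are real; the diagnostic wording is illustrative and varies by clang version):

#include <immintrin.h>

/* Target feature checking: the macro now expands to a target-specific
   builtin, so calling it from a function that lacks the feature is
   diagnosed ("needs target feature avx512dq" or similar).  The old
   pure __builtin_shufflevector expansion compiled without complaint. */
__attribute__((target("avx512dq")))
__m128d ok(__m512d v)  { return _mm512_extractf64x2_pd(v, 3); }

/* Immediate range checking: the index must now be a constant in
   [0, 3]; _mm512_extractf64x2_pd(v, 4) is rejected at compile time
   instead of being silently masked with "& 0x3". */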
Diffstat (limited to 'clang/lib')
 clang/lib/CodeGen/CGBuiltin.cpp      | 69
 clang/lib/Headers/avx2intrin.h       | 13
 clang/lib/Headers/avx512dqintrin.h   | 96
 clang/lib/Headers/avx512fintrin.h    | 92
 clang/lib/Headers/avx512vldqintrin.h | 26
 clang/lib/Headers/avx512vlintrin.h   | 38
 clang/lib/Headers/avxintrin.h        | 51
 clang/lib/Sema/SemaChecking.cpp      | 32
 8 files changed, 149 insertions(+), 268 deletions(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4331005cb35..025b34e809c 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9235,6 +9235,75 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy);
return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
}
+ case X86::BI__builtin_ia32_vextractf128_pd256:
+ case X86::BI__builtin_ia32_vextractf128_ps256:
+ case X86::BI__builtin_ia32_vextractf128_si256:
+ case X86::BI__builtin_ia32_extract128i256:
+ case X86::BI__builtin_ia32_extractf64x4:
+ case X86::BI__builtin_ia32_extractf32x4:
+ case X86::BI__builtin_ia32_extracti64x4:
+ case X86::BI__builtin_ia32_extracti32x4:
+ case X86::BI__builtin_ia32_extractf32x8:
+ case X86::BI__builtin_ia32_extracti32x8:
+ case X86::BI__builtin_ia32_extractf32x4_256:
+ case X86::BI__builtin_ia32_extracti32x4_256:
+ case X86::BI__builtin_ia32_extractf64x2_256:
+ case X86::BI__builtin_ia32_extracti64x2_256:
+ case X86::BI__builtin_ia32_extractf64x2_512:
+ case X86::BI__builtin_ia32_extracti64x2_512: {
+ llvm::Type *DstTy = ConvertType(E->getType());
+ unsigned NumElts = DstTy->getVectorNumElements();
+ unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue() * NumElts;
+
+ uint32_t Indices[16];
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = i + Index;
+
+ return Builder.CreateShuffleVector(Ops[0],
+ UndefValue::get(Ops[0]->getType()),
+ makeArrayRef(Indices, NumElts),
+ "extract");
+ }
+ case X86::BI__builtin_ia32_vinsertf128_pd256:
+ case X86::BI__builtin_ia32_vinsertf128_ps256:
+ case X86::BI__builtin_ia32_vinsertf128_si256:
+ case X86::BI__builtin_ia32_insert128i256:
+ case X86::BI__builtin_ia32_insertf64x4:
+ case X86::BI__builtin_ia32_insertf32x4:
+ case X86::BI__builtin_ia32_inserti64x4:
+ case X86::BI__builtin_ia32_inserti32x4:
+ case X86::BI__builtin_ia32_insertf32x8:
+ case X86::BI__builtin_ia32_inserti32x8:
+ case X86::BI__builtin_ia32_insertf32x4_256:
+ case X86::BI__builtin_ia32_inserti32x4_256:
+ case X86::BI__builtin_ia32_insertf64x2_256:
+ case X86::BI__builtin_ia32_inserti64x2_256:
+ case X86::BI__builtin_ia32_insertf64x2_512:
+ case X86::BI__builtin_ia32_inserti64x2_512: {
+ unsigned DstNumElts = Ops[0]->getType()->getVectorNumElements();
+ unsigned SrcNumElts = Ops[1]->getType()->getVectorNumElements();
+ unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue() * SrcNumElts;
+
+ uint32_t Indices[16];
+ for (unsigned i = 0; i != DstNumElts; ++i)
+ Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;
+
+ Value *Op1 = Builder.CreateShuffleVector(Ops[1],
+ UndefValue::get(Ops[1]->getType()),
+ makeArrayRef(Indices, DstNumElts),
+ "widen");
+
+ for (unsigned i = 0; i != DstNumElts; ++i) {
+ if (i >= Index && i < (Index + SrcNumElts))
+ Indices[i] = (i - Index) + DstNumElts;
+ else
+ Indices[i] = i;
+ }
+
+ return Builder.CreateShuffleVector(Ops[0], Op1,
+ makeArrayRef(Indices, DstNumElts),
+ "insert");
+ }
case X86::BI__builtin_ia32_pblendw128:
case X86::BI__builtin_ia32_blendpd:
case X86::BI__builtin_ia32_blendps:
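A standalone sketch of the index arithmetic the extract/insert lowering above performs (illustration only, not the CodeGen code itself; element counts correspond to, e.g., _mm512_insertf32x4 with imm = 1):

#include <stdio.h>

int main(void) {
  const unsigned DstNumElts = 16, SrcNumElts = 4, imm = 1;
  const unsigned Index = imm * SrcNumElts;  /* starting lane = 4 */
  unsigned Mask[16];
  unsigned i;

  /* First shuffle: widen the 4-element source to 16 lanes.  Indices
     >= SrcNumElts select the undef second operand, so only lanes
     0..3 carry defined data. */
  for (i = 0; i != DstNumElts; ++i)
    Mask[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;
  for (i = 0; i != DstNumElts; ++i) printf("%u ", Mask[i]);
  printf(" (widen)\n");   /* 0 1 2 3 4 5 6 7 4 5 6 7 4 5 6 7 */

  /* Second shuffle: keep the destination everywhere except
     [Index, Index+SrcNumElts), which reads the widened source
     (offset by DstNumElts, i.e. the second shuffle operand). */
  for (i = 0; i != DstNumElts; ++i)
    Mask[i] = (i >= Index && i < Index + SrcNumElts)
                  ? (i - Index) + DstNumElts : i;
  for (i = 0; i != DstNumElts; ++i) printf("%u ", Mask[i]);
  printf(" (insert)\n");  /* 0 1 2 3 16 17 18 19 8 9 10 11 12 13 14 15 */
  return 0;
}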
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index d1c530693bf..3867af08ccc 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -881,18 +881,11 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
(__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M))
#define _mm256_extracti128_si256(V, M) \
- (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
- (__v4di)_mm256_undefined_si256(), \
- (((M) & 1) ? 2 : 0), \
- (((M) & 1) ? 3 : 1) )
+ (__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))
#define _mm256_inserti128_si256(V1, V2, M) \
- (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V1), \
- (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
- (((M) & 1) ? 0 : 4), \
- (((M) & 1) ? 1 : 5), \
- (((M) & 1) ? 4 : 2), \
- (((M) & 1) ? 5 : 3) )
+ (__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
+ (__v2di)(__m128i)(V2), (int)(M))
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskload_epi32(int const *__X, __m256i __M)
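Usage of the rewritten AVX2 macros is unchanged; only the expansion differs. A small example (compile with -mavx2):

#include <immintrin.h>

/* Swap the 128-bit halves of a 256-bit integer vector using the
   extract/insert macros that now expand to the new builtins. */
__m256i swap_halves(__m256i v) {
  __m128i lo = _mm256_extracti128_si256(v, 0);
  __m128i hi = _mm256_extracti128_si256(v, 1);
  __m256i r  = _mm256_inserti128_si256(v, hi, 0);
  return _mm256_inserti128_si256(r, lo, 1);
}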
diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h
index 9ef78041d1c..fbb4bbce8df 100644
--- a/clang/lib/Headers/avx512dqintrin.h
+++ b/clang/lib/Headers/avx512dqintrin.h
@@ -1103,16 +1103,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
}
#define _mm512_extractf32x8_ps(A, imm) \
- (__m256)__builtin_shufflevector((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_undefined_ps(), \
- ((imm) & 1) ? 8 : 0, \
- ((imm) & 1) ? 9 : 1, \
- ((imm) & 1) ? 10 : 2, \
- ((imm) & 1) ? 11 : 3, \
- ((imm) & 1) ? 12 : 4, \
- ((imm) & 1) ? 13 : 5, \
- ((imm) & 1) ? 14 : 6, \
- ((imm) & 1) ? 15 : 7)
+ (__m256)__builtin_ia32_extractf32x8((__v16sf)(__m512)(A), (int)(imm))
#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
(__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
@@ -1125,10 +1116,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
(__v8sf)_mm256_setzero_ps())
#define _mm512_extractf64x2_pd(A, imm) \
- (__m128d)__builtin_shufflevector((__v8df)(__m512d)(A), \
- (__v8df)_mm512_undefined_pd(), \
- 0 + ((imm) & 0x3) * 2, \
- 1 + ((imm) & 0x3) * 2)
+ (__m128d)__builtin_ia32_extractf64x2_512((__v8df)(__m512d)(A), (int)(imm))
#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
(__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
@@ -1141,16 +1129,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
(__v2df)_mm_setzero_pd())
#define _mm512_extracti32x8_epi32(A, imm) \
- (__m256i)__builtin_shufflevector((__v16si)(__m512i)(A), \
- (__v16si)_mm512_undefined_epi32(), \
- ((imm) & 1) ? 8 : 0, \
- ((imm) & 1) ? 9 : 1, \
- ((imm) & 1) ? 10 : 2, \
- ((imm) & 1) ? 11 : 3, \
- ((imm) & 1) ? 12 : 4, \
- ((imm) & 1) ? 13 : 5, \
- ((imm) & 1) ? 14 : 6, \
- ((imm) & 1) ? 15 : 7)
+ (__m256i)__builtin_ia32_extracti32x8((__v16si)(__m512i)(A), (int)(imm))
#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
@@ -1163,10 +1142,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
(__v8si)_mm256_setzero_si256())
#define _mm512_extracti64x2_epi64(A, imm) \
- (__m128i)__builtin_shufflevector((__v8di)(__m512i)(A), \
- (__v8di)_mm512_undefined_epi32(), \
- 0 + ((imm) & 0x3) * 2, \
- 1 + ((imm) & 0x3) * 2)
+ (__m128i)__builtin_ia32_extracti64x2_512((__v8di)(__m512i)(A), (int)(imm))
#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
(__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \
@@ -1179,24 +1155,8 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
(__v2di)_mm_setzero_si128())
#define _mm512_insertf32x8(A, B, imm) \
- (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_castps256_ps512((__m256)(B)),\
- ((imm) & 0x1) ? 0 : 16, \
- ((imm) & 0x1) ? 1 : 17, \
- ((imm) & 0x1) ? 2 : 18, \
- ((imm) & 0x1) ? 3 : 19, \
- ((imm) & 0x1) ? 4 : 20, \
- ((imm) & 0x1) ? 5 : 21, \
- ((imm) & 0x1) ? 6 : 22, \
- ((imm) & 0x1) ? 7 : 23, \
- ((imm) & 0x1) ? 16 : 8, \
- ((imm) & 0x1) ? 17 : 9, \
- ((imm) & 0x1) ? 18 : 10, \
- ((imm) & 0x1) ? 19 : 11, \
- ((imm) & 0x1) ? 20 : 12, \
- ((imm) & 0x1) ? 21 : 13, \
- ((imm) & 0x1) ? 22 : 14, \
- ((imm) & 0x1) ? 23 : 15)
+ (__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \
+ (__v8sf)(__m256)(B), (int)(imm))
#define _mm512_mask_insertf32x8(W, U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
@@ -1209,16 +1169,8 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
(__v16sf)_mm512_setzero_ps())
#define _mm512_insertf64x2(A, B, imm) \
- (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
- (__v8df)_mm512_castpd128_pd512((__m128d)(B)),\
- (((imm) & 0x3) == 0) ? 8 : 0, \
- (((imm) & 0x3) == 0) ? 9 : 1, \
- (((imm) & 0x3) == 1) ? 8 : 2, \
- (((imm) & 0x3) == 1) ? 9 : 3, \
- (((imm) & 0x3) == 2) ? 8 : 4, \
- (((imm) & 0x3) == 2) ? 9 : 5, \
- (((imm) & 0x3) == 3) ? 8 : 6, \
- (((imm) & 0x3) == 3) ? 9 : 7)
+ (__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \
+ (__v2df)(__m128d)(B), (int)(imm))
#define _mm512_mask_insertf64x2(W, U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
@@ -1231,24 +1183,8 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
(__v8df)_mm512_setzero_pd())
#define _mm512_inserti32x8(A, B, imm) \
- (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
- (__v16si)_mm512_castsi256_si512((__m256i)(B)),\
- ((imm) & 0x1) ? 0 : 16, \
- ((imm) & 0x1) ? 1 : 17, \
- ((imm) & 0x1) ? 2 : 18, \
- ((imm) & 0x1) ? 3 : 19, \
- ((imm) & 0x1) ? 4 : 20, \
- ((imm) & 0x1) ? 5 : 21, \
- ((imm) & 0x1) ? 6 : 22, \
- ((imm) & 0x1) ? 7 : 23, \
- ((imm) & 0x1) ? 16 : 8, \
- ((imm) & 0x1) ? 17 : 9, \
- ((imm) & 0x1) ? 18 : 10, \
- ((imm) & 0x1) ? 19 : 11, \
- ((imm) & 0x1) ? 20 : 12, \
- ((imm) & 0x1) ? 21 : 13, \
- ((imm) & 0x1) ? 22 : 14, \
- ((imm) & 0x1) ? 23 : 15)
+ (__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \
+ (__v8si)(__m256i)(B), (int)(imm))
#define _mm512_mask_inserti32x8(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
@@ -1261,16 +1197,8 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
(__v16si)_mm512_setzero_si512())
#define _mm512_inserti64x2(A, B, imm) \
- (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
- (__v8di)_mm512_castsi128_si512((__m128i)(B)),\
- (((imm) & 0x3) == 0) ? 8 : 0, \
- (((imm) & 0x3) == 0) ? 9 : 1, \
- (((imm) & 0x3) == 1) ? 8 : 2, \
- (((imm) & 0x3) == 1) ? 9 : 3, \
- (((imm) & 0x3) == 2) ? 8 : 4, \
- (((imm) & 0x3) == 2) ? 9 : 5, \
- (((imm) & 0x3) == 3) ? 8 : 6, \
- (((imm) & 0x3) == 3) ? 9 : 7)
+ (__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \
+ (__v2di)(__m128i)(B), (int)(imm))
#define _mm512_mask_inserti64x2(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index ccc445a6acd..4ae235e6330 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -3494,12 +3494,7 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
/* Vector Extract */
#define _mm512_extractf64x4_pd(A, I) \
- (__m256d)__builtin_shufflevector((__v8df)(__m512d)(A), \
- (__v8df)_mm512_undefined_pd(), \
- ((I) & 1) ? 4 : 0, \
- ((I) & 1) ? 5 : 1, \
- ((I) & 1) ? 6 : 2, \
- ((I) & 1) ? 7 : 3)
+ (__m256d)__builtin_ia32_extractf64x4((__v8df)(__m512d)(A), (int)(I))
#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
@@ -3512,12 +3507,7 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
(__v4df)_mm256_setzero_pd())
#define _mm512_extractf32x4_ps(A, I) \
- (__m128)__builtin_shufflevector((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_undefined_ps(), \
- 0 + ((I) & 0x3) * 4, \
- 1 + ((I) & 0x3) * 4, \
- 2 + ((I) & 0x3) * 4, \
- 3 + ((I) & 0x3) * 4)
+ (__m128)__builtin_ia32_extractf32x4((__v16sf)(__m512)(A), (int)(I))
#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
(__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
@@ -7544,12 +7534,7 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
}
#define _mm512_extracti32x4_epi32(A, imm) \
- (__m128i)__builtin_shufflevector((__v16si)(__m512i)(A), \
- (__v16si)_mm512_undefined_epi32(), \
- 0 + ((imm) & 0x3) * 4, \
- 1 + ((imm) & 0x3) * 4, \
- 2 + ((imm) & 0x3) * 4, \
- 3 + ((imm) & 0x3) * 4)
+ (__m128i)__builtin_ia32_extracti32x4((__v16si)(__m512i)(A), (int)(imm))
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
@@ -7562,12 +7547,7 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
(__v4si)_mm_setzero_si128())
#define _mm512_extracti64x4_epi64(A, imm) \
- (__m256i)__builtin_shufflevector((__v8di)(__m512i)(A), \
- (__v8di)_mm512_undefined_epi32(), \
- ((imm) & 1) ? 4 : 0, \
- ((imm) & 1) ? 5 : 1, \
- ((imm) & 1) ? 6 : 2, \
- ((imm) & 1) ? 7 : 3)
+ (__m256i)__builtin_ia32_extracti64x4((__v8di)(__m512i)(A), (int)(imm))
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
@@ -7580,16 +7560,8 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
(__v4di)_mm256_setzero_si256())
#define _mm512_insertf64x4(A, B, imm) \
- (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
- (__v8df)_mm512_castpd256_pd512((__m256d)(B)), \
- ((imm) & 0x1) ? 0 : 8, \
- ((imm) & 0x1) ? 1 : 9, \
- ((imm) & 0x1) ? 2 : 10, \
- ((imm) & 0x1) ? 3 : 11, \
- ((imm) & 0x1) ? 8 : 4, \
- ((imm) & 0x1) ? 9 : 5, \
- ((imm) & 0x1) ? 10 : 6, \
- ((imm) & 0x1) ? 11 : 7)
+ (__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
+ (__v4df)(__m256d)(B), (int)(imm))
#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
@@ -7602,16 +7574,8 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
(__v8df)_mm512_setzero_pd())
#define _mm512_inserti64x4(A, B, imm) \
- (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
- (__v8di)_mm512_castsi256_si512((__m256i)(B)), \
- ((imm) & 0x1) ? 0 : 8, \
- ((imm) & 0x1) ? 1 : 9, \
- ((imm) & 0x1) ? 2 : 10, \
- ((imm) & 0x1) ? 3 : 11, \
- ((imm) & 0x1) ? 8 : 4, \
- ((imm) & 0x1) ? 9 : 5, \
- ((imm) & 0x1) ? 10 : 6, \
- ((imm) & 0x1) ? 11 : 7)
+ (__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \
+ (__v4di)(__m256i)(B), (int)(imm))
#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
@@ -7624,24 +7588,8 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
(__v8di)_mm512_setzero_si512())
#define _mm512_insertf32x4(A, B, imm) \
- (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_castps128_ps512((__m128)(B)),\
- (((imm) & 0x3) == 0) ? 16 : 0, \
- (((imm) & 0x3) == 0) ? 17 : 1, \
- (((imm) & 0x3) == 0) ? 18 : 2, \
- (((imm) & 0x3) == 0) ? 19 : 3, \
- (((imm) & 0x3) == 1) ? 16 : 4, \
- (((imm) & 0x3) == 1) ? 17 : 5, \
- (((imm) & 0x3) == 1) ? 18 : 6, \
- (((imm) & 0x3) == 1) ? 19 : 7, \
- (((imm) & 0x3) == 2) ? 16 : 8, \
- (((imm) & 0x3) == 2) ? 17 : 9, \
- (((imm) & 0x3) == 2) ? 18 : 10, \
- (((imm) & 0x3) == 2) ? 19 : 11, \
- (((imm) & 0x3) == 3) ? 16 : 12, \
- (((imm) & 0x3) == 3) ? 17 : 13, \
- (((imm) & 0x3) == 3) ? 18 : 14, \
- (((imm) & 0x3) == 3) ? 19 : 15)
+ (__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \
+ (__v4sf)(__m128)(B), (int)(imm))
#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
@@ -7654,24 +7602,8 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
(__v16sf)_mm512_setzero_ps())
#define _mm512_inserti32x4(A, B, imm) \
- (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
- (__v16si)_mm512_castsi128_si512((__m128i)(B)),\
- (((imm) & 0x3) == 0) ? 16 : 0, \
- (((imm) & 0x3) == 0) ? 17 : 1, \
- (((imm) & 0x3) == 0) ? 18 : 2, \
- (((imm) & 0x3) == 0) ? 19 : 3, \
- (((imm) & 0x3) == 1) ? 16 : 4, \
- (((imm) & 0x3) == 1) ? 17 : 5, \
- (((imm) & 0x3) == 1) ? 18 : 6, \
- (((imm) & 0x3) == 1) ? 19 : 7, \
- (((imm) & 0x3) == 2) ? 16 : 8, \
- (((imm) & 0x3) == 2) ? 17 : 9, \
- (((imm) & 0x3) == 2) ? 18 : 10, \
- (((imm) & 0x3) == 2) ? 19 : 11, \
- (((imm) & 0x3) == 3) ? 16 : 12, \
- (((imm) & 0x3) == 3) ? 17 : 13, \
- (((imm) & 0x3) == 3) ? 18 : 14, \
- (((imm) & 0x3) == 3) ? 19 : 15)
+ (__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \
+ (__v4si)(__m128i)(B), (int)(imm))
#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h
index 60571adb5e9..8d6ff3ec8d9 100644
--- a/clang/lib/Headers/avx512vldqintrin.h
+++ b/clang/lib/Headers/avx512vldqintrin.h
@@ -1083,10 +1083,7 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
}
#define _mm256_extractf64x2_pd(A, imm) \
- (__m128d)__builtin_shufflevector((__v4df)(__m256d)(A), \
- (__v4df)_mm256_undefined_pd(), \
- ((imm) & 1) ? 2 : 0, \
- ((imm) & 1) ? 3 : 1)
+ (__m128d)__builtin_ia32_extractf64x2_256((__v4df)(__m256d)(A), (int)(imm))
#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
(__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
@@ -1099,10 +1096,7 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
(__v2df)_mm_setzero_pd())
#define _mm256_extracti64x2_epi64(A, imm) \
- (__m128i)__builtin_shufflevector((__v4di)(__m256i)(A), \
- (__v4di)_mm256_undefined_si256(), \
- ((imm) & 1) ? 2 : 0, \
- ((imm) & 1) ? 3 : 1)
+ (__m128i)__builtin_ia32_extracti64x2_256((__v4di)(__m256i)(A), (int)(imm))
#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
@@ -1115,12 +1109,8 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
(__v2di)_mm_setzero_si128())
#define _mm256_insertf64x2(A, B, imm) \
- (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
- (__v4df)_mm256_castpd128_pd256((__m128d)(B)), \
- ((imm) & 0x1) ? 0 : 4, \
- ((imm) & 0x1) ? 1 : 5, \
- ((imm) & 0x1) ? 4 : 2, \
- ((imm) & 0x1) ? 5 : 3)
+ (__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
+ (__v2df)(__m128d)(B), (int)(imm))
#define _mm256_mask_insertf64x2(W, U, A, B, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
@@ -1133,12 +1123,8 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
(__v4df)_mm256_setzero_pd())
#define _mm256_inserti64x2(A, B, imm) \
- (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \
- (__v4di)_mm256_castsi128_si256((__m128i)(B)), \
- ((imm) & 0x1) ? 0 : 4, \
- ((imm) & 0x1) ? 1 : 5, \
- ((imm) & 0x1) ? 4 : 2, \
- ((imm) & 0x1) ? 5 : 3)
+ (__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
+ (__v2di)(__m128i)(B), (int)(imm))
#define _mm256_mask_inserti64x2(W, U, A, B, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h
index 49198ec5336..d1e81528cde 100644
--- a/clang/lib/Headers/avx512vlintrin.h
+++ b/clang/lib/Headers/avx512vlintrin.h
@@ -7699,12 +7699,7 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
}
#define _mm256_extractf32x4_ps(A, imm) \
- (__m128)__builtin_shufflevector((__v8sf)(__m256)(A), \
- (__v8sf)_mm256_undefined_ps(), \
- ((imm) & 1) ? 4 : 0, \
- ((imm) & 1) ? 5 : 1, \
- ((imm) & 1) ? 6 : 2, \
- ((imm) & 1) ? 7 : 3)
+ (__m128)__builtin_ia32_extractf32x4_256((__v8sf)(__m256)(A), (int)(imm))
#define _mm256_mask_extractf32x4_ps(W, U, A, imm) \
(__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
@@ -7717,12 +7712,7 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
(__v4sf)_mm_setzero_ps())
#define _mm256_extracti32x4_epi32(A, imm) \
- (__m128i)__builtin_shufflevector((__v8si)(__m256)(A), \
- (__v8si)_mm256_undefined_si256(), \
- ((imm) & 1) ? 4 : 0, \
- ((imm) & 1) ? 5 : 1, \
- ((imm) & 1) ? 6 : 2, \
- ((imm) & 1) ? 7 : 3)
+ (__m128i)__builtin_ia32_extracti32x4_256((__v8si)(__m256i)(A), (int)(imm))
#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
@@ -7735,16 +7725,8 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
(__v4si)_mm_setzero_si128())
#define _mm256_insertf32x4(A, B, imm) \
- (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
- (__v8sf)_mm256_castps128_ps256((__m128)(B)), \
- ((imm) & 0x1) ? 0 : 8, \
- ((imm) & 0x1) ? 1 : 9, \
- ((imm) & 0x1) ? 2 : 10, \
- ((imm) & 0x1) ? 3 : 11, \
- ((imm) & 0x1) ? 8 : 4, \
- ((imm) & 0x1) ? 9 : 5, \
- ((imm) & 0x1) ? 10 : 6, \
- ((imm) & 0x1) ? 11 : 7)
+ (__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \
+ (__v4sf)(__m128)(B), (int)(imm))
#define _mm256_mask_insertf32x4(W, U, A, B, imm) \
(__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
@@ -7757,16 +7739,8 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
(__v8sf)_mm256_setzero_ps())
#define _mm256_inserti32x4(A, B, imm) \
- (__m256i)__builtin_shufflevector((__v8si)(__m256i)(A), \
- (__v8si)_mm256_castsi128_si256((__m128i)(B)), \
- ((imm) & 0x1) ? 0 : 8, \
- ((imm) & 0x1) ? 1 : 9, \
- ((imm) & 0x1) ? 2 : 10, \
- ((imm) & 0x1) ? 3 : 11, \
- ((imm) & 0x1) ? 8 : 4, \
- ((imm) & 0x1) ? 9 : 5, \
- ((imm) & 0x1) ? 10 : 6, \
- ((imm) & 0x1) ? 11 : 7)
+ (__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \
+ (__v4si)(__m128i)(B), (int)(imm))
#define _mm256_mask_inserti32x4(W, U, A, B, imm) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 7c85893ba13..6c42132cf44 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -4613,17 +4613,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) \
- (__m256)__builtin_shufflevector( \
- (__v8sf)(__m256)(V1), \
- (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
- (((M) & 1) ? 0 : 8), \
- (((M) & 1) ? 1 : 9), \
- (((M) & 1) ? 2 : 10), \
- (((M) & 1) ? 3 : 11), \
- (((M) & 1) ? 8 : 4), \
- (((M) & 1) ? 9 : 5), \
- (((M) & 1) ? 10 : 6), \
- (((M) & 1) ? 11 : 7) )
+ (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
+ (__v4sf)(__m128)(V2), (int)(M))
/// Constructs a new 256-bit vector of [4 x double] by first duplicating
/// a 256-bit vector of [4 x double] given in the first parameter, and then
@@ -4660,13 +4651,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) \
- (__m256d)__builtin_shufflevector( \
- (__v4df)(__m256d)(V1), \
- (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
- (((M) & 1) ? 0 : 4), \
- (((M) & 1) ? 1 : 5), \
- (((M) & 1) ? 4 : 2), \
- (((M) & 1) ? 5 : 3) )
+ (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
+ (__v2df)(__m128d)(V2), (int)(M))
/// Constructs a new 256-bit integer vector by first duplicating a
/// 256-bit integer vector given in the first parameter, and then replacing
@@ -4703,13 +4689,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) \
- (__m256i)__builtin_shufflevector( \
- (__v4di)(__m256i)(V1), \
- (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
- (((M) & 1) ? 0 : 4), \
- (((M) & 1) ? 1 : 5), \
- (((M) & 1) ? 4 : 2), \
- (((M) & 1) ? 5 : 3) )
+ (__m256i)__builtin_ia32_vinsertf128_si256((__v4di)(__m256i)(V1), \
+ (__v2di)(__m128i)(V2), (int)(M))
/*
Vector extract.
@@ -4738,13 +4719,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
- (__m128)__builtin_shufflevector( \
- (__v8sf)(__m256)(V), \
- (__v8sf)(_mm256_undefined_ps()), \
- (((M) & 1) ? 4 : 0), \
- (((M) & 1) ? 5 : 1), \
- (((M) & 1) ? 6 : 2), \
- (((M) & 1) ? 7 : 3) )
+ (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
/// of [4 x double], as determined by the immediate integer parameter, and
@@ -4768,11 +4743,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
- (__m128d)__builtin_shufflevector( \
- (__v4df)(__m256d)(V), \
- (__v4df)(_mm256_undefined_pd()), \
- (((M) & 1) ? 2 : 0), \
- (((M) & 1) ? 3 : 1) )
+ (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))
/// Extracts either the upper or the lower 128 bits from a 256-bit
/// integer vector, as determined by the immediate integer parameter, and
@@ -4796,11 +4767,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
- (__m128i)__builtin_shufflevector( \
- (__v4di)(__m256i)(V), \
- (__v4di)(_mm256_undefined_si256()), \
- (((M) & 1) ? 2 : 0), \
- (((M) & 1) ? 3 : 1) )
+ (__m128i)__builtin_ia32_vextractf128_si256((__v4di)(__m256i)(V), (int)(M))
/* SIMD load ops (unaligned) */
/// Loads two 128-bit floating-point vectors of [4 x float] from
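These AVX macros now route through the resurrected vinsertf128/vextractf128 builtins, whose __v4di/__v2di integer signatures match gcc's; caller-side usage is unchanged. A small sketch (compile with -mavx):

#include <immintrin.h>

/* Replace the low 128 bits of an integer vector; the immediate is now
   range-checked to [0, 1] via the builtin rather than masked. */
__m256i set_low(__m256i v, __m128i x) {
  return _mm256_insertf128_si256(v, x, 0);
}

__m128i get_high(__m256i v) {
  return _mm256_extractf128_si256(v, 1);
}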
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index d16921c6c70..4f5405935e2 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2607,9 +2607,33 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
return false;
case X86::BI__builtin_ia32_vec_ext_v2si:
case X86::BI__builtin_ia32_vec_ext_v2di:
+ case X86::BI__builtin_ia32_vextractf128_pd256:
+ case X86::BI__builtin_ia32_vextractf128_ps256:
+ case X86::BI__builtin_ia32_vextractf128_si256:
+ case X86::BI__builtin_ia32_extract128i256:
+ case X86::BI__builtin_ia32_extractf64x4:
+ case X86::BI__builtin_ia32_extracti64x4:
+ case X86::BI__builtin_ia32_extractf32x8:
+ case X86::BI__builtin_ia32_extracti32x8:
+ case X86::BI__builtin_ia32_extractf64x2_256:
+ case X86::BI__builtin_ia32_extracti64x2_256:
+ case X86::BI__builtin_ia32_extractf32x4_256:
+ case X86::BI__builtin_ia32_extracti32x4_256:
i = 1; l = 0; u = 1;
break;
case X86::BI__builtin_ia32_vec_set_v2di:
+ case X86::BI__builtin_ia32_vinsertf128_pd256:
+ case X86::BI__builtin_ia32_vinsertf128_ps256:
+ case X86::BI__builtin_ia32_vinsertf128_si256:
+ case X86::BI__builtin_ia32_insert128i256:
+ case X86::BI__builtin_ia32_insertf32x8:
+ case X86::BI__builtin_ia32_inserti32x8:
+ case X86::BI__builtin_ia32_insertf64x4:
+ case X86::BI__builtin_ia32_inserti64x4:
+ case X86::BI__builtin_ia32_insertf64x2_256:
+ case X86::BI__builtin_ia32_inserti64x2_256:
+ case X86::BI__builtin_ia32_insertf32x4_256:
+ case X86::BI__builtin_ia32_inserti32x4_256:
i = 2; l = 0; u = 1;
break;
case X86::BI__builtin_ia32_vpermilpd:
@@ -2617,6 +2641,10 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_vec_ext_v4si:
case X86::BI__builtin_ia32_vec_ext_v4sf:
case X86::BI__builtin_ia32_vec_ext_v4di:
+ case X86::BI__builtin_ia32_extractf32x4:
+ case X86::BI__builtin_ia32_extracti32x4:
+ case X86::BI__builtin_ia32_extractf64x2_512:
+ case X86::BI__builtin_ia32_extracti64x2_512:
i = 1; l = 0; u = 3;
break;
case X86::BI_mm_prefetch:
@@ -2633,6 +2661,10 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
case X86::BI__builtin_ia32_shuf_f64x2_256:
case X86::BI__builtin_ia32_shuf_i32x4_256:
case X86::BI__builtin_ia32_shuf_i64x2_256:
+ case X86::BI__builtin_ia32_insertf64x2_512:
+ case X86::BI__builtin_ia32_inserti64x2_512:
+ case X86::BI__builtin_ia32_insertf32x4:
+ case X86::BI__builtin_ia32_inserti32x4:
i = 2; l = 0; u = 3;
break;
case X86::BI__builtin_ia32_vpermil2pd:
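For context: in this switch, i is the position of the immediate operand and [l, u] is its allowed range; the collected triple feeds a shared constant-range check at the end of the function. A standalone sketch of that check's behavior (not the actual Sema code):

#include <stdio.h>

/* Operand i of the builtin call must be an integer constant in
   [l, u]; otherwise an error is emitted. */
static int check_imm_range(long value, int l, int u, const char *builtin) {
  if (value < l || value > u) {
    fprintf(stderr, "error: argument value %ld to %s is outside the "
                    "valid range [%d, %d]\n", value, builtin, l, u);
    return 1;  /* Sema convention: true means an error was emitted */
  }
  return 0;
}

int main(void) {
  check_imm_range(2, 0, 1, "__builtin_ia32_vextractf128_pd256"); /* error */
  check_imm_range(3, 0, 3, "__builtin_ia32_extractf32x4");       /* ok */
  return 0;
}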