diff options
| author | Craig Topper <craig.topper@gmail.com> | 2017-01-03 05:45:46 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@gmail.com> | 2017-01-03 05:45:46 +0000 |
| commit | 4d47c6ae57b5905f7667202e53732a863773493a (patch) | |
| tree | b73c67ce8c7db46e2a8c943c71f2665c028cf238 /llvm | |
| parent | 1851df563d08a23ce2b4f5a7bd717df1ed7ba86a (diff) | |
| download | bcm5719-llvm-4d47c6ae57b5905f7667202e53732a863773493a.tar.gz bcm5719-llvm-4d47c6ae57b5905f7667202e53732a863773493a.zip | |
[AVX-512] Remove vextract intrinsics and autoupgrade to native shufflevectors. This unfortunately generates some really terrible code without VLX support due to v2i1 and v4i1 not being legal.
Hopefully we can improve that in future patches.
llvm-svn: 290863
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/include/llvm/IR/IntrinsicsX86.td | 49 | ||||
| -rw-r--r-- | llvm/lib/IR/AutoUpgrade.cpp | 26 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td | 27 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 102 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics.ll | 47 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll | 54 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512dq-intrinsics.ll | 41 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll | 19 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll | 20 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll | 19 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 20 |
11 files changed, 210 insertions, 214 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 3a496cb6645..b1eb2b1c03a 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -2065,55 +2065,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Vector extract and insert let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_avx512_mask_vextractf32x4_512 : - GCCBuiltin<"__builtin_ia32_extractf32x4_mask">, - Intrinsic<[llvm_v4f32_ty], [llvm_v16f32_ty, llvm_i32_ty, - llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vextracti32x4_512 : - GCCBuiltin<"__builtin_ia32_extracti32x4_mask">, - Intrinsic<[llvm_v4i32_ty], [llvm_v16i32_ty, llvm_i32_ty, - llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vextractf32x4_256 : - GCCBuiltin<"__builtin_ia32_extractf32x4_256_mask">, - Intrinsic<[llvm_v4f32_ty], [llvm_v8f32_ty, llvm_i32_ty, - llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vextracti32x4_256 : - GCCBuiltin<"__builtin_ia32_extracti32x4_256_mask">, - Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty, llvm_i32_ty, - llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vextractf64x2_256 : - GCCBuiltin<"__builtin_ia32_extractf64x2_256_mask">, - Intrinsic<[llvm_v2f64_ty], [llvm_v4f64_ty, llvm_i32_ty, - llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vextracti64x2_256 : - GCCBuiltin<"__builtin_ia32_extracti64x2_256_mask">, - Intrinsic<[llvm_v2i64_ty], [llvm_v4i64_ty, llvm_i32_ty, - llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vextractf64x2_512 : - GCCBuiltin<"__builtin_ia32_extractf64x2_512_mask">, - Intrinsic<[llvm_v2f64_ty], [llvm_v8f64_ty, llvm_i32_ty, - llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vextracti64x2_512 : - GCCBuiltin<"__builtin_ia32_extracti64x2_512_mask">, - Intrinsic<[llvm_v2i64_ty], [llvm_v8i64_ty, llvm_i32_ty, - llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vextractf32x8_512 : - GCCBuiltin<"__builtin_ia32_extractf32x8_mask">, - Intrinsic<[llvm_v8f32_ty], [llvm_v16f32_ty, llvm_i32_ty, - llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vextracti32x8_512 : - GCCBuiltin<"__builtin_ia32_extracti32x8_mask">, - Intrinsic<[llvm_v8i32_ty],[llvm_v16i32_ty, llvm_i32_ty, - llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vextractf64x4_512 : - GCCBuiltin<"__builtin_ia32_extractf64x4_mask">, - Intrinsic<[llvm_v4f64_ty], [llvm_v8f64_ty, llvm_i32_ty, - llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_vextracti64x4_512 : - GCCBuiltin<"__builtin_ia32_extracti64x4_mask">, - Intrinsic<[llvm_v4i64_ty], [llvm_v8i64_ty, llvm_i32_ty, - llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_insertf32x4_256 : GCCBuiltin<"__builtin_ia32_insertf32x4_256_mask">, Intrinsic<[llvm_v8f32_ty], diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 2d9d0f95efa..c6c6f0055d6 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -344,6 +344,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name == "avx2.vinserti128" || // Added in 3.7 Name.startswith("avx.vextractf128.") || // Added in 3.7 Name == "avx2.vextracti128" || // Added in 3.7 + Name.startswith("avx512.mask.vextract") || // Added in 4.0 Name.startswith("sse4a.movnt.") || // Added in 3.9 Name.startswith("avx.movnt.") || // Added in 3.2 Name.startswith("avx512.storent.") || // Added in 3.9 @@ -1188,23 +1189,28 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Idxs[i] = Imm ? (i + NumElts / 2) : i; Rep = Builder.CreateShuffleVector(Op0, Rep, Idxs); } else if (IsX86 && (Name.startswith("avx.vextractf128.") || - Name == "avx2.vextracti128")) { + Name == "avx2.vextracti128" || + Name.startswith("avx512.mask.vextract"))) { Value *Op0 = CI->getArgOperand(0); unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue(); - VectorType *VecTy = cast<VectorType>(CI->getType()); - unsigned NumElts = VecTy->getNumElements(); + unsigned DstNumElts = CI->getType()->getVectorNumElements(); + unsigned SrcNumElts = Op0->getType()->getVectorNumElements(); + unsigned Scale = SrcNumElts / DstNumElts; // Mask off the high bits of the immediate value; hardware ignores those. - Imm = Imm & 1; + Imm = Imm % Scale; - // Get indexes for either the high half or low half of the input vector. - SmallVector<uint32_t, 4> Idxs(NumElts); - for (unsigned i = 0; i != NumElts; ++i) { - Idxs[i] = Imm ? (i + NumElts) : i; + // Get indexes for the subvector of the input vector. + SmallVector<uint32_t, 8> Idxs(DstNumElts); + for (unsigned i = 0; i != DstNumElts; ++i) { + Idxs[i] = i + (Imm * DstNumElts); } + Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); - Value *UndefV = UndefValue::get(Op0->getType()); - Rep = Builder.CreateShuffleVector(Op0, UndefV, Idxs); + // If the intrinsic has a mask operand, handle that. + if (CI->getNumArgOperands() == 4) + Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + CI->getArgOperand(2)); } else if (!IsX86 && Name == "stackprotectorcheck") { Rep = nullptr; } else if (IsX86 && (Name.startswith("avx512.mask.perm.df.") || diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index da7437ea0cc..6071009a78e 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -650,33 +650,6 @@ multiclass vextract_for_size<int Opcode, From.ZSuffix # "rrkz") To.KRCWM:$mask, From.RC:$src1, (EXTRACT_get_vextract_imm To.RC:$ext))>; - - // Intrinsic call with masking. - def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x" # To.NumElts # "_" # From.Size) - From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask), - (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # - From.ZSuffix # "rrk") - To.RC:$src0, - (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), - From.RC:$src1, imm:$idx)>; - - // Intrinsic call with zero-masking. - def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x" # To.NumElts # "_" # From.Size) - From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask), - (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # - From.ZSuffix # "rrkz") - (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), - From.RC:$src1, imm:$idx)>; - - // Intrinsic call without masking. - def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x" # To.NumElts # "_" # From.Size) - From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), - (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # - From.ZSuffix # "rr") - From.RC:$src1, imm:$idx)>; } // Codegen pattern for the alternative types diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index f422a035498..6a813c08c93 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2868,3 +2868,105 @@ define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask } declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8) + +define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) { +; CHECK-LABEL: test_mask_vextractf32x4: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kshiftrw $15, %k2, %k2 +; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftrw $15, %k3, %k3 +; CHECK-NEXT: kshiftlw $14, %k1, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kmovw %k3, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm2 +; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8) + +define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) { +; CHECK-LABEL: test_mask_vextracti64x4: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kshiftrw $15, %k2, %k2 +; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftrw $15, %k3, %k3 +; CHECK-NEXT: kshiftlw $14, %k1, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kmovw %k3, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm2 +; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpslld $31, %xmm2, %xmm2 +; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2 +; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask) + ret <4 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8) + +define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) { +; CHECK-LABEL: test_maskz_vextracti32x4: +; CHECK: ## BB#0: +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: kshiftlw $12, %k1, %k0 +; CHECK-NEXT: kshiftrw $15, %k0, %k0 +; CHECK-NEXT: kshiftlw $13, %k1, %k2 +; CHECK-NEXT: kshiftrw $15, %k2, %k2 +; CHECK-NEXT: kshiftlw $15, %k1, %k3 +; CHECK-NEXT: kshiftrw $15, %k3, %k3 +; CHECK-NEXT: kshiftlw $14, %k1, %k1 +; CHECK-NEXT: kshiftrw $15, %k1, %k1 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: kmovw %k3, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm1 +; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; CHECK-NEXT: kmovw %k2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 +; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1 +; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8) + +define <4 x double> @test_vextractf64x4(<8 x double> %a) { +; CHECK-LABEL: test_vextractf64x4: +; CHECK: ## BB#0: +; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill> +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 5442693806f..a41de711e57 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -1243,53 +1243,6 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone -define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) { -; CHECK-LABEL: test_mask_vextractf32x4: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm0 {%k1} -; CHECK-NEXT: retq - %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask) - ret <4 x float> %res -} - -declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8) - -define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) { -; CHECK-LABEL: test_mask_vextracti64x4: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vextracti64x4 $2, %zmm1, %ymm0 {%k1} -; CHECK-NEXT: retq - %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask) - ret <4 x i64> %res -} - -declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8) - -define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) { -; CHECK-LABEL: test_maskz_vextracti32x4: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: retq - %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask) - ret <4 x i32> %res -} - -declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8) - -define <4 x double> @test_vextractf64x4(<8 x double> %a) { -; CHECK-LABEL: test_vextractf64x4: -; CHECK: ## BB#0: -; CHECK-NEXT: vextractf64x4 $2, %zmm0, %ymm0 -; CHECK-NEXT: retq - %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1) - ret <4 x double> %res -} - -declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8) - declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll new file mode 100644 index 00000000000..e07834eb9c0 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s + +declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0 +; CHECK-NEXT: kmovb %edi, %k0 +; CHECK-NEXT: kshiftlb $7, %k0, %k1 +; CHECK-NEXT: kshiftrb $7, %k1, %k1 +; CHECK-NEXT: kshiftlb $6, %k0, %k0 +; CHECK-NEXT: kshiftrb $7, %k0, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: vmovq %rax, %xmm2 +; CHECK-NEXT: kmovw %k1, %eax +; CHECK-NEXT: vmovq %rax, %xmm3 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2 +; CHECK-NEXT: vpsrad $31, %xmm2, %xmm2 +; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1 +; CHECK-NEXT: vandpd %xmm0, %xmm2, %xmm2 +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) + %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res2, %res3 + ret <2 x double> %res4 +} + +declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 +; CHECK-NEXT: kmovb %edi, %k1 +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} +; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) + %res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 +} diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll index 5826bb6fad2..caf932d0cdf 100644 --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -325,47 +325,6 @@ define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x dou ret <2 x double> %res2 } - -declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8) - -define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} -; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 -; CHECK-NEXT: retq - %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res2, %res3 - ret <2 x double> %res4 -} - -declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8) - -define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 {%k1} {z} -; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 -; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 -; CHECK-NEXT: retq - %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) - %res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 -} - declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16) define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) { diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll index 9bf989df22a..db39569525f 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -1560,3 +1560,22 @@ define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm2 ## encoding: [0x62,0xf3,0xfd,0x28,0x19,0xc2,0x01] +; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] +; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] +; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] +; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) + %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res3, %res2 + ret <2 x double> %res4 +} diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll index eb9c6b64bcf..a0d91f48be7 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -549,26 +549,6 @@ define <8 x float>@test_int_x86_avx512_mask_range_ps_256(<8 x float> %x0, <8 x f ret <8 x float> %res2 } -declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8) - -define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc2,0x01] -; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x19,0xc0,0x01] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 -} - declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x double>, i32, <4 x double>, i8) define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) { diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 8d44af7b7a4..9e064684af8 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -4773,3 +4773,22 @@ define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, < ret <4 x float> %res4 } +declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm2 ## encoding: [0x62,0xf3,0x7d,0x28,0x19,0xc2,0x01] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] +; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc0,0x01] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3) + %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 94095f549e5..4e183bd7fef 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3621,26 +3621,6 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64 ret <4 x i64> %res2 } -declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8) - -define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) { -; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] -; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc2,0x01] -; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x19,0xc0,0x01] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] -; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 -} - declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8) define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) { |

