diff options
-rw-r--r-- | llvm/include/llvm/IR/IntrinsicsX86.td | 24 | ||||
-rw-r--r-- | llvm/lib/IR/AutoUpgrade.cpp | 18 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86IntrinsicsInfo.h | 8 | ||||
-rw-r--r-- | llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 46 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll | 396 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx512bw-intrinsics.ll | 100 |
6 files changed, 485 insertions, 107 deletions
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 71321ff612b..9d723a710d1 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -3653,18 +3653,18 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". } // Integer arithmetic ops let TargetPrefix = "x86" in { - def int_x86_avx512_mask_padds_b_512 : GCCBuiltin<"__builtin_ia32_paddsb512_mask">, - Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, - llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_avx512_mask_padds_w_512 : GCCBuiltin<"__builtin_ia32_paddsw512_mask">, - Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, - llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubs_b_512 : GCCBuiltin<"__builtin_ia32_psubsb512_mask">, - Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, - llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>; - def int_x86_avx512_mask_psubs_w_512 : GCCBuiltin<"__builtin_ia32_psubsw512_mask">, - Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, - llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_padds_b_512 : GCCBuiltin<"__builtin_ia32_paddsb512">, + Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], + [IntrNoMem]>; + def int_x86_avx512_padds_w_512 : GCCBuiltin<"__builtin_ia32_paddsw512">, + Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], + [IntrNoMem]>; + def int_x86_avx512_psubs_b_512 : GCCBuiltin<"__builtin_ia32_psubsb512">, + Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], + [IntrNoMem]>; + def int_x86_avx512_psubs_w_512 : GCCBuiltin<"__builtin_ia32_psubsw512">, + Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], + [IntrNoMem]>; def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem, Commutative]>; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index ae88a285a0e..1864ad29b30 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -282,14 +282,8 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name.startswith("avx512.mask.pror.") || // Added in 7.0 Name.startswith("avx512.mask.prolv.") || // Added in 7.0 Name.startswith("avx512.mask.prol.") || // Added in 7.0 - Name == "avx512.mask.padds.b.128" || // Added in 8.0 - Name == "avx512.mask.padds.b.256" || // Added in 8.0 - Name == "avx512.mask.padds.w.128" || // Added in 8.0 - Name == "avx512.mask.padds.w.256" || // Added in 8.0 - Name == "avx512.mask.psubs.b.128" || // Added in 8.0 - Name == "avx512.mask.psubs.b.256" || // Added in 8.0 - Name == "avx512.mask.psubs.w.128" || // Added in 8.0 - Name == "avx512.mask.psubs.w.256" || // Added in 8.0 + Name.startswith("avx512.mask.padds.") || // Added in 8.0 + Name.startswith("avx512.mask.psubs.") || // Added in 8.0 Name == "sse.cvtsi2ss" || // Added in 7.0 Name == "sse.cvtsi642ss" || // Added in 7.0 Name == "sse2.cvtsi2sd" || // Added in 7.0 @@ -1417,10 +1411,14 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_sse2_padds_b; else if (VecWidth == 256 && EltWidth == 8) IID = Intrinsic::x86_avx2_padds_b; + else if (VecWidth == 512 && EltWidth == 8) + IID = Intrinsic::x86_avx512_padds_b_512; else if (VecWidth == 128 && EltWidth == 16) IID = Intrinsic::x86_sse2_padds_w; else if (VecWidth == 256 && EltWidth == 16) IID = Intrinsic::x86_avx2_padds_w; + else if (VecWidth == 512 && EltWidth == 16) + IID = Intrinsic::x86_avx512_padds_w_512; else llvm_unreachable("Unexpected intrinsic"); } else if (Name.startswith("psubs.")) { @@ -1428,10 +1426,14 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, IID = Intrinsic::x86_sse2_psubs_b; else if (VecWidth == 256 && EltWidth == 8) IID = Intrinsic::x86_avx2_psubs_b; + else if (VecWidth == 512 && EltWidth == 8) + IID = Intrinsic::x86_avx512_psubs_b_512; else if (VecWidth == 128 && EltWidth == 16) IID = Intrinsic::x86_sse2_psubs_w; else if (VecWidth == 256 && EltWidth == 16) IID = Intrinsic::x86_avx2_psubs_w; + else if (VecWidth == 512 && EltWidth == 16) + IID = Intrinsic::x86_avx512_psubs_w_512; else llvm_unreachable("Unexpected intrinsic"); } else diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 95f17f76298..a9ee3182aaa 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -673,8 +673,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FMULS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMULS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), - X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK, X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK, @@ -789,8 +787,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::MULTISHIFT, 0), X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_512, INTR_TYPE_2OP_MASK, X86ISD::MULTISHIFT, 0), - X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), - X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0), X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND), @@ -926,6 +922,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_padds_b_512, INTR_TYPE_2OP, X86ISD::ADDS, 0), + X86_INTRINSIC_DATA(avx512_padds_w_512, INTR_TYPE_2OP, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0), @@ -1008,6 +1006,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_psubs_b_512, INTR_TYPE_2OP, X86ISD::SUBS, 0), + X86_INTRINSIC_DATA(avx512_psubs_w_512, INTR_TYPE_2OP, X86ISD::SUBS, 0), X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0), X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0), X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0), diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 7773c3d234e..1d0cd11f941 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -245,8 +245,7 @@ Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { static Value *simplifyX86AddsSubs(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder) { - bool IsAddition = false; - bool IsMasked = false; + bool IsAddition; switch (II.getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic!"); @@ -254,21 +253,17 @@ static Value *simplifyX86AddsSubs(const IntrinsicInst &II, case Intrinsic::x86_sse2_padds_w: case Intrinsic::x86_avx2_padds_b: case Intrinsic::x86_avx2_padds_w: - IsAddition = true; IsMasked = false; + case Intrinsic::x86_avx512_padds_b_512: + case Intrinsic::x86_avx512_padds_w_512: + IsAddition = true; break; case Intrinsic::x86_sse2_psubs_b: case Intrinsic::x86_sse2_psubs_w: case Intrinsic::x86_avx2_psubs_b: case Intrinsic::x86_avx2_psubs_w: - IsAddition = false; IsMasked = false; - break; - case Intrinsic::x86_avx512_mask_padds_b_512: - case Intrinsic::x86_avx512_mask_padds_w_512: - IsAddition = true; IsMasked = true; - break; - case Intrinsic::x86_avx512_mask_psubs_b_512: - case Intrinsic::x86_avx512_mask_psubs_w_512: - IsAddition = false; IsMasked = true; + case Intrinsic::x86_avx512_psubs_b_512: + case Intrinsic::x86_avx512_psubs_w_512: + IsAddition = false; break; } @@ -278,7 +273,7 @@ static Value *simplifyX86AddsSubs(const IntrinsicInst &II, auto SVT = VT->getElementType(); unsigned NumElems = VT->getNumElements(); - if (!Arg0 || !Arg1 || (IsMasked && !isa<Constant>(II.getOperand(2)))) + if (!Arg0 || !Arg1) return nullptr; SmallVector<Constant *, 64> Result; @@ -306,21 +301,7 @@ static Value *simplifyX86AddsSubs(const IntrinsicInst &II, Result.push_back(Constant::getIntegerValue(SVT, ResultElem)); } - Value *ResultVec = ConstantVector::get(Result); - - if (II.getNumArgOperands() == 4) { // For masked intrinsics. - Value *Src = II.getOperand(2); - auto Mask = II.getOperand(3); - if (auto *C = dyn_cast<Constant>(Mask)) - if (C->isAllOnesValue()) - return ResultVec; - auto *MaskTy = VectorType::get( - Builder.getInt1Ty(), cast<IntegerType>(Mask->getType())->getBitWidth()); - Mask = Builder.CreateBitCast(Mask, MaskTy); - ResultVec = Builder.CreateSelect(Mask, ResultVec, Src); - } - - return ResultVec; + return ConstantVector::get(Result); } static Value *simplifyX86immShift(const IntrinsicInst &II, @@ -2614,14 +2595,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::x86_avx2_padds_w: case Intrinsic::x86_avx2_psubs_b: case Intrinsic::x86_avx2_psubs_w: - case Intrinsic::x86_avx512_mask_padds_b_512: - case Intrinsic::x86_avx512_mask_padds_w_512: - case Intrinsic::x86_avx512_mask_psubs_b_512: - case Intrinsic::x86_avx512_mask_psubs_w_512: + case Intrinsic::x86_avx512_padds_b_512: + case Intrinsic::x86_avx512_padds_w_512: + case Intrinsic::x86_avx512_psubs_b_512: + case Intrinsic::x86_avx512_psubs_w_512: if (Value *V = simplifyX86AddsSubs(*II, Builder)) return replaceInstUsesWith(*II, V); break; - // Constant fold ashr( <A x Bi>, Ci ). // Constant fold lshr( <A x Bi>, Ci ). diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 31a557aa4ae..c623d8bbf2d 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -3284,3 +3284,399 @@ define <64 x i8> @test_mask_subs_epu8_rmkz_512(<64 x i8> %a, <64 x i8>* %ptr_b, } declare <64 x i8> @llvm.x86.avx512.mask.psubus.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) + +define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epi16_rr_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; X86-LABEL: test_mask_adds_epi16_rrk_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0xd1] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epi16_rrk_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0xd1] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; X86-LABEL: test_mask_adds_epi16_rrkz_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epi16_rrkz_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; X86-LABEL: test_mask_adds_epi16_rm_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpaddsw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epi16_rm_512: +; X64: # %bb.0: +; X64-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; X86-LABEL: test_mask_adds_epi16_rmk_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epi16_rmk_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; X86-LABEL: test_mask_adds_epi16_rmkz_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epi16_rmkz_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { +; X86-LABEL: test_mask_subs_epi16_rrk_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0xd1] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epi16_rrk_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0xd1] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; X86-LABEL: test_mask_subs_epi16_rrkz_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epi16_rrkz_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { +; X86-LABEL: test_mask_subs_epi16_rm_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpsubsw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epi16_rm_512: +; X64: # %bb.0: +; X64-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { +; X86-LABEL: test_mask_subs_epi16_rmk_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epi16_rmk_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { +; X86-LABEL: test_mask_subs_epi16_rmkz_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epi16_rmkz_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i16>, <32 x i16>* %ptr_b + %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} + +declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) + +define <64 x i8> @test_mask_adds_epi8_rr_512(<64 x i8> %a, <64 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epi8_rr_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddsb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xec,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> zeroinitializer, i64 -1) + ret <64 x i8> %res +} + +define <64 x i8> @test_mask_adds_epi8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) { +; X86-LABEL: test_mask_adds_epi8_rrk_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xec,0xd1] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epi8_rrk_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xec,0xd1] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) + ret <64 x i8> %res +} + +define <64 x i8> @test_mask_adds_epi8_rrkz_512(<64 x i8> %a, <64 x i8> %b, i64 %mask) { +; X86-LABEL: test_mask_adds_epi8_rrkz_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddsb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xec,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epi8_rrkz_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xec,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> zeroinitializer, i64 %mask) + ret <64 x i8> %res +} + +define <64 x i8> @test_mask_adds_epi8_rm_512(<64 x i8> %a, <64 x i8>* %ptr_b) { +; X86-LABEL: test_mask_adds_epi8_rm_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpaddsb (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xec,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epi8_rm_512: +; X64: # %bb.0: +; X64-NEXT: vpaddsb (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xec,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <64 x i8>, <64 x i8>* %ptr_b + %res = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> zeroinitializer, i64 -1) + ret <64 x i8> %res +} + +define <64 x i8> @test_mask_adds_epi8_rmk_512(<64 x i8> %a, <64 x i8>* %ptr_b, <64 x i8> %passThru, i64 %mask) { +; X86-LABEL: test_mask_adds_epi8_rmk_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddsb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xec,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epi8_rmk_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce] +; X64-NEXT: vpaddsb (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xec,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <64 x i8>, <64 x i8>* %ptr_b + %res = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) + ret <64 x i8> %res +} + +define <64 x i8> @test_mask_adds_epi8_rmkz_512(<64 x i8> %a, <64 x i8>* %ptr_b, i64 %mask) { +; X86-LABEL: test_mask_adds_epi8_rmkz_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddsb (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xec,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_adds_epi8_rmkz_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce] +; X64-NEXT: vpaddsb (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xec,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <64 x i8>, <64 x i8>* %ptr_b + %res = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> zeroinitializer, i64 %mask) + ret <64 x i8> %res +} + +declare <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) + +define <64 x i8> @test_mask_subs_epi8_rr_512(<64 x i8> %a, <64 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epi8_rr_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsubsb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe8,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> zeroinitializer, i64 -1) + ret <64 x i8> %res +} + +define <64 x i8> @test_mask_subs_epi8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) { +; X86-LABEL: test_mask_subs_epi8_rrk_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe8,0xd1] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epi8_rrk_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpsubsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe8,0xd1] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) + ret <64 x i8> %res +} + +define <64 x i8> @test_mask_subs_epi8_rrkz_512(<64 x i8> %a, <64 x i8> %b, i64 %mask) { +; X86-LABEL: test_mask_subs_epi8_rrkz_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubsb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe8,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epi8_rrkz_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpsubsb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe8,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> zeroinitializer, i64 %mask) + ret <64 x i8> %res +} + +define <64 x i8> @test_mask_subs_epi8_rm_512(<64 x i8> %a, <64 x i8>* %ptr_b) { +; X86-LABEL: test_mask_subs_epi8_rm_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpsubsb (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe8,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epi8_rm_512: +; X64: # %bb.0: +; X64-NEXT: vpsubsb (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe8,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <64 x i8>, <64 x i8>* %ptr_b + %res = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> zeroinitializer, i64 -1) + ret <64 x i8> %res +} + +define <64 x i8> @test_mask_subs_epi8_rmk_512(<64 x i8> %a, <64 x i8>* %ptr_b, <64 x i8> %passThru, i64 %mask) { +; X86-LABEL: test_mask_subs_epi8_rmk_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubsb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe8,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epi8_rmk_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce] +; X64-NEXT: vpsubsb (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe8,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <64 x i8>, <64 x i8>* %ptr_b + %res = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) + ret <64 x i8> %res +} + +define <64 x i8> @test_mask_subs_epi8_rmkz_512(<64 x i8> %a, <64 x i8>* %ptr_b, i64 %mask) { +; X86-LABEL: test_mask_subs_epi8_rmkz_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubsb (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe8,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_mask_subs_epi8_rmkz_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce] +; X64-NEXT: vpsubsb (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe8,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <64 x i8>, <64 x i8>* %ptr_b + %res = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> %a, <64 x i8> %b, <64 x i8> zeroinitializer, i64 %mask) + ret <64 x i8> %res +} + +declare <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll index 4e59ade192c..87e2e7c037e 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -564,8 +564,8 @@ define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b) + ret <32 x i16> %1 } define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { @@ -582,8 +582,10 @@ define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 ; X64-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0xd1] ; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru + ret <32 x i16> %3 } define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { @@ -598,8 +600,10 @@ define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 } define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { @@ -614,8 +618,8 @@ define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b ; X64-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b) + ret <32 x i16> %1 } define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { @@ -634,8 +638,10 @@ define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru + ret <32 x i16> %3 } define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { @@ -652,19 +658,21 @@ define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr ; X64-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 } -declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +declare <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16>, <32 x i16>) define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) { ; CHECK-LABEL: test_mask_subs_epi16_rr_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b) + ret <32 x i16> %1 } define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) { @@ -681,8 +689,10 @@ define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 ; X64-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0xd1] ; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru + ret <32 x i16> %3 } define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) { @@ -697,8 +707,10 @@ define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 } define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) { @@ -713,8 +725,8 @@ define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b ; X64-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b) + ret <32 x i16> %1 } define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) { @@ -733,8 +745,10 @@ define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru + ret <32 x i16> %3 } define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) { @@ -751,11 +765,13 @@ define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr ; X64-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0x07] ; X64-NEXT: retq # encoding: [0xc3] %b = load <32 x i16>, <32 x i16>* %ptr_b - %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask) - ret <32 x i16> %res + %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 } -declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +declare <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16>, <32 x i16>) define <64 x i8> @test_mask_adds_epi8_rr_512(<64 x i8> %a, <64 x i8> %b) { ; CHECK-LABEL: test_mask_adds_epi8_rr_512: @@ -769,9 +785,7 @@ define <64 x i8> @test_mask_adds_epi8_rr_512(<64 x i8> %a, <64 x i8> %b) { define <64 x i8> @test_mask_adds_epi8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) { ; X86-LABEL: test_mask_adds_epi8_rrk_512: ; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf9,0x90,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: kunpckdq %k0, %k1, %k1 # encoding: [0xc4,0xe1,0xf4,0x4b,0xc8] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpaddsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xec,0xd1] ; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] @@ -789,9 +803,7 @@ define <64 x i8> @test_mask_adds_epi8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x define <64 x i8> @test_mask_adds_epi8_rrkz_512(<64 x i8> %a, <64 x i8> %b, i64 %mask) { ; X86-LABEL: test_mask_adds_epi8_rrkz_512: ; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf9,0x90,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: kunpckdq %k0, %k1, %k1 # encoding: [0xc4,0xe1,0xf4,0x4b,0xc8] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpaddsb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xec,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -824,9 +836,7 @@ define <64 x i8> @test_mask_adds_epi8_rmk_512(<64 x i8> %a, <64 x i8>* %ptr_b, < ; X86-LABEL: test_mask_adds_epi8_rmk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf9,0x90,0x44,0x24,0x08] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c] -; X86-NEXT: kunpckdq %k0, %k1, %k1 # encoding: [0xc4,0xe1,0xf4,0x4b,0xc8] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpaddsb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xec,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] @@ -846,9 +856,7 @@ define <64 x i8> @test_mask_adds_epi8_rmkz_512(<64 x i8> %a, <64 x i8>* %ptr_b, ; X86-LABEL: test_mask_adds_epi8_rmkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf9,0x90,0x44,0x24,0x08] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c] -; X86-NEXT: kunpckdq %k0, %k1, %k1 # encoding: [0xc4,0xe1,0xf4,0x4b,0xc8] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpaddsb (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xec,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -876,9 +884,7 @@ define <64 x i8> @test_mask_subs_epi8_rr_512(<64 x i8> %a, <64 x i8> %b) { define <64 x i8> @test_mask_subs_epi8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %passThru, i64 %mask) { ; X86-LABEL: test_mask_subs_epi8_rrk_512: ; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf9,0x90,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: kunpckdq %k0, %k1, %k1 # encoding: [0xc4,0xe1,0xf4,0x4b,0xc8] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpsubsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe8,0xd1] ; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] @@ -896,9 +902,7 @@ define <64 x i8> @test_mask_subs_epi8_rrk_512(<64 x i8> %a, <64 x i8> %b, <64 x define <64 x i8> @test_mask_subs_epi8_rrkz_512(<64 x i8> %a, <64 x i8> %b, i64 %mask) { ; X86-LABEL: test_mask_subs_epi8_rrkz_512: ; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf9,0x90,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: kunpckdq %k0, %k1, %k1 # encoding: [0xc4,0xe1,0xf4,0x4b,0xc8] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpsubsb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe8,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -931,9 +935,7 @@ define <64 x i8> @test_mask_subs_epi8_rmk_512(<64 x i8> %a, <64 x i8>* %ptr_b, < ; X86-LABEL: test_mask_subs_epi8_rmk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf9,0x90,0x44,0x24,0x08] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c] -; X86-NEXT: kunpckdq %k0, %k1, %k1 # encoding: [0xc4,0xe1,0xf4,0x4b,0xc8] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpsubsb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe8,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] @@ -953,9 +955,7 @@ define <64 x i8> @test_mask_subs_epi8_rmkz_512(<64 x i8> %a, <64 x i8>* %ptr_b, ; X86-LABEL: test_mask_subs_epi8_rmkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf9,0x90,0x44,0x24,0x08] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c] -; X86-NEXT: kunpckdq %k0, %k1, %k1 # encoding: [0xc4,0xe1,0xf4,0x4b,0xc8] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpsubsb (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe8,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; |