-rw-r--r--  llvm/include/llvm/IR/IntrinsicsX86.td                 |  10
-rw-r--r--  llvm/lib/IR/AutoUpgrade.cpp                           |  23
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp               |  29
-rw-r--r--  llvm/lib/Target/X86/X86IntrinsicsInfo.h               |   8
-rw-r--r--  llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll  | 202
-rw-r--r--  llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll    | 168
-rw-r--r--  llvm/test/CodeGen/X86/avx512-intrinsics.ll            | 128

7 files changed, 540 insertions(+), 28 deletions(-)
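In short: the masked 512-bit sqrt intrinsics are removed. Calls with the default
rounding operand are auto-upgraded to the generic llvm.sqrt.* intrinsic plus an
explicit select; only calls with a non-default rounding operand map to the new
unmasked llvm.x86.avx512.sqrt.{ps,pd}.512. A minimal sketch of the upgrade,
based on the AutoUpgrade.cpp and test changes below:

  ; before upgrade: masking and rounding folded into one intrinsic
  %r = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a,
             <8 x double> %passthru, i8 %mask, i32 4)

  ; after upgrade (rounding operand 4, i.e. current direction):
  ; generic sqrt plus an explicit select on the mask
  %s = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
  %m = bitcast i8 %mask to <8 x i1>
  %r = select <8 x i1> %m, <8 x double> %s, <8 x double> %passthru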
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 0d3a194faac..249ca5ebec0 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -3856,12 +3856,10 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
           Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                      llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_pd_512 :
-      Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
-                 llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_ps_512 :
-      Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
-                 llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_sqrt_pd_512 :
+      Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_sqrt_ps_512 :
+      Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_fixupimm_pd_128 :
          GCCBuiltin<"__builtin_ia32_fixupimmpd128_mask">,
           Intrinsic<[llvm_v2f64_ty],
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index aed47fe9266..39a9da12f0f 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -86,10 +86,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx512.broadcastm") || // Added in 6.0
       Name == "sse.sqrt.ss" || // Added in 7.0
       Name == "sse2.sqrt.sd" || // Added in 7.0
-      Name == "avx512.mask.sqrt.ps.128" || // Added in 7.0
-      Name == "avx512.mask.sqrt.ps.256" || // Added in 7.0
-      Name == "avx512.mask.sqrt.pd.128" || // Added in 7.0
-      Name == "avx512.mask.sqrt.pd.256" || // Added in 7.0
+      Name.startswith("avx512.mask.sqrt.p") || // Added in 7.0
       Name.startswith("avx.sqrt.p") || // Added in 7.0
       Name.startswith("sse2.sqrt.p") || // Added in 7.0
       Name.startswith("sse.sqrt.p") || // Added in 7.0
@@ -1465,14 +1462,24 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
                                                        Intrinsic::sqrt, CI->getType()),
                              {CI->getArgOperand(0)});
-  } else if (IsX86 && (Name.startswith("avx512.mask.sqrt.p") &&
-                       !Name.endswith("512"))) {
+  } else if (IsX86 && (Name.startswith("avx512.mask.sqrt.p"))) {
+    if (CI->getNumArgOperands() == 4 &&
+        (!isa<ConstantInt>(CI->getArgOperand(3)) ||
+         cast<ConstantInt>(CI->getArgOperand(3))->getZExtValue() != 4)) {
+      Intrinsic::ID IID = Name[18] == 's' ? Intrinsic::x86_avx512_sqrt_ps_512
+                                          : Intrinsic::x86_avx512_sqrt_pd_512;
+
+      Value *Args[] = { CI->getArgOperand(0), CI->getArgOperand(3) };
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(),
+                                                         IID), Args);
+    } else {
       Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
                                                          Intrinsic::sqrt, CI->getType()),
                                {CI->getArgOperand(0)});
-    Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
-                        CI->getArgOperand(1));
+    }
+    Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+                        CI->getArgOperand(1));
   } else if (IsX86 && (Name.startswith("avx512.ptestm") ||
                        Name.startswith("avx512.ptestnm"))) {
     Value *Op0 = CI->getArgOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 204925fd94d..779246634a8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20400,8 +20400,20 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
   if (IntrData) {
     switch(IntrData->Type) {
-    case INTR_TYPE_1OP:
+    case INTR_TYPE_1OP: {
+      // We specify 2 possible opcodes for intrinsics with rounding modes.
+      // First, we check if the intrinsic may have non-default rounding mode,
+      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(2);
+        if (!isRoundModeCurDirection(Rnd)) {
+          return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
+                             Op.getOperand(1), Rnd);
+        }
+      }
       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
+    }
     case INTR_TYPE_2OP: {
       // We specify 2 possible opcodes for intrinsics with rounding modes.
       // First, we check if the intrinsic may have non-default rounding mode,
@@ -20616,6 +20628,21 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                               Src1, Src2, Src3),
                                   Mask, PassThru, Subtarget, DAG);
     }
+    case INTR_TYPE_1OP_RM: {
+      // We specify 2 possible opcodes for intrinsics with rounding modes.
+      // First, we check if the intrinsic may have non-default rounding mode,
+      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(2);
+        if (!isRoundModeCurDirection(Rnd)) {
+          return DAG.getNode(IntrWithRoundingModeOpcode,
+                             dl, Op.getValueType(),
+                             Op.getOperand(1), Rnd);
+        }
+      }
+      return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1));
+    }
     case INTR_TYPE_3OP_RM: {
       SDValue Src1 = Op.getOperand(1);
       SDValue Src2 = Op.getOperand(2);
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index ae6caf8ebf5..86c26d449b9 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -22,7 +22,7 @@ namespace llvm {
 enum IntrinsicType : uint16_t {
   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASS, FPCLASSS,
   INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
-  INTR_TYPE_3OP_RM, INTR_TYPE_3OP_IMM8,
+  INTR_TYPE_1OP_RM, INTR_TYPE_3OP_RM, INTR_TYPE_3OP_IMM8,
   CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
   CVTPD2PS, CVTPD2PS_MASK,
   INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
@@ -882,10 +882,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
                      X86ISD::SCALEFS, 0),
   X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::SCALEFS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
-                     X86ISD::FSQRT_RND),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
-                     X86ISD::FSQRT_RND),
   X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FSQRTS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
@@ -1173,6 +1169,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+  X86_INTRINSIC_DATA(avx512_sqrt_pd_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
+  X86_INTRINSIC_DATA(avx512_sqrt_ps_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
   X86_INTRINSIC_DATA(avx512_sub_pd_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
   X86_INTRINSIC_DATA(avx512_sub_ps_512, INTR_TYPE_2OP, ISD::FSUB, X86ISD::FSUB_RND),
   X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 21c8763d6d1..5f7a6a17457 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -9022,6 +9022,206 @@ entry:
   ret <16 x float> %0
 }
 
+define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) {
+; CHECK-LABEL: test_mm512_sqrt_pd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+entry:
+  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
+  ret <8 x double> %0
+}
+
+define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
+; X86-LABEL: test_mm512_mask_sqrt_pd:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    vsqrtpd %zmm1, %zmm0 {%k1}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_sqrt_pd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtpd %zmm1, %zmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
+  ret <8 x double> %2
+}
+
+define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) {
+; X86-LABEL: test_mm512_maskz_sqrt_pd:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_sqrt_pd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
+  ret <8 x double> %2
+}
+
+define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) {
+; X86-LABEL: test_mm512_mask_sqrt_round_pd:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_sqrt_round_pd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W
+  ret <8 x double> %2
+}
+
+declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32)
+
+define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) {
+; X86-LABEL: test_mm512_maskz_sqrt_round_pd:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovw %eax, %k1
+; X86-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_sqrt_round_pd:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer
+  ret <8 x double> %2
+}
+
+define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) {
+; CHECK-LABEL: test_mm512_sqrt_round_pd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsqrtpd {rn-sae}, %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+entry:
+  %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8)
+  ret <8 x double> %0
+}
+
+define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) {
+; CHECK-LABEL: test_mm512_sqrt_ps:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+entry:
+  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
+  ret <16 x float> %0
+}
+
+define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
+; X86-LABEL: test_mm512_mask_sqrt_ps:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vsqrtps %zmm1, %zmm0 {%k1}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_sqrt_ps:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtps %zmm1, %zmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
+  %1 = bitcast i16 %__U to <16 x i1>
+  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
+  ret <16 x float> %2
+}
+
+define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) {
+; X86-LABEL: test_mm512_maskz_sqrt_ps:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_sqrt_ps:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A)
+  %1 = bitcast i16 %__U to <16 x i1>
+  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
+  ret <16 x float> %2
+}
+
+define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) {
+; X86-LABEL: test_mm512_mask_sqrt_round_ps:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_sqrt_round_ps:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
+  %1 = bitcast i16 %__U to <16 x i1>
+  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W
+  ret <16 x float> %2
+}
+
+declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32)
+
+define <16 x float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) {
+; X86-LABEL: test_mm512_maskz_sqrt_round_ps:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_sqrt_round_ps:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
+  %1 = bitcast i16 %__U to <16 x i1>
+  %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer
+  ret <16 x float> %2
+}
+
+define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) {
+; CHECK-LABEL: test_mm512_sqrt_round_ps:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsqrtps {rn-sae}, %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+entry:
+  %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8)
+  ret <16 x float> %0
+}
+
 declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
 declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
 declare float @llvm.fma.f32(float, float, float) #9
@@ -9042,6 +9242,8 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
+declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
+declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
 
 !0 = !{i32 1}
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 61555d4cb7e..e232337a162 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -8372,3 +8372,171 @@ define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x flo
   ret <16 x float> %res
 }
 declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
+; CHECK-LABEL: test_sqrt_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x51,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> undef, i8 -1, i32 4)
+  ret <8 x double> %res
+}
+define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; X86-LABEL: test_mask_sqrt_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vsqrtpd %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x51,0xc8]
+; X86-NEXT:    vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sqrt_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vsqrtpd %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x51,0xc8]
+; X64-NEXT:    vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> %passthru, i8 %mask, i32 4)
+  ret <8 x double> %res
+}
+define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) {
+; X86-LABEL: test_maskz_sqrt_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x51,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_maskz_sqrt_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x51,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 4)
+  ret <8 x double> %res
+}
+define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) {
+; CHECK-LABEL: test_sqrt_round_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x78,0x51,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> undef, i8 -1, i32 11)
+  ret <8 x double> %res
+}
+define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; X86-LABEL: test_mask_sqrt_round_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x79,0x51,0xc8]
+; X86-NEXT:    vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sqrt_round_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x79,0x51,0xc8]
+; X64-NEXT:    vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> %passthru, i8 %mask, i32 11)
+  ret <8 x double> %res
+}
+define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) {
+; X86-LABEL: test_maskz_sqrt_round_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xf9,0x51,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_maskz_sqrt_round_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xf9,0x51,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 11)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
+; CHECK-LABEL: test_sqrt_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vsqrtps %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x51,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> undef, i16 -1, i32 4)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; X86-LABEL: test_mask_sqrt_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vsqrtps %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x51,0xc8]
+; X86-NEXT:    vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sqrt_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vsqrtps %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x51,0xc8]
+; X64-NEXT:    vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> %passthru, i16 %mask, i32 4)
+  ret <16 x float> %res
+}
+define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) {
+; X86-LABEL: test_maskz_sqrt_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x51,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_maskz_sqrt_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x51,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
+  ret <16 x float> %res
+}
+define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
+; CHECK-LABEL: test_sqrt_round_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x78,0x51,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 11)
+  ret <16 x float> %res
+}
+define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; X86-LABEL: test_mask_sqrt_round_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x79,0x51,0xc8]
+; X86-NEXT:    vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sqrt_round_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x79,0x51,0xc8]
+; X64-NEXT:    vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> %passthru, i16 %mask, i32 11)
+  ret <16 x float> %res
+}
+define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) {
+; X86-LABEL: test_maskz_sqrt_round_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xf9,0x51,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_maskz_sqrt_round_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xf9,0x51,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 11)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index e7db5429094..1e154e257cb 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -393,28 +393,140 @@ define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
-  ret <8 x double> %res
+  %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_sqrt_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vsqrtpd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+  ret <8 x double> %3
+}
+
+define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) {
+; CHECK-LABEL: test_maskz_sqrt_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
+  ret <8 x double> %3
+}
+declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
+
+define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) {
+; CHECK-LABEL: test_sqrt_round_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_sqrt_round_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru
+  ret <8 x double> %3
 }
-declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) {
+; CHECK-LABEL: test_maskz_sqrt_round_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vsqrtpd {rz-sae}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
+  ret <8 x double> %3
+}
+declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) nounwind readnone
 
 define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
 ; CHECK-LABEL: test_sqrt_ps_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
-  ret <16 x float> %res
+  %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: test_mask_sqrt_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vsqrtps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+  ret <16 x float> %3
 }
+
+define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) {
+; CHECK-LABEL: test_maskz_sqrt_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vsqrtps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+  ret <16 x float> %3
+}
+declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
+
 define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
 ; CHECK-LABEL: test_sqrt_round_ps_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 3)
-  ret <16 x float> %res
+  %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: test_mask_sqrt_round_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
+  ret <16 x float> %3
+}
+
+define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) {
+; CHECK-LABEL: test_maskz_sqrt_round_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vsqrtps {rz-sae}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+  ret <16 x float> %3
 }
-declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) nounwind readnone
 
 define <8 x double> @test_getexp_pd_512(<8 x double> %a0) {
 ; CHECK-LABEL: test_getexp_pd_512:
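For reference, the trailing i32 operand of the new intrinsic follows the usual X86
rounding-mode encoding, an assumption consistent with the tests above (4 = current
direction, 8 = {rn-sae}, 11 = {rz-sae}); the INTR_TYPE_1OP lowering only emits the
FSQRT_RND opcode when the operand is not current-direction. A minimal sketch:

  ; rounding operand selects the instruction form (values as used in the tests above)
  %cur = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a, i32 4)
  ; expected: vsqrtps %zmm0, %zmm0 (no embedded rounding)
  %rn = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a, i32 8)
  ; expected: vsqrtps {rn-sae}, %zmm0, %zmm0
  %rz = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a, i32 11)
  ; expected: vsqrtps {rz-sae}, %zmm0, %zmm0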