diff options
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 8 | ||||
-rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td | 40 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll | 32 |
3 files changed, 64 insertions, 16 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 9f483b72a89..2f2ae32a1d1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -17830,6 +17830,14 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + + // If this is a seteq make sure any build vectors of all zeros are on the RHS. + // This helps with vptestm matching. + // TODO: Should we just canonicalize the setcc during DAG combine? + if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) && + ISD::isBuildVectorAllZeros(Op0.getNode())) + std::swap(Op0, Op1); + bool Swap = false; unsigned SSECC; switch (SetCCOpcode) { diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 395ab8a3c8a..23432072e98 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2189,27 +2189,27 @@ multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, } } -def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2), - (X86cmpm_c node:$src1, node:$src2, (i8 0))>; -def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2), - (X86cmpm_c node:$src1, node:$src2, (i8 4))>; +// This fragment treats X86cmpm as commutable to help match loads in both +// operands for PCMPEQ. +def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2), + (X86cmpm_c node:$src1, node:$src2, (i8 0))>; def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2), (X86cmpm node:$src1, node:$src2, (i8 6))>; // FIXME: Is there a better scheduler itinerary for VPCMP? -defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm, +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, SSE_ALU_F32P, avx512vl_i8_info, HasBWI, 1>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm, +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, SSE_ALU_F32P, avx512vl_i16_info, HasBWI, 1>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm, +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, SSE_ALU_F32P, avx512vl_i32_info, HasAVX512, 1>, EVEX_CD8<32, CD8VF>; -defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm, +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, SSE_ALU_F32P, avx512vl_i64_info, HasAVX512, 1>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; @@ -3111,16 +3111,16 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, let Predicates = [HasAVX512, NoVLX] in { defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v8i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQD", v8i32x_info, v16i32_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v8i32x_info, v16i32_info>; defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v4i32x_info, v16i32_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQD", v4i32x_info, v16i32_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v4i32x_info, v16i32_info>; defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v4i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQQ", v4i64x_info, v8i64_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v4i64x_info, v8i64_info>; defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v2i64x_info, v8i64_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQQ", v2i64x_info, v8i64_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v2i64x_info, v8i64_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v8f32x_info, v16f32_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPD", v8i32x_info, v16i32_info>; @@ -3141,16 +3141,16 @@ let Predicates = [HasAVX512, NoVLX] in { let Predicates = [HasBWI, NoVLX] in { defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v32i8x_info, v64i8_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQB", v32i8x_info, v64i8_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v32i8x_info, v64i8_info>; defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v16i8x_info, v64i8_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQB", v16i8x_info, v64i8_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v16i8x_info, v64i8_info>; defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v16i16x_info, v32i16_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQW", v16i16x_info, v32i16_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v16i16x_info, v32i16_info>; defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v8i16x_info, v32i16_info>; - defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQW", v8i16x_info, v32i16_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v8i16x_info, v32i16_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPB", v32i8x_info, v64i8_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUB", v32i8x_info, v64i8_info>; @@ -5465,6 +5465,14 @@ multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr, } } +// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm +// as commutable here because we already canonicalized all zeros vectors to the +// RHS during lowering. +def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2), + (X86cmpm node:$src1, node:$src2, (i8 0))>; +def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2), + (X86cmpm node:$src1, node:$src2, (i8 4))>; + multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr, PatFrag OpNode, OpndItins itins> : avx512_vptest_wb <opc_wb, OpcodeStr, OpNode, itins>, diff --git a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll index ac30758fd02..5b1c69b680d 100644 --- a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll +++ b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll @@ -17,6 +17,22 @@ entry: ret i8 %1 } +; Similar to the above, but the compare is reversed to have the zeros on the LHS +define zeroext i8 @TEST_mm512_test_epi64_mask_2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { +; CHECK-LABEL: TEST_mm512_test_epi64_mask_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %and.i.i = and <8 x i64> %__B, %__A + %0 = icmp ne <8 x i64> zeroinitializer, %and.i.i + %1 = bitcast <8 x i1> %0 to i8 + ret i8 %1 +} + ; Function Attrs: norecurse nounwind readnone define zeroext i16 @TEST_mm512_test_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_test_epi32_mask: @@ -89,6 +105,22 @@ entry: ret i8 %1 } +; Similar to the above, but the compare is reversed to have the zeros on the LHS +define zeroext i8 @TEST_mm512_testn_epi64_mask_2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { +; CHECK-LABEL: TEST_mm512_testn_epi64_mask_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0 +; CHECK-NEXT: kmovw %k0, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %and.i.i = and <8 x i64> %__B, %__A + %0 = icmp eq <8 x i64> zeroinitializer, %and.i.i + %1 = bitcast <8 x i1> %0 to i8 + ret i8 %1 +} + ; Function Attrs: norecurse nounwind readnone define zeroext i16 @TEST_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 { ; CHECK-LABEL: TEST_mm512_testn_epi32_mask: |