summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp8
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td40
-rw-r--r--llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll32
3 files changed, 64 insertions, 16 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9f483b72a89..2f2ae32a1d1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17830,6 +17830,14 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
"Cannot set masked compare for this operation");
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+
+ // If this is a seteq make sure any build vectors of all zeros are on the RHS.
+ // This helps with vptestm matching.
+ // TODO: Should we just canonicalize the setcc during DAG combine?
+ if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
+ ISD::isBuildVectorAllZeros(Op0.getNode()))
+ std::swap(Op0, Op1);
+
bool Swap = false;
unsigned SSECC;
switch (SetCCOpcode) {
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 395ab8a3c8a..23432072e98 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2189,27 +2189,27 @@ multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
}
}
-def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
- (X86cmpm_c node:$src1, node:$src2, (i8 0))>;
-def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
- (X86cmpm_c node:$src1, node:$src2, (i8 4))>;
+// This fragment treats X86cmpm as commutable to help match loads in both
+// operands for PCMPEQ.
+def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2),
+ (X86cmpm_c node:$src1, node:$src2, (i8 0))>;
def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2),
(X86cmpm node:$src1, node:$src2, (i8 6))>;
// FIXME: Is there a better scheduler itinerary for VPCMP?
-defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm,
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c,
SSE_ALU_F32P, avx512vl_i8_info, HasBWI, 1>,
EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm,
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c,
SSE_ALU_F32P, avx512vl_i16_info, HasBWI, 1>,
EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm,
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c,
SSE_ALU_F32P, avx512vl_i32_info, HasAVX512, 1>,
EVEX_CD8<32, CD8VF>;
-defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm,
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c,
SSE_ALU_F32P, avx512vl_i64_info, HasAVX512, 1>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
@@ -3111,16 +3111,16 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
let Predicates = [HasAVX512, NoVLX] in {
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v8i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQD", v8i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v8i32x_info, v16i32_info>;
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v4i32x_info, v16i32_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQD", v4i32x_info, v16i32_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v4i32x_info, v16i32_info>;
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v4i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQQ", v4i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v4i64x_info, v8i64_info>;
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v2i64x_info, v8i64_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQQ", v2i64x_info, v8i64_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v2i64x_info, v8i64_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v8f32x_info, v16f32_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPD", v8i32x_info, v16i32_info>;
@@ -3141,16 +3141,16 @@ let Predicates = [HasAVX512, NoVLX] in {
let Predicates = [HasBWI, NoVLX] in {
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v32i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQB", v32i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v32i8x_info, v64i8_info>;
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v16i8x_info, v64i8_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQB", v16i8x_info, v64i8_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v16i8x_info, v64i8_info>;
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v16i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQW", v16i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v16i16x_info, v32i16_info>;
defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v8i16x_info, v32i16_info>;
- defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQW", v8i16x_info, v32i16_info>;
+ defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v8i16x_info, v32i16_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPB", v32i8x_info, v64i8_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUB", v32i8x_info, v64i8_info>;
@@ -5465,6 +5465,14 @@ multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
}
}
+// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm
+// as commutable here because we already canonicalized all zeros vectors to the
+// RHS during lowering.
+def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
+ (X86cmpm node:$src1, node:$src2, (i8 0))>;
+def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
+ (X86cmpm node:$src1, node:$src2, (i8 4))>;
+
multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
PatFrag OpNode, OpndItins itins> :
avx512_vptest_wb <opc_wb, OpcodeStr, OpNode, itins>,
diff --git a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll
index ac30758fd02..5b1c69b680d 100644
--- a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll
+++ b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll
@@ -17,6 +17,22 @@ entry:
ret i8 %1
}
+; Similar to the above, but the compare is reversed to have the zeros on the LHS
+define zeroext i8 @TEST_mm512_test_epi64_mask_2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_test_epi64_mask_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = icmp ne <8 x i64> zeroinitializer, %and.i.i
+ %1 = bitcast <8 x i1> %0 to i8
+ ret i8 %1
+}
+
; Function Attrs: norecurse nounwind readnone
define zeroext i16 @TEST_mm512_test_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_test_epi32_mask:
@@ -89,6 +105,22 @@ entry:
ret i8 %1
}
+; Similar to the above, but the compare is reversed to have the zeros on the LHS
+define zeroext i8 @TEST_mm512_testn_epi64_mask_2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_testn_epi64_mask_2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = icmp eq <8 x i64> zeroinitializer, %and.i.i
+ %1 = bitcast <8 x i1> %0 to i8
+ ret i8 %1
+}
+
; Function Attrs: norecurse nounwind readnone
define zeroext i16 @TEST_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: TEST_mm512_testn_epi32_mask:
OpenPOWER on IntegriCloud