| author | Craig Topper <craig.topper@intel.com> | 2018-02-28 08:14:28 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@intel.com> | 2018-02-28 08:14:28 +0000 |
| commit | 48d5ed265c9772a3353236ae20f228a9b4cdca95 (patch) | |
| tree | e9e75901eccf8c08ead50aea67f9e31ac52a1984 | |
| parent | 7275da0f2ee24336fe83cb7cfe2ba22f9cefc117 (diff) | |
[X86] Don't use EXTRACT_ELEMENT from v1i1 with an i8/i32 result type when we need to guarantee zeroes in the upper bits of the return.
An extract_element where the result type is larger than the scalar element type is semantically an any_extend from the scalar element type to the result type. If we expect zeroes in the upper bits of the i8/i32, we need to make sure those zeroes are explicit in the DAG.
For these cases the best way to accomplish this is to use an insert_subvector to pad the upper bits of the v1i1 with zeroes first. We extend to either v16i1 (for i32) or v8i1 (for i8). Then we bitcast that to a scalar and finish with a zero_extend up to i32 if necessary. We can't extend past v16i1 because that's the largest mask size on KNL. But isel is smart enough to know that a zext of a bitcast from v16i1 to i16 can use a KMOVW instruction. The insert_subvectors will be dropped during isel because we can determine that the producing instruction already zeroed the upper bits of the k-register.
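For illustration, here is a minimal sketch of the i32 shape described above, built from the SelectionDAG calls that appear in the patch itself. The standalone helper packaging, its name, and the assumption that a `SelectionDAG &DAG` and `SDLoc dl` are in scope are mine; in the commit this logic is inlined into the FSETCCM handling in `LowerINTRINSIC_WO_CHAIN`.

```cpp
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper showing the pattern: widen the v1i1 compare result with
// explicit zeroes, bitcast the mask to a scalar, then zero-extend. An
// EXTRACT_VECTOR_ELT here would only any-extend, leaving the upper bits
// unspecified.
static SDValue zeroExtendV1I1ToI32(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Cmp /* v1i1 */) {
  // Insert the single mask bit into an all-zeroes v16i1 so the upper bits of
  // the k-register are known to be zero (v16i1 is the widest mask on KNL).
  SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
                            DAG.getConstant(0, dl, MVT::v16i1), Cmp,
                            DAG.getIntPtrConstant(0, dl));
  // Bitcast v16i1 -> i16 (isel can select KMOVW for the zext of this
  // bitcast), then zero-extend to the i32 the caller expects.
  return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
                     DAG.getBitcast(MVT::i16, Ins));
}
```

The i8 cases in the patch follow the same shape but only widen to v8i1 and bitcast directly to i8, so no final zero_extend is needed.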
llvm-svn: 326308
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 6 |
|---|---|---|
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 50 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrVecCompiler.td | 17 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-cmp.ll | 18 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx512-intrinsics.ll | 24 |
| -rwxr-xr-x | llvm/test/CodeGen/X86/avx512-schedule.ll | 6 |
| -rw-r--r-- | llvm/test/CodeGen/X86/gpr-to-mask.ll | 16 |
7 files changed, 83 insertions, 54 deletions
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index df302cadd52..bcac241bb3a 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -457,7 +457,7 @@ namespace {
 static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
   unsigned Opcode = N->getOpcode();
   if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMU ||
-      Opcode == X86ISD::CMPM_RND) {
+      Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
     // We can get 256-bit 8 element types here without VLX being enabled. When
     // this happens we will use 512-bit operations and the mask will not be
     // zero extended.
@@ -467,6 +467,10 @@ static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
     return true;
   }
 
+  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
+  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
+      Opcode == X86ISD::FSETCCM_RND)
+    return true;
   return false;
 }
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9225881a01f..3a89c195907 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19948,6 +19948,7 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
+  assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
   SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
   if (Op.getOpcode() == X86ISD::FSETCCM ||
       Op.getOpcode() == X86ISD::FSETCCM_RND ||
@@ -20417,9 +20418,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
       SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
                                                  Subtarget, DAG);
+      // Need to fill with zeros to ensure the bitcast will produce zeroes
+      // for the upper bits in the v2i1/v4i1 case.
       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
-                                DAG.getUNDEF(BitcastVT), FPclassMask,
-                                DAG.getIntPtrConstant(0, dl));
+                                DAG.getConstant(0, dl, BitcastVT),
+                                FPclassMask, DAG.getIntPtrConstant(0, dl));
       return DAG.getBitcast(Op.getValueType(), Res);
     }
     case FPCLASSS: {
@@ -20429,8 +20432,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
                                                  Subtarget, DAG);
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
-                         DAG.getIntPtrConstant(0, dl));
+      // Need to fill with zeros to ensure the bitcast will produce zeroes
+      // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+      SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                                DAG.getConstant(0, dl, MVT::v8i1),
+                                FPclassMask, DAG.getIntPtrConstant(0, dl));
+      return DAG.getBitcast(MVT::i8, Ins);
     }
     case CMP_MASK: {
       // Comparison intrinsics with masks.
@@ -20438,7 +20445,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
       //          (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
       //       (i8 (bitcast
-      //             (v8i1 (insert_subvector undef,
+      //             (v8i1 (insert_subvector zero,
      //                     (v2i1 (and (PCMPEQM %a, %b),
      //                                (extract_subvector
      //                                   (v8i1 (bitcast %mask)), 0))), 0))))
@@ -20451,9 +20458,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                Op.getOperand(2));
       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
                                              Subtarget, DAG);
+      // Need to fill with zeros to ensure the bitcast will produce zeroes
+      // for the upper bits in the v2i1/v4i1 case.
       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
-                                DAG.getUNDEF(BitcastVT), CmpMask,
-                                DAG.getIntPtrConstant(0, dl));
+                                DAG.getConstant(0, dl, BitcastVT),
+                                CmpMask, DAG.getIntPtrConstant(0, dl));
       return DAG.getBitcast(Op.getValueType(), Res);
     }
 
@@ -20497,8 +20506,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
                                              Subtarget, DAG);
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
-                         DAG.getIntPtrConstant(0, dl));
+      // Need to fill with zeros to ensure the bitcast will produce zeroes
+      // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+      SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                                DAG.getConstant(0, dl, MVT::v8i1),
+                                CmpMask, DAG.getIntPtrConstant(0, dl));
+      return DAG.getBitcast(MVT::i8, Ins);
     }
     case COMI: { // Comparison intrinsics
       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
@@ -20551,8 +20564,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       else
         FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
                            DAG.getConstant(CondVal, dl, MVT::i8), Sae);
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
-                         DAG.getIntPtrConstant(0, dl));
+      // Need to fill with zeros to ensure the bitcast will produce zeroes
+      // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+      SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
+                                DAG.getConstant(0, dl, MVT::v16i1),
+                                FCmp, DAG.getIntPtrConstant(0, dl));
+      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
+                         DAG.getBitcast(MVT::i16, Ins));
     }
     case VSHIFT:
       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
@@ -33382,9 +33400,13 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
         SDValue FSetCC =
             DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
                         DAG.getConstant(x86cc, DL, MVT::i8));
-        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
-                           N->getSimpleValueType(0), FSetCC,
-                           DAG.getIntPtrConstant(0, DL));
+        // Need to fill with zeros to ensure the bitcast will produce zeroes
+        // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
+        SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
+                                  DAG.getConstant(0, DL, MVT::v16i1),
+                                  FSetCC, DAG.getIntPtrConstant(0, DL));
+        return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
+                                  N->getSimpleValueType(0));
       }
       SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
                                           CMP00.getValueType(), CMP00, CMP01,
diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td
index 1aef98ba49d..db3dfe56531 100644
--- a/llvm/lib/Target/X86/X86InstrVecCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td
@@ -427,6 +427,7 @@ class maskzeroupper<ValueType vt, RegisterClass RC> :
   return isMaskZeroExtended(N);
 }]>;
 
+def maskzeroupperv1i1  : maskzeroupper<v1i1,  VK1>;
 def maskzeroupperv2i1  : maskzeroupper<v2i1,  VK2>;
 def maskzeroupperv4i1  : maskzeroupper<v4i1,  VK4>;
 def maskzeroupperv8i1  : maskzeroupper<v8i1,  VK8>;
@@ -438,11 +439,18 @@ def maskzeroupperv32i1 : maskzeroupper<v32i1, VK32>;
 // zeroing.
 let Predicates = [HasBWI] in {
   def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+                                     maskzeroupperv1i1:$src, (iPTR 0))),
+            (COPY_TO_REGCLASS VK1:$src, VK32)>;
+  def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                      maskzeroupperv8i1:$src, (iPTR 0))),
             (COPY_TO_REGCLASS VK8:$src, VK32)>;
   def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
                                      maskzeroupperv16i1:$src, (iPTR 0))),
             (COPY_TO_REGCLASS VK16:$src, VK32)>;
+
+  def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+                                     maskzeroupperv1i1:$src, (iPTR 0))),
+            (COPY_TO_REGCLASS VK1:$src, VK64)>;
   def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
                                      maskzeroupperv8i1:$src, (iPTR 0))),
             (COPY_TO_REGCLASS VK8:$src, VK64)>;
@@ -456,10 +464,19 @@ let Predicates = [HasBWI] in {
 
 let Predicates = [HasAVX512] in {
   def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+                                     maskzeroupperv1i1:$src, (iPTR 0))),
+            (COPY_TO_REGCLASS VK1:$src, VK16)>;
+  def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
                                      maskzeroupperv8i1:$src, (iPTR 0))),
             (COPY_TO_REGCLASS VK8:$src, VK16)>;
 }
 
+let Predicates = [HasDQI] in {
+  def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+                                    maskzeroupperv1i1:$src, (iPTR 0))),
+            (COPY_TO_REGCLASS VK1:$src, VK8)>;
+}
+
 let Predicates = [HasVLX, HasDQI] in {
   def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
                                     maskzeroupperv2i1:$src, (iPTR 0))),
diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll
index dd4d6b961d5..89d811f8681 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp.ll
@@ -48,19 +48,11 @@ l2:
 }
 
 define i32 @test3(float %a, float %b) {
-; KNL-LABEL: test3:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vcmpeqss %xmm1, %xmm0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    movzbl %al, %eax
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: test3:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vcmpeqss %xmm1, %xmm0, %k0
-; SKX-NEXT:    kmovd %k0, %eax
-; SKX-NEXT:    movzbl %al, %eax
-; SKX-NEXT:    retq
+; ALL-LABEL: test3:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vcmpeqss %xmm1, %xmm0, %k0
+; ALL-NEXT:    kmovw %k0, %eax
+; ALL-NEXT:    retq
   %cmp10.i = fcmp oeq float %a, %b
   %conv11.i = zext i1 %cmp10.i to i32
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index 972fbc91602..b8548b0aa1b 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -3186,17 +3186,17 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
 ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vcmplesd %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vcmpunordsd {sae}, %xmm1, %xmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %ecx
-; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    vcmpunordsd {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT:    kmovw %k0, %edx
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %edx
+; CHECK-NEXT:    kmovw %k0, %esi
 ; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    orl %edx, %eax
-; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    orb %cl, %dl
+; CHECK-NEXT:    orb %sil, %al
+; CHECK-NEXT:    orb %dl, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 
@@ -3231,17 +3231,17 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
 ; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vcmpless %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vcmpunordss {sae}, %xmm1, %xmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %ecx
-; CHECK-NEXT:    andl %eax, %ecx
+; CHECK-NEXT:    vcmpunordss {sae}, %xmm1, %xmm0, %k0
+; CHECK-NEXT:    kmovw %k0, %edx
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcmpneqss %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %edx
+; CHECK-NEXT:    kmovw %k0, %esi
 ; CHECK-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andl %edx, %eax
-; CHECK-NEXT:    andl %ecx, %eax
+; CHECK-NEXT:    andb %cl, %dl
+; CHECK-NEXT:    andb %sil, %al
+; CHECK-NEXT:    andb %dl, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
diff --git a/llvm/test/CodeGen/X86/avx512-schedule.ll b/llvm/test/CodeGen/X86/avx512-schedule.ll
index 4d24799444a..a1c3ed48e0e 100755
--- a/llvm/test/CodeGen/X86/avx512-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx512-schedule.ll
@@ -1124,15 +1124,13 @@ define i32 @test3(float %a, float %b) {
 ; GENERIC-LABEL: test3:
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    vcmpeqss %xmm1, %xmm0, %k0 # sched: [3:1.00]
-; GENERIC-NEXT:    kmovd %k0, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    movzbl %al, %eax # sched: [1:0.33]
+; GENERIC-NEXT:    kmovw %k0, %eax # sched: [1:0.33]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; SKX-LABEL: test3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vcmpeqss %xmm1, %xmm0, %k0 # sched: [3:1.00]
-; SKX-NEXT:    kmovd %k0, %eax # sched: [3:1.00]
-; SKX-NEXT:    movzbl %al, %eax # sched: [1:0.25]
+; SKX-NEXT:    kmovw %k0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    retq # sched: [7:1.00]
   %cmp10.i = fcmp oeq float %a, %b
diff --git a/llvm/test/CodeGen/X86/gpr-to-mask.ll b/llvm/test/CodeGen/X86/gpr-to-mask.ll
index 05a09cb99fc..ff237799783 100644
--- a/llvm/test/CodeGen/X86/gpr-to-mask.ll
+++ b/llvm/test/CodeGen/X86/gpr-to-mask.ll
@@ -60,13 +60,11 @@ define void @test_fcmp_storei1(i1 %cond, float* %fptr, i1* %iptr, float %f1, flo
 ; X86-64-NEXT:    je .LBB1_2
 ; X86-64-NEXT:  # %bb.1: # %if
 ; X86-64-NEXT:    vcmpeqss %xmm1, %xmm0, %k0
-; X86-64-NEXT:    jmp .LBB1_3
+; X86-64-NEXT:    kmovb %k0, (%rdx)
+; X86-64-NEXT:    retq
 ; X86-64-NEXT:  .LBB1_2: # %else
 ; X86-64-NEXT:    vcmpeqss %xmm3, %xmm2, %k0
-; X86-64-NEXT:  .LBB1_3: # %exit
-; X86-64-NEXT:    kmovd %k0, %eax
-; X86-64-NEXT:    andb $1, %al
-; X86-64-NEXT:    movb %al, (%rdx)
+; X86-64-NEXT:    kmovb %k0, (%rdx)
 ; X86-64-NEXT:    retq
 ;
 ; X86-32-LABEL: test_fcmp_storei1:
@@ -77,14 +75,12 @@ define void @test_fcmp_storei1(i1 %cond, float* %fptr, i1* %iptr, float %f1, flo
 ; X86-32-NEXT:  # %bb.1: # %if
 ; X86-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-32-NEXT:    vcmpeqss {{[0-9]+}}(%esp), %xmm0, %k0
-; X86-32-NEXT:    jmp .LBB1_3
+; X86-32-NEXT:    kmovb %k0, (%eax)
+; X86-32-NEXT:    retl
 ; X86-32-NEXT:  .LBB1_2: # %else
 ; X86-32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-32-NEXT:    vcmpeqss {{[0-9]+}}(%esp), %xmm0, %k0
-; X86-32-NEXT:  .LBB1_3: # %exit
-; X86-32-NEXT:    kmovd %k0, %ecx
-; X86-32-NEXT:    andb $1, %cl
-; X86-32-NEXT:    movb %cl, (%eax)
+; X86-32-NEXT:    kmovb %k0, (%eax)
 ; X86-32-NEXT:    retl
 entry:
   br i1 %cond, label %if, label %else

