author     Sanjay Patel <spatel@rotateright.com>   2018-06-13 12:28:32 +0000
committer  Sanjay Patel <spatel@rotateright.com>   2018-06-13 12:28:32 +0000
commit     b983ac6fe1d90b8d117c5e08054bf5a0b160c3a7 (patch)
tree       171b91fe28c2a261ee49754f3cf54529ea7d3b5b
parent     96f492d7df9e2bc2f2d76afb4d26cd59574d969f (diff)
[x86] eliminate even more sign-bit tests with vector select
This shortcoming was noted in D47330, and the test diffs show we already
had other examples where we failed to fold to a SHRUNKBLEND:

  /// Dynamic (non-constant condition) vector blend where only the sign bits
  /// of the condition elements are used. This is used to enforce that the
  /// condition mask is not valid for generic VSELECT optimizations.

This patch implements an idea from D48043 and would obsolete that patch
because it catches more cases (notably the AVX1 case that was missed
there). All we're doing is allowing the existing transform to fire more
often by removing the post-legalize constraint. All of the relevant
feature checks and other predicates are left as-is.

Differential Revision: https://reviews.llvm.org/D48078

llvm-svn: 334592
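The fold is possible because x86's variable blend instructions (vblendvps,
vblendvpd, vpblendvb) select each lane using only the sign bit of the
corresponding mask element, so a separate compare-with-zero computes nothing
the blend does not already consume. A minimal scalar model of one lane, as a
hedged C++ sketch (names are illustrative, not code from this patch):

  // One lane of an x86 variable blend vs. the IR "icmp slt + select"
  // pattern; illustrative sketch only, not LLVM or patch code.
  #include <cassert>
  #include <cstdint>

  // vblendv-style lane select: hardware reads ONLY the mask's sign bit.
  static int8_t blendv_lane(int8_t x, int8_t y, int8_t mask) {
    return (mask & 0x80) ? x : y;
  }

  // The pattern "%c = icmp slt %mask, 0; select %c, %x, %y" per lane.
  static int8_t select_slt_zero(int8_t x, int8_t y, int8_t mask) {
    return (mask < 0) ? x : y;
  }

  int main() {
    // (mask < 0) holds exactly when the sign bit is set, so the explicit
    // compare before the blend is redundant for every possible mask.
    for (int m = -128; m <= 127; ++m)
      assert(blendv_lane(1, 2, (int8_t)m) == select_slt_zero(1, 2, (int8_t)m));
    return 0;
  }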
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp   9
-rw-r--r--  llvm/test/CodeGen/X86/vsel-cmp-load.ll     9
-rw-r--r--  llvm/test/CodeGen/X86/vselect-pcmp.ll     19
3 files changed, 8 insertions, 29 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0413f0f00b0..9f03df35570 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32373,14 +32373,14 @@ static SDValue combineVSelectToShrunkBlend(SDNode *N, SelectionDAG &DAG,
                                            TargetLowering::DAGCombinerInfo &DCI,
                                            const X86Subtarget &Subtarget) {
   SDValue Cond = N->getOperand(0);
-  if (N->getOpcode() != ISD::VSELECT || !DCI.isBeforeLegalizeOps() ||
-      DCI.isBeforeLegalize() ||
+  if (N->getOpcode() != ISD::VSELECT ||
       ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
     return SDValue();
 
-  // Don't optimize vector selects that map to mask-registers.
+  // Don't optimize before the condition has been transformed to a legal type
+  // and don't ever optimize vector selects that map to AVX512 mask-registers.
   unsigned BitWidth = Cond.getScalarValueSizeInBits();
-  if (BitWidth == 1)
+  if (BitWidth < 8 || BitWidth > 64)
     return SDValue();
 
   // We can only handle the cases where VSELECT is directly legal on the
@@ -32418,7 +32418,6 @@ static SDValue combineVSelectToShrunkBlend(SDNode *N, SelectionDAG &DAG,
     if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
       return SDValue();
 
-  assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
   APInt DemandedMask(APInt::getSignMask(BitWidth));
   KnownBits Known;
   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
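Mechanically, the combine works by telling the demanded-bits analysis that
only the sign bit of each condition element matters; SimplifyDemandedBits can
then strip a compare-with-zero that feeds the blend. A standalone sketch of
how that demanded mask is formed, assuming LLVM's ADT headers are available
(illustrative, not patch code):

  #include "llvm/ADT/APInt.h"
  #include <cassert>

  int main() {
    // For e.g. a <8 x i32> condition, each element is 32 bits wide and
    // only its sign bit is demanded:
    unsigned BitWidth = 32;
    llvm::APInt DemandedMask = llvm::APInt::getSignMask(BitWidth);
    assert(DemandedMask.getZExtValue() == 0x80000000u);
    // With this mask, SimplifyDemandedBits can replace a condition such
    // as "pcmpgt(0, x)" with x itself, since x has the identical sign bit.
    return 0;
  }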
diff --git a/llvm/test/CodeGen/X86/vsel-cmp-load.ll b/llvm/test/CodeGen/X86/vsel-cmp-load.ll
index b9f18fe89a5..d317377a93a 100644
--- a/llvm/test/CodeGen/X86/vsel-cmp-load.ll
+++ b/llvm/test/CodeGen/X86/vsel-cmp-load.ll
@@ -118,9 +118,6 @@ define <8 x i32> @slt_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) {
 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
@@ -128,8 +125,6 @@ define <8 x i32> @slt_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) {
 ; AVX2-LABEL: slt_zero:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovsxbd (%rdi), %ymm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
@@ -251,8 +246,6 @@ define <4 x double> @sgt_zero_fp_select(<4 x i8>* %p, <4 x double> %x, <4 x doub
   ret <4 x double> %sel
 }
 
-; FIXME: The compare with 0 for AVX2 should be eliminated.
-
 define <8 x float> @slt_zero_fp_select(<8 x i16>* %p, <8 x float> %x, <8 x float> %y) {
 ; AVX1-LABEL: slt_zero_fp_select:
 ; AVX1: # %bb.0:
@@ -265,8 +258,6 @@ define <8 x float> @slt_zero_fp_select(<8 x i16>* %p, <8 x float> %x, <8 x float
 ; AVX2-LABEL: slt_zero_fp_select:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpmovsxwd (%rdi), %ymm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
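The vsel-cmp-load.ll diffs above also show the transform firing when the mask
is produced by a sign-extending load (vpmovsxbd, vpmovsxwd): sign extension
replicates the source's sign bit into every widened bit, so testing the
widened value against zero is equivalent to testing the original element. A
small C++ illustration (a sketch with made-up names, not test code):

  #include <cstdint>

  // One lane of vpmovsxwd: widen i16 -> i32 with sign extension.
  constexpr int32_t sext16(int16_t m) { return m; }

  // The widened mask is negative exactly when the original was, so the
  // sign bit survives widening and the compare-with-zero can be dropped.
  static_assert((sext16(-5) < 0) && !(sext16(5) < 0), "sign bit preserved");

  int main() { return 0; }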
diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll
index 3b41981b2dd..d9938a3cdaf 100644
--- a/llvm/test/CodeGen/X86/vselect-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll
@@ -9,20 +9,11 @@
 ; Test 128-bit vectors for all legal element types.
 
-; FIXME: Why didn't AVX-512 optimize too?
-
 define <16 x i8> @signbit_sel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) {
-; AVX12-LABEL: signbit_sel_v16i8:
-; AVX12: # %bb.0:
-; AVX12-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX12-NEXT: retq
-;
-; AVX512-LABEL: signbit_sel_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: signbit_sel_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
   %tr = icmp slt <16 x i8> %mask, zeroinitializer
   %z = select <16 x i1> %tr, <16 x i8> %x, <16 x i8> %y
   ret <16 x i8> %z
 }
@@ -180,8 +171,6 @@ define <32 x i8> @signbit_sel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %mask)
 ;
 ; AVX512-LABEL: signbit_sel_v32i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT: retq
   %tr = icmp slt <32 x i8> %mask, zeroinitializer