-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp         | 70
-rw-r--r--  llvm/test/CodeGen/X86/masked_gather_scatter.ll  | 64
2 files changed, 48 insertions(+), 86 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 052300d6f72..54ca3721cf5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42572,16 +42572,17 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
SDValue Base = GorS->getBasePtr();
SDValue Scale = GorS->getScale();
- // Shrink constant indices if they are larger than 32-bits.
- // Only do this before legalize types since v2i64 could become v2i32.
- // FIXME: We could check that the type is legal if we're after legalize types,
- // but then we would need to construct test cases where that happens.
- // FIXME: We could support more than just constant vectors, but we need to
- // careful with costing. A truncate that can be optimized out would be fine.
- // Otherwise we might only want to create a truncate if it avoids a split.
if (DCI.isBeforeLegalize()) {
+ unsigned IndexWidth = Index.getScalarValueSizeInBits();
+
+ // Shrink constant indices if they are larger than 32-bits.
+ // Only do this before legalize types since v2i64 could become v2i32.
+ // FIXME: We could check that the type is legal if we're after legalize
+ // types, but then we would need to construct test cases where that happens.
+ // FIXME: We could support more than just constant vectors, but we need to
+ // careful with costing. A truncate that can be optimized out would be fine.
+ // Otherwise we might only want to create a truncate if it avoids a split.
if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
- unsigned IndexWidth = Index.getScalarValueSizeInBits();
if (BV->isConstant() && IndexWidth > 32 &&
DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
unsigned NumElts = Index.getValueType().getVectorNumElements();
@@ -42604,16 +42605,18 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Scatter->getIndexType());
}
}
- }
- if (DCI.isBeforeLegalizeOps()) {
- // Remove any sign extends from 32 or smaller to larger than 32.
- // Only do this before LegalizeOps in case we need the sign extend for
- // legalization.
- if (Index.getOpcode() == ISD::SIGN_EXTEND &&
- Index.getScalarValueSizeInBits() > 32 &&
- Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
- Index = Index.getOperand(0);
+ // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
+ // there are sufficient sign bits. Only do this before legalize types to
+ // avoid creating illegal types in truncate.
+ if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
+ Index.getOpcode() == ISD::ZERO_EXTEND) &&
+ IndexWidth > 32 &&
+ Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
+ DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
+ unsigned NumElts = Index.getValueType().getVectorNumElements();
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
SDValue Ops[] = { Chain, Gather->getPassThru(),
Mask, Base, Index, Scale } ;
@@ -42630,11 +42633,14 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Ops, Scatter->getMemOperand(),
Scatter->getIndexType());
}
+ }
+
+ if (DCI.isBeforeLegalizeOps()) {
+ unsigned IndexWidth = Index.getScalarValueSizeInBits();
// Make sure the index is either i32 or i64
- unsigned ScalarSize = Index.getScalarValueSizeInBits();
- if (ScalarSize != 32 && ScalarSize != 64) {
- MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
+ if (IndexWidth != 32 && IndexWidth != 64) {
+ MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
@@ -42654,30 +42660,6 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Ops, Scatter->getMemOperand(),
Scatter->getIndexType());
}
-
- // Try to remove zero extends from 32->64 if we know the sign bit of
- // the input is zero.
- if (Index.getOpcode() == ISD::ZERO_EXTEND &&
- Index.getScalarValueSizeInBits() == 64 &&
- Index.getOperand(0).getScalarValueSizeInBits() == 32 &&
- DAG.SignBitIsZero(Index.getOperand(0))) {
- Index = Index.getOperand(0);
- if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
- SDValue Ops[] = { Chain, Gather->getPassThru(),
- Mask, Base, Index, Scale } ;
- return DAG.getMaskedGather(Gather->getVTList(),
- Gather->getMemoryVT(), DL, Ops,
- Gather->getMemOperand(),
- Gather->getIndexType());
- }
- auto *Scatter = cast<MaskedScatterSDNode>(GorS);
- SDValue Ops[] = { Chain, Scatter->getValue(),
- Mask, Base, Index, Scale };
- return DAG.getMaskedScatter(Scatter->getVTList(),
- Scatter->getMemoryVT(), DL,
- Ops, Scatter->getMemOperand(),
- Scatter->getIndexType());
- }
}
// With vector masks we only demand the upper bit of the mask.
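(Illustration, not part of the patch.) The IR shape this combine now narrows is a masked gather or scatter whose index vector was sign- or zero-extended from 32 bits or fewer, so that ComputeNumSignBits proves the upper 32 bits of each 64-bit lane carry no information. A minimal sketch, reusing the gather intrinsic already declared in the test file below; the function name narrow_index_sketch is made up for the example:

declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)

define <8 x float> @narrow_index_sketch(float* %base, <8 x i8> %ind) {
  ; Zero-extending from i8 guarantees the top 32 bits of every index are zero.
  %ext = zext <8 x i8> %ind to <8 x i64>
  ; The scalar base pointer is splatted across the lanes and offset by the indices.
  %ptrs = getelementptr float, float* %base, <8 x i64> %ext
  %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
  ret <8 x float> %res
}

Previously the <8 x i64> index forced a 64-bit-index (qps) gather, and the 16-element variant even had to be split in two; with the extend truncated back to i32 elements the backend can select a single dword-index gather (vgatherdps), which is what the updated checks below verify.
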
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index fe870e51538..2e4edb99094 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -2693,56 +2693,32 @@ declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1
define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
; KNL_64-LABEL: zext_i8_index:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL_64-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL_64-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
-; KNL_64-NEXT: kxnorw %k0, %k0, %k2
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: zext_i8_index:
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL_32-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: kxnorw %k0, %k0, %k2
-; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT: retl
;
; SKX-LABEL: zext_i8_index:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; SKX-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: kxnorw %k0, %k0, %k2
-; SKX-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
-; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: zext_i8_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; SKX_32-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; SKX_32-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: kxnorw %k0, %k0, %k2
-; SKX_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
-; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; SKX_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT: retl
%zext_ind = zext <16 x i8> %ind to <16 x i64>
@@ -2756,32 +2732,36 @@ define <16 x float> @zext_i8_index(float* %base, <16 x i8> %ind) {
define <8 x float> @zext_v8i8_index(float* %base, <8 x i8> %ind) {
; KNL_64-LABEL: zext_v8i8_index:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; KNL_64-NEXT: kxnorw %k0, %k0, %k1
-; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; KNL_64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_64-NEXT: movw $255, %ax
+; KNL_64-NEXT: kmovw %eax, %k1
+; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: zext_v8i8_index:
; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; KNL_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_32-NEXT: movw $255, %cx
+; KNL_32-NEXT: kmovw %ecx, %k1
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
+; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: zext_v8i8_index:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: zext_v8i8_index:
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: vpmovzxbq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; SKX_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; SKX_32-NEXT: retl
%zext_ind = zext <8 x i8> %ind to <8 x i64>
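(Aside, not from the patch: the RUN lines at the top of masked_gather_scatter.ll are not shown in these hunks, but output like the KNL_64 checks above can be reproduced with roughly llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < llvm/test/CodeGen/X86/masked_gather_scatter.ll, confirming that zext_i8_index now selects one vgatherdps instead of two vgatherqps plus a vinsertf64x4.)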