-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp  69
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td    29
-rw-r--r--  llvm/test/CodeGen/X86/avx512-mask-op.ll   8
3 files changed, 68 insertions, 38 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 18f32768a06..e8b3f3656b6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1177,6 +1177,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+ // There is no byte sized k-register load or store without AVX512DQ.
+ if (!Subtarget.hasDQI()) {
+ setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
+
+ setOperationAction(ISD::STORE, MVT::v1i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v8i1, Custom);
+ }
+
// Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
@@ -18983,6 +18996,30 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
+static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+ EVT VT = St->getValue().getValueType();
+ SDLoc dl(St);
+ SDValue StoredVal = St->getOperand(1);
+
+  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
+ assert(VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ VT.getVectorNumElements() <= 8 && "Unexpected VT");
+ assert(!St->isTruncatingStore() && "Expected non-truncating store");
+ assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+ "Expected AVX512F without AVX512DQI");
+
+ StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+ DAG.getUNDEF(MVT::v8i1), StoredVal,
+ DAG.getIntPtrConstant(0, dl));
+ StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
+
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
+}
+
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
@@ -18990,20 +19027,41 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
-static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT RegVT = Op.getSimpleValueType();
assert(RegVT.isVector() && "We only custom lower vector sext loads.");
assert(RegVT.isInteger() &&
"We only custom lower integer vector sext loads.");
- // Nothing useful we can do without SSE2 shuffles.
- assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
-
LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
SDLoc dl(Ld);
EVT MemVT = Ld->getMemoryVT();
+ // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
+ if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
+ assert(EVT(RegVT) == MemVT && "Expected non-extending load");
+ assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
+ assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+ "Expected AVX512F without AVX512DQI");
+
+ SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+
+ // Replace chain users with the new chain.
+ assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
+
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
+ DAG.getBitcast(MVT::v8i1, NewLd),
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
+ }
+
+ // Nothing useful we can do without SSE2 shuffles.
+ assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned RegSz = RegVT.getSizeInBits();
@@ -24766,7 +24824,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
- case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
+ case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
+ case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
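
As an illustration of the store path added above (not taken from this patch; the function name and types are invented), IR like the following is what LowerStore now handles on AVX512F targets without AVX512DQ, by widening the mask to v8i1 and bitcasting it to i8 so a single byte store can be emitted:

; Illustrative IR only: a <4 x i1> mask stored to memory. Without AVX512DQ
; there is no byte-sized k-register store, so LowerStore widens the mask to
; v8i1, bitcasts it to i8, and emits an ordinary i8 store.
define void @store_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1>* %p) {
  %m = icmp eq <4 x i32> %a, %b
  store <4 x i1> %m, <4 x i1>* %p
  ret void
}
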
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 91e4aca1489..59ed5fd6f31 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2816,40 +2816,15 @@ let Predicates = [HasDQI] in {
def : Pat<(store VK1:$src, addr:$dst),
(KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
+ def : Pat<(v1i1 (load addr:$src)),
+ (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>;
def : Pat<(v2i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
def : Pat<(v4i1 (load addr:$src)),
(COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
}
-let Predicates = [HasAVX512, NoDQI] in {
- def : Pat<(store VK1:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
- sub_8bit)))>;
- def : Pat<(store VK2:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK2:$src, GR32)),
- sub_8bit)))>;
- def : Pat<(store VK4:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK4:$src, GR32)),
- sub_8bit)))>;
- def : Pat<(store VK8:$src, addr:$dst),
- (MOV8mr addr:$dst,
- (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)),
- sub_8bit)))>;
-
- def : Pat<(v8i1 (load addr:$src)),
- (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
- def : Pat<(v2i1 (load addr:$src)),
- (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK2)>;
- def : Pat<(v4i1 (load addr:$src)),
- (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK4)>;
-}
let Predicates = [HasAVX512] in {
- def : Pat<(v1i1 (load addr:$src)),
- (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK1)>;
def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
(COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
}
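
For the load side, a sketch (again not from the patch; the function name is invented) of IR that used to be matched by the removed NoDQI patterns: LowerLoad should now emit a plain i8 load plus a bitcast to v8i1, which the surviving HasAVX512 bitconvert pattern above can still select.

; Illustrative IR only: loading an <8 x i1> mask. LowerLoad now produces an
; i8 load bitcast to v8i1 (extracting a narrower subvector for v1i1/v2i1/v4i1),
; so the per-type NoDQI load patterns are no longer needed.
define <8 x i1> @load_v8i1(<8 x i1>* %p) {
  %m = load <8 x i1>, <8 x i1>* %p
  ret <8 x i1> %m
}
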
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 88d5ed4f7ad..1038c90ab9d 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -348,9 +348,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
define i8 @conv1(<8 x i1>* %R) {
; KNL-LABEL: conv1:
; KNL: ## %bb.0: ## %entry
-; KNL-NEXT: kxnorw %k0, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: movb $-1, (%rdi)
; KNL-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
; KNL-NEXT: movb $-2, %al
; KNL-NEXT: retq
@@ -365,9 +363,7 @@ define i8 @conv1(<8 x i1>* %R) {
;
; AVX512BW-LABEL: conv1:
; AVX512BW: ## %bb.0: ## %entry
-; AVX512BW-NEXT: kxnorw %k0, %k0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: movb %al, (%rdi)
+; AVX512BW-NEXT: movb $-1, (%rdi)
; AVX512BW-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
; AVX512BW-NEXT: movb $-2, %al
; AVX512BW-NEXT: retq