Diffstat (limited to 'llvm')
-rw-r--r--  llvm/include/llvm/CodeGen/SelectionDAG.h | 19
-rw-r--r--  llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 53
-rw-r--r--  llvm/include/llvm/CodeGen/TargetLowering.h | 114
-rw-r--r--  llvm/include/llvm/Target/TargetSelectionDAG.td | 10
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 177
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 16
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 36
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 63
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 13
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 8
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td | 27
-rw-r--r--  llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 118
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 74
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrMVE.td | 118
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 44
-rw-r--r--  llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 35
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll | 2
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll | 19
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll | 59
-rw-r--r--  llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll | 49
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll | 240
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll | 240
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-masked-load.ll | 60
-rw-r--r--  llvm/test/CodeGen/Thumb2/mve-masked-store.ll | 60
25 files changed, 907 insertions, 749 deletions
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index a0e37a19b37..8387e9a0e61 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1136,14 +1136,19 @@ public:
/// Returns sum of the base pointer and offset.
SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, const SDLoc &DL);
- SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr,
- SDValue Mask, SDValue Src0, EVT MemVT,
- MachineMemOperand *MMO, ISD::LoadExtType,
- bool IsExpanding = false);
+ SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base,
+ SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT,
+ MachineMemOperand *MMO, ISD::MemIndexedMode AM,
+ ISD::LoadExtType, bool IsExpanding = false);
+ SDValue getIndexedMaskedLoad(SDValue OrigLoad, const SDLoc &dl, SDValue Base,
+ SDValue Offset, ISD::MemIndexedMode AM);
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val,
- SDValue Ptr, SDValue Mask, EVT MemVT,
- MachineMemOperand *MMO, bool IsTruncating = false,
- bool IsCompressing = false);
+ SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT,
+ MachineMemOperand *MMO, ISD::MemIndexedMode AM,
+ bool IsTruncating = false, bool IsCompressing = false);
+ SDValue getIndexedMaskedStore(SDValue OrigStore, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM);
SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
ISD::MemIndexType IndexType);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 3b799f96731..e18278f8cdc 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -553,6 +553,7 @@ BEGIN_TWO_BYTE_PACK()
class LSBaseSDNodeBitfields {
friend class LSBaseSDNode;
+ friend class MaskedLoadStoreSDNode;
friend class MaskedGatherScatterSDNode;
uint16_t : NumMemSDNodeBits;
@@ -560,6 +561,7 @@ BEGIN_TWO_BYTE_PACK()
// This storage is shared between disparate class hierarchies to hold an
// enumeration specific to the class hierarchy in use.
// LSBaseSDNode => enum ISD::MemIndexedMode
+ // MaskedLoadStoreBaseSDNode => enum ISD::MemIndexedMode
// MaskedGatherScatterSDNode => enum ISD::MemIndexType
uint16_t AddressingMode : 3;
};
@@ -2273,19 +2275,38 @@ public:
friend class SelectionDAG;
MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order,
- const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ const DebugLoc &dl, SDVTList VTs,
+ ISD::MemIndexedMode AM, EVT MemVT,
MachineMemOperand *MMO)
- : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {}
+ : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {
+ LSBaseSDNodeBits.AddressingMode = AM;
+ assert(getAddressingMode() == AM && "Value truncated");
+ }
- // MaskedLoadSDNode (Chain, ptr, mask, passthru)
- // MaskedStoreSDNode (Chain, data, ptr, mask)
+ // MaskedLoadSDNode (Chain, ptr, offset, mask, passthru)
+ // MaskedStoreSDNode (Chain, data, ptr, offset, mask)
// Mask is a vector of i1 elements
const SDValue &getBasePtr() const {
return getOperand(getOpcode() == ISD::MLOAD ? 1 : 2);
}
- const SDValue &getMask() const {
+ const SDValue &getOffset() const {
return getOperand(getOpcode() == ISD::MLOAD ? 2 : 3);
}
+ const SDValue &getMask() const {
+ return getOperand(getOpcode() == ISD::MLOAD ? 3 : 4);
+ }
+
+ /// Return the addressing mode for this load or store:
+ /// unindexed, pre-inc, pre-dec, post-inc, or post-dec.
+ ISD::MemIndexedMode getAddressingMode() const {
+ return static_cast<ISD::MemIndexedMode>(LSBaseSDNodeBits.AddressingMode);
+ }
+
+ /// Return true if this is a pre/post inc/dec load/store.
+ bool isIndexed() const { return getAddressingMode() != ISD::UNINDEXED; }
+
+ /// Return true if this is NOT a pre/post inc/dec load/store.
+ bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::MLOAD ||
@@ -2299,9 +2320,9 @@ public:
friend class SelectionDAG;
MaskedLoadSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
- ISD::LoadExtType ETy, bool IsExpanding, EVT MemVT,
- MachineMemOperand *MMO)
- : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, VTs, MemVT, MMO) {
+ ISD::MemIndexedMode AM, ISD::LoadExtType ETy,
+ bool IsExpanding, EVT MemVT, MachineMemOperand *MMO)
+ : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, VTs, AM, MemVT, MMO) {
LoadSDNodeBits.ExtTy = ETy;
LoadSDNodeBits.IsExpanding = IsExpanding;
}
@@ -2311,8 +2332,9 @@ public:
}
const SDValue &getBasePtr() const { return getOperand(1); }
- const SDValue &getMask() const { return getOperand(2); }
- const SDValue &getPassThru() const { return getOperand(3); }
+ const SDValue &getOffset() const { return getOperand(2); }
+ const SDValue &getMask() const { return getOperand(3); }
+ const SDValue &getPassThru() const { return getOperand(4); }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::MLOAD;
@@ -2327,9 +2349,9 @@ public:
friend class SelectionDAG;
MaskedStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
- bool isTrunc, bool isCompressing, EVT MemVT,
- MachineMemOperand *MMO)
- : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, VTs, MemVT, MMO) {
+ ISD::MemIndexedMode AM, bool isTrunc, bool isCompressing,
+ EVT MemVT, MachineMemOperand *MMO)
+ : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, VTs, AM, MemVT, MMO) {
StoreSDNodeBits.IsTruncating = isTrunc;
StoreSDNodeBits.IsCompressing = isCompressing;
}
@@ -2345,9 +2367,10 @@ public:
/// memory at base_addr.
bool isCompressingStore() const { return StoreSDNodeBits.IsCompressing; }
- const SDValue &getValue() const { return getOperand(1); }
+ const SDValue &getValue() const { return getOperand(1); }
const SDValue &getBasePtr() const { return getOperand(2); }
- const SDValue &getMask() const { return getOperand(3); }
+ const SDValue &getOffset() const { return getOperand(3); }
+ const SDValue &getMask() const { return getOperand(4); }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::MSTORE;
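
As a usage sketch of the accessors introduced above (illustrative only, not part of the patch; the includes and the caller of this hypothetical helper are assumed), a combine or target hook could read the new MLOAD operand layout like this:

    // Sketch: reading the new masked-load operand layout
    // (Chain, ptr, offset, mask, passthru) via the accessors added here.
    #include "llvm/CodeGen/SelectionDAGNodes.h"
    #include <cassert>
    using namespace llvm;

    static void inspectMaskedLoad(SDNode *N) {
      auto *MLD = cast<MaskedLoadSDNode>(N);    // N is assumed to be ISD::MLOAD
      SDValue Base = MLD->getBasePtr();         // operand 1
      SDValue Offset = MLD->getOffset();        // operand 2, undef when unindexed
      SDValue Mask = MLD->getMask();            // operand 3
      SDValue PassThru = MLD->getPassThru();    // operand 4
      ISD::MemIndexedMode AM = MLD->getAddressingMode();
      // Unindexed masked loads carry an undef offset; indexed ones also
      // produce an extra result with the updated base pointer.
      assert(MLD->isIndexed() || Offset.isUndef());
      (void)Base; (void)Offset; (void)Mask; (void)PassThru; (void)AM;
    }
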
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index fa84d0efbde..825cb712f7c 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1110,12 +1110,8 @@ public:
/// Return how the indexed load should be treated: either it is legal, needs
/// to be promoted to a larger size, needs to be expanded to some other code
/// sequence, or the target has a custom expander for it.
- LegalizeAction
- getIndexedLoadAction(unsigned IdxMode, MVT VT) const {
- assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
- "Table isn't big enough!");
- unsigned Ty = (unsigned)VT.SimpleTy;
- return (LegalizeAction)((IndexedModeActions[Ty][IdxMode] & 0xf0) >> 4);
+ LegalizeAction getIndexedLoadAction(unsigned IdxMode, MVT VT) const {
+ return getIndexedModeAction(IdxMode, VT, IMAB_Load);
}
/// Return true if the specified indexed load is legal on this target.
@@ -1128,12 +1124,8 @@ public:
/// Return how the indexed store should be treated: either it is legal, needs
/// to be promoted to a larger size, needs to be expanded to some other code
/// sequence, or the target has a custom expander for it.
- LegalizeAction
- getIndexedStoreAction(unsigned IdxMode, MVT VT) const {
- assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
- "Table isn't big enough!");
- unsigned Ty = (unsigned)VT.SimpleTy;
- return (LegalizeAction)(IndexedModeActions[Ty][IdxMode] & 0x0f);
+ LegalizeAction getIndexedStoreAction(unsigned IdxMode, MVT VT) const {
+ return getIndexedModeAction(IdxMode, VT, IMAB_Store);
}
/// Return true if the specified indexed load is legal on this target.
@@ -1143,6 +1135,34 @@ public:
getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Custom);
}
+ /// Return how the indexed load should be treated: either it is legal, needs
+ /// to be promoted to a larger size, needs to be expanded to some other code
+ /// sequence, or the target has a custom expander for it.
+ LegalizeAction getIndexedMaskedLoadAction(unsigned IdxMode, MVT VT) const {
+ return getIndexedModeAction(IdxMode, VT, IMAB_MaskedLoad);
+ }
+
+ /// Return true if the specified indexed masked load is legal on this target.
+ bool isIndexedMaskedLoadLegal(unsigned IdxMode, EVT VT) const {
+ return VT.isSimple() &&
+ (getIndexedMaskedLoadAction(IdxMode, VT.getSimpleVT()) == Legal ||
+ getIndexedMaskedLoadAction(IdxMode, VT.getSimpleVT()) == Custom);
+ }
+
+ /// Return how the indexed store should be treated: either it is legal, needs
+ /// to be promoted to a larger size, needs to be expanded to some other code
+ /// sequence, or the target has a custom expander for it.
+ LegalizeAction getIndexedMaskedStoreAction(unsigned IdxMode, MVT VT) const {
+ return getIndexedModeAction(IdxMode, VT, IMAB_MaskedStore);
+ }
+
+ /// Return true if the specified indexed masked store is legal on this target.
+ bool isIndexedMaskedStoreLegal(unsigned IdxMode, EVT VT) const {
+ return VT.isSimple() &&
+ (getIndexedMaskedStoreAction(IdxMode, VT.getSimpleVT()) == Legal ||
+ getIndexedMaskedStoreAction(IdxMode, VT.getSimpleVT()) == Custom);
+ }
+
/// Return how the condition code should be treated: either it is legal, needs
/// to be expanded to some other code sequence, or the target has a custom
/// expander for it.
@@ -2030,13 +2050,8 @@ protected:
///
/// NOTE: All indexed mode loads are initialized to Expand in
/// TargetLowering.cpp
- void setIndexedLoadAction(unsigned IdxMode, MVT VT,
- LegalizeAction Action) {
- assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
- (unsigned)Action < 0xf && "Table isn't big enough!");
- // Load action are kept in the upper half.
- IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0xf0;
- IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] |= ((uint8_t)Action) <<4;
+ void setIndexedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action) {
+ setIndexedModeAction(IdxMode, VT, IMAB_Load, Action);
}
/// Indicate that the specified indexed store does or does not work with the
@@ -2044,13 +2059,28 @@ protected:
///
/// NOTE: All indexed mode stores are initialized to Expand in
/// TargetLowering.cpp
- void setIndexedStoreAction(unsigned IdxMode, MVT VT,
- LegalizeAction Action) {
- assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
- (unsigned)Action < 0xf && "Table isn't big enough!");
- // Store action are kept in the lower half.
- IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0x0f;
- IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] |= ((uint8_t)Action);
+ void setIndexedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action) {
+ setIndexedModeAction(IdxMode, VT, IMAB_Store, Action);
+ }
+
+ /// Indicate that the specified indexed masked load does or does not work with
+ /// the specified type and indicate what to do about it.
+ ///
+ /// NOTE: All indexed mode masked loads are initialized to Expand in
+ /// TargetLowering.cpp
+ void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT,
+ LegalizeAction Action) {
+ setIndexedModeAction(IdxMode, VT, IMAB_MaskedLoad, Action);
+ }
+
+ /// Indicate that the specified indexed masked store does or does not work
+ /// with the specified type and indicate what to do about it.
+ ///
+ /// NOTE: All indexed mode masked stores are initialized to Expand in
+ /// TargetLowering.cpp
+ void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT,
+ LegalizeAction Action) {
+ setIndexedModeAction(IdxMode, VT, IMAB_MaskedStore, Action);
}
/// Indicate that the specified condition code is or isn't supported on the
@@ -2763,13 +2793,13 @@ private:
/// truncating store of a specific value type and truncating type is legal.
LegalizeAction TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE];
- /// For each indexed mode and each value type, keep a pair of LegalizeAction
+ /// For each indexed mode and each value type, keep a quad of LegalizeAction
/// that indicates how instruction selection should deal with the load /
- /// store.
+ /// store / maskedload / maskedstore.
///
/// The first dimension is the value_type for the reference. The second
/// dimension represents the various modes for load store.
- uint8_t IndexedModeActions[MVT::LAST_VALUETYPE][ISD::LAST_INDEXED_MODE];
+ uint16_t IndexedModeActions[MVT::LAST_VALUETYPE][ISD::LAST_INDEXED_MODE];
/// For each condition code (ISD::CondCode) keep a LegalizeAction that
/// indicates how instruction selection should deal with the condition code.
@@ -2812,6 +2842,32 @@ private:
/// Set default libcall names and calling conventions.
void InitLibcalls(const Triple &TT);
+ /// The bits of IndexedModeActions used to store the legalisation actions.
+ /// We store the data as | ML | MS | L | S | with each field taking 4 bits.
+ enum IndexedModeActionsBits {
+ IMAB_Store = 0,
+ IMAB_Load = 4,
+ IMAB_MaskedStore = 8,
+ IMAB_MaskedLoad = 12
+ };
+
+ void setIndexedModeAction(unsigned IdxMode, MVT VT, unsigned Shift,
+ LegalizeAction Action) {
+ assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
+ (unsigned)Action < 0xf && "Table isn't big enough!");
+ unsigned Ty = (unsigned)VT.SimpleTy;
+ IndexedModeActions[Ty][IdxMode] &= ~(0xf << Shift);
+ IndexedModeActions[Ty][IdxMode] |= ((uint16_t)Action) << Shift;
+ }
+
+ LegalizeAction getIndexedModeAction(unsigned IdxMode, MVT VT,
+ unsigned Shift) const {
+ assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
+ "Table isn't big enough!");
+ unsigned Ty = (unsigned)VT.SimpleTy;
+ return (LegalizeAction)((IndexedModeActions[Ty][IdxMode] >> Shift) & 0xf);
+ }
+
protected:
/// Return true if the extension represented by \p I is free.
/// \pre \p I is a sign, zero, or fp extension and
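
The 16-bit packing described above (| ML | MS | L | S |, 4 bits per field) can be illustrated with a small standalone sketch; the enum and helpers below are simplified stand-ins for LegalizeAction and set/getIndexedModeAction, not the real headers:

    // Standalone sketch of one IndexedModeActions[VT][IdxMode] slot.
    #include <cassert>
    #include <cstdint>

    enum LegalizeAction : uint8_t { Legal = 0, Promote, Expand, LibCall, Custom };
    enum IndexedModeActionsBits { IMAB_Store = 0, IMAB_Load = 4,
                                  IMAB_MaskedStore = 8, IMAB_MaskedLoad = 12 };

    static void setAction(uint16_t &Entry, unsigned Shift, LegalizeAction Action) {
      assert((unsigned)Action < 0xf && "Action must fit in 4 bits");
      Entry &= ~(0xf << Shift);            // clear the old 4-bit field
      Entry |= (uint16_t)Action << Shift;  // write the new one
    }

    static LegalizeAction getAction(uint16_t Entry, unsigned Shift) {
      return (LegalizeAction)((Entry >> Shift) & 0xf);
    }

    int main() {
      uint16_t Entry = 0;                  // one [VT][IdxMode] table slot
      setAction(Entry, IMAB_MaskedLoad, Legal);
      setAction(Entry, IMAB_Store, Expand);
      assert(getAction(Entry, IMAB_MaskedLoad) == Legal);
      assert(getAction(Entry, IMAB_Store) == Expand);
      return 0;
    }
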
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 441f3d7d118..9543086c4da 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -224,13 +224,13 @@ def SDTIStore : SDTypeProfile<1, 3, [ // indexed store
SDTCisSameAs<0, 2>, SDTCisPtrTy<0>, SDTCisPtrTy<3>
]>;
-def SDTMaskedStore: SDTypeProfile<0, 3, [ // masked store
- SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>
+def SDTMaskedStore: SDTypeProfile<0, 4, [ // masked store
+ SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisSameNumEltsAs<0, 3>
]>;
-def SDTMaskedLoad: SDTypeProfile<1, 3, [ // masked load
- SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>,
- SDTCisSameNumEltsAs<0, 2>
+def SDTMaskedLoad: SDTypeProfile<1, 4, [ // masked load
+ SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisSameAs<0, 4>,
+ SDTCisSameNumEltsAs<0, 3>
]>;
def SDTVecShuffle : SDTypeProfile<1, 2, [
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 793352c16d3..e6844e556b1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8724,6 +8724,10 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return Chain;
+ // Try transforming N to an indexed store.
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+ return SDValue(N, 0);
+
return SDValue();
}
@@ -8748,6 +8752,10 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return CombineTo(N, MLD->getPassThru(), MLD->getChain());
+ // Try transforming N to an indexed load.
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+ return SDValue(N, 0);
+
return SDValue();
}
@@ -9506,11 +9514,10 @@ static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
SDLoc dl(Ld);
SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru());
- SDValue NewLoad = DAG.getMaskedLoad(VT, dl, Ld->getChain(),
- Ld->getBasePtr(), Ld->getMask(),
- PassThru, Ld->getMemoryVT(),
- Ld->getMemOperand(), ExtLoadType,
- Ld->isExpandingLoad());
+ SDValue NewLoad = DAG.getMaskedLoad(
+ VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(),
+ PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(),
+ ExtLoadType, Ld->isExpandingLoad());
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
return NewLoad;
}
@@ -13612,12 +13619,22 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
EVT VT;
unsigned AS;
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
return false;
VT = LD->getMemoryVT();
AS = LD->getAddressSpace();
- } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
+ if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
+ return false;
+ VT = ST->getMemoryVT();
+ AS = ST->getAddressSpace();
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) {
+ if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
+ return false;
+ VT = LD->getMemoryVT();
+ AS = LD->getAddressSpace();
+ } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) {
if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
return false;
VT = ST->getMemoryVT();
@@ -13651,38 +13668,64 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
VT.getTypeForEVT(*DAG.getContext()), AS);
}
-/// Try turning a load/store into a pre-indexed load/store when the base
-/// pointer is an add or subtract and it has other uses besides the load/store.
-/// After the transformation, the new indexed load/store has effectively folded
-/// the add/subtract in and all of its other uses are redirected to the
-/// new load/store.
-bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
- if (Level < AfterLegalizeDAG)
- return false;
-
- bool isLoad = true;
- SDValue Ptr;
- EVT VT;
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec,
+ bool &IsLoad, bool &IsMasked, SDValue &Ptr,
+ const TargetLowering &TLI) {
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
if (LD->isIndexed())
return false;
- VT = LD->getMemoryVT();
- if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) &&
- !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT))
+ EVT VT = LD->getMemoryVT();
+ if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT))
return false;
Ptr = LD->getBasePtr();
- } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
if (ST->isIndexed())
return false;
- VT = ST->getMemoryVT();
- if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) &&
- !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT))
+ EVT VT = ST->getMemoryVT();
+ if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT))
return false;
Ptr = ST->getBasePtr();
- isLoad = false;
+ IsLoad = false;
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
+ if (LD->isIndexed())
+ return false;
+ EVT VT = LD->getMemoryVT();
+ if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) &&
+ !TLI.isIndexedMaskedLoadLegal(Dec, VT))
+ return false;
+ Ptr = LD->getBasePtr();
+ IsMasked = true;
+ } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
+ if (ST->isIndexed())
+ return false;
+ EVT VT = ST->getMemoryVT();
+ if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) &&
+ !TLI.isIndexedMaskedStoreLegal(Dec, VT))
+ return false;
+ Ptr = ST->getBasePtr();
+ IsLoad = false;
+ IsMasked = true;
} else {
return false;
}
+ return true;
+}
+
+/// Try turning a load/store into a pre-indexed load/store when the base
+/// pointer is an add or subtract and it has other uses besides the load/store.
+/// After the transformation, the new indexed load/store has effectively folded
+/// the add/subtract in and all of its other uses are redirected to the
+/// new load/store.
+bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
+ if (Level < AfterLegalizeDAG)
+ return false;
+
+ bool IsLoad = true;
+ bool IsMasked = false;
+ SDValue Ptr;
+ if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked,
+ Ptr, TLI))
+ return false;
// If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
// out. There is no reason to make this a preinc/predec.
@@ -13724,8 +13767,9 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
return false;
// Check #2.
- if (!isLoad) {
- SDValue Val = cast<StoreSDNode>(N)->getValue();
+ if (!IsLoad) {
+ SDValue Val = IsMasked ? cast<MaskedStoreSDNode>(N)->getValue()
+ : cast<StoreSDNode>(N)->getValue();
// Would require a copy.
if (Val == BasePtr)
@@ -13801,18 +13845,26 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
return false;
SDValue Result;
- if (isLoad)
- Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
- BasePtr, Offset, AM);
- else
- Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
- BasePtr, Offset, AM);
+ if (!IsMasked) {
+ if (IsLoad)
+ Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
+ else
+ Result =
+ DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
+ } else {
+ if (IsLoad)
+ Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
+ Offset, AM);
+ else
+ Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr,
+ Offset, AM);
+ }
++PreIndexedNodes;
++NodesCombined;
LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: ";
Result.getNode()->dump(&DAG); dbgs() << '\n');
WorklistRemover DeadNodes(*this);
- if (isLoad) {
+ if (IsLoad) {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
} else {
@@ -13866,7 +13918,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
// We can now generate the new expression.
SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
- SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0);
+ SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0);
SDValue NewUse = DAG.getNode(Opcode,
DL,
@@ -13876,7 +13928,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
}
// Replace the uses of Ptr with uses of the updated base value.
- DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
+ DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0));
deleteAndRecombine(Ptr.getNode());
AddToWorklist(Result.getNode());
@@ -13891,29 +13943,12 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
if (Level < AfterLegalizeDAG)
return false;
- bool isLoad = true;
+ bool IsLoad = true;
+ bool IsMasked = false;
SDValue Ptr;
- EVT VT;
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
- if (LD->isIndexed())
- return false;
- VT = LD->getMemoryVT();
- if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) &&
- !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT))
- return false;
- Ptr = LD->getBasePtr();
- } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
- if (ST->isIndexed())
- return false;
- VT = ST->getMemoryVT();
- if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) &&
- !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT))
- return false;
- Ptr = ST->getBasePtr();
- isLoad = false;
- } else {
+ if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked,
+ Ptr, TLI))
return false;
- }
if (Ptr.getNode()->hasOneUse())
return false;
@@ -13949,7 +13984,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
// If all the uses are load / store addresses, then don't do the
// transformation.
- if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){
+ if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
bool RealUse = false;
for (SDNode *UseUse : Use->uses()) {
if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI))
@@ -13975,18 +14010,24 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
Worklist.push_back(Op);
if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) &&
!SDNode::hasPredecessorHelper(Op, Visited, Worklist)) {
- SDValue Result = isLoad
- ? DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
- BasePtr, Offset, AM)
- : DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
- BasePtr, Offset, AM);
+ SDValue Result;
+ if (!IsMasked)
+ Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
+ Offset, AM)
+ : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N),
+ BasePtr, Offset, AM);
+ else
+ Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N),
+ BasePtr, Offset, AM)
+ : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),
+ BasePtr, Offset, AM);
++PostIndexedNodes;
++NodesCombined;
LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG);
dbgs() << "\nWith: "; Result.getNode()->dump(&DAG);
dbgs() << '\n');
WorklistRemover DeadNodes(*this);
- if (isLoad) {
+ if (IsLoad) {
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
} else {
@@ -13998,7 +14039,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
// Replace the uses of Use with uses of the updated base value.
DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
- Result.getValue(isLoad ? 1 : 0));
+ Result.getValue(IsLoad ? 1 : 0));
deleteAndRecombine(Op);
return true;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 56c13bb0753..9f8da60eb9a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -592,8 +592,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
SDLoc dl(N);
SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(),
- N->getMask(), ExtPassThru, N->getMemoryVT(),
- N->getMemOperand(), ISD::EXTLOAD);
+ N->getOffset(), N->getMask(), ExtPassThru,
+ N->getMemoryVT(), N->getMemOperand(),
+ N->getAddressingMode(), ISD::EXTLOAD);
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -1485,11 +1486,11 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
SDLoc dl(N);
bool TruncateStore = false;
- if (OpNo == 3) {
+ if (OpNo == 4) {
Mask = PromoteTargetBoolean(Mask, DataVT);
// Update in place.
SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
- NewOps[3] = Mask;
+ NewOps[4] = Mask;
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
} else { // Data operand
assert(OpNo == 1 && "Unexpected operand for promotion");
@@ -1497,14 +1498,15 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
TruncateStore = true;
}
- return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask,
- N->getMemoryVT(), N->getMemOperand(),
+ return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(),
+ N->getOffset(), Mask, N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(),
TruncateStore, N->isCompressingStore());
}
SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
unsigned OpNo) {
- assert(OpNo == 2 && "Only know how to promote the mask!");
+ assert(OpNo == 3 && "Only know how to promote the mask!");
EVT DataVT = N->getValueType(0);
SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 7bca3ea888e..9403b344ea7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1541,12 +1541,15 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
SDValue &Lo, SDValue &Hi) {
+ assert(MLD->isUnindexed() && "Indexed masked load during type legalization!");
EVT LoVT, HiVT;
SDLoc dl(MLD);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
SDValue Ch = MLD->getChain();
SDValue Ptr = MLD->getBasePtr();
+ SDValue Offset = MLD->getOffset();
+ assert(Offset.isUndef() && "Unexpected indexed masked load offset");
SDValue Mask = MLD->getMask();
SDValue PassThru = MLD->getPassThru();
unsigned Alignment = MLD->getOriginalAlignment();
@@ -1578,8 +1581,9 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
Alignment, MLD->getAAInfo(), MLD->getRanges());
- Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, PassThruLo, LoMemVT, MMO,
- ExtType, MLD->isExpandingLoad());
+ Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, Offset, MaskLo, PassThruLo, LoMemVT,
+ MMO, MLD->getAddressingMode(), ExtType,
+ MLD->isExpandingLoad());
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG,
MLD->isExpandingLoad());
@@ -1590,8 +1594,9 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
HiMemVT.getStoreSize(), Alignment, MLD->getAAInfo(),
MLD->getRanges());
- Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, PassThruHi, HiMemVT, MMO,
- ExtType, MLD->isExpandingLoad());
+ Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi, HiMemVT,
+ MMO, MLD->getAddressingMode(), ExtType,
+ MLD->isExpandingLoad());
// Build a factor node to remember that this load is independent of the
// other one.
@@ -2326,8 +2331,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
unsigned OpNo) {
+ assert(N->isUnindexed() && "Indexed masked store of vector?");
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
+ SDValue Offset = N->getOffset();
+ assert(Offset.isUndef() && "Unexpected indexed masked store offset");
SDValue Mask = N->getMask();
SDValue Data = N->getValue();
EVT MemoryVT = N->getMemoryVT();
@@ -2361,8 +2369,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
Alignment, N->getAAInfo(), N->getRanges());
- Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
- N->isTruncatingStore(),
+ Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, Offset, MaskLo, LoMemVT, MMO,
+ N->getAddressingMode(), N->isTruncatingStore(),
N->isCompressingStore());
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
@@ -2374,8 +2382,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
HiMemVT.getStoreSize(), Alignment, N->getAAInfo(),
N->getRanges());
- Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
- N->isTruncatingStore(), N->isCompressingStore());
+ Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO,
+ N->getAddressingMode(), N->isTruncatingStore(),
+ N->isCompressingStore());
// Build a factor node to remember that this store is independent of the
// other one.
@@ -3699,10 +3708,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
WidenVT.getVectorNumElements());
Mask = ModifyToType(Mask, WideMaskVT, true);
- SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
- Mask, PassThru, N->getMemoryVT(),
- N->getMemOperand(), ExtType,
- N->isExpandingLoad());
+ SDValue Res = DAG.getMaskedLoad(
+ WidenVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
+ ExtType, N->isExpandingLoad());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -4447,7 +4456,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
StVal.getValueType().getVectorNumElements() &&
"Mask and data vectors should have the same number of elements");
return DAG.getMaskedStore(MST->getChain(), dl, StVal, MST->getBasePtr(),
- Mask, MST->getMemoryVT(), MST->getMemOperand(),
+ MST->getOffset(), Mask, MST->getMemoryVT(),
+ MST->getMemOperand(), MST->getAddressingMode(),
false, MST->isCompressingStore());
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f1b88d80f43..a20e43462f7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6975,16 +6975,22 @@ SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
}
SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
- SDValue Ptr, SDValue Mask, SDValue PassThru,
- EVT MemVT, MachineMemOperand *MMO,
+ SDValue Base, SDValue Offset, SDValue Mask,
+ SDValue PassThru, EVT MemVT,
+ MachineMemOperand *MMO,
+ ISD::MemIndexedMode AM,
ISD::LoadExtType ExtTy, bool isExpanding) {
- SDVTList VTs = getVTList(VT, MVT::Other);
- SDValue Ops[] = { Chain, Ptr, Mask, PassThru };
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) &&
+ "Unindexed masked load with an offset!");
+ SDVTList VTs = Indexed ? getVTList(VT, Base.getValueType(), MVT::Other)
+ : getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Chain, Base, Offset, Mask, PassThru};
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops);
ID.AddInteger(MemVT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<MaskedLoadSDNode>(
- dl.getIROrder(), VTs, ExtTy, isExpanding, MemVT, MMO));
+ dl.getIROrder(), VTs, AM, ExtTy, isExpanding, MemVT, MMO));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
@@ -6992,7 +6998,7 @@ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
return SDValue(E, 0);
}
auto *N = newSDNode<MaskedLoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
- ExtTy, isExpanding, MemVT, MMO);
+ AM, ExtTy, isExpanding, MemVT, MMO);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
@@ -7002,27 +7008,45 @@ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
return V;
}
+SDValue SelectionDAG::getIndexedMaskedLoad(SDValue OrigLoad, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ MaskedLoadSDNode *LD = cast<MaskedLoadSDNode>(OrigLoad);
+ assert(LD->getOffset().isUndef() && "Masked load is already an indexed load!");
+ return getMaskedLoad(OrigLoad.getValueType(), dl, LD->getChain(), Base,
+ Offset, LD->getMask(), LD->getPassThru(),
+ LD->getMemoryVT(), LD->getMemOperand(), AM,
+ LD->getExtensionType(), LD->isExpandingLoad());
+}
+
SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
- SDValue Val, SDValue Ptr, SDValue Mask,
- EVT MemVT, MachineMemOperand *MMO,
- bool IsTruncating, bool IsCompressing) {
+ SDValue Val, SDValue Base, SDValue Offset,
+ SDValue Mask, EVT MemVT,
+ MachineMemOperand *MMO,
+ ISD::MemIndexedMode AM, bool IsTruncating,
+ bool IsCompressing) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
- SDVTList VTs = getVTList(MVT::Other);
- SDValue Ops[] = { Chain, Val, Ptr, Mask };
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) &&
+ "Unindexed masked store with an offset!");
+ SDVTList VTs = Indexed ? getVTList(Base.getValueType(), MVT::Other)
+ : getVTList(MVT::Other);
+ SDValue Ops[] = {Chain, Val, Base, Offset, Mask};
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops);
ID.AddInteger(MemVT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<MaskedStoreSDNode>(
- dl.getIROrder(), VTs, IsTruncating, IsCompressing, MemVT, MMO));
+ dl.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MaskedStoreSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- auto *N = newSDNode<MaskedStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
- IsTruncating, IsCompressing, MemVT, MMO);
+ auto *N =
+ newSDNode<MaskedStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ IsTruncating, IsCompressing, MemVT, MMO);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
@@ -7032,6 +7056,17 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
return V;
}
+SDValue SelectionDAG::getIndexedMaskedStore(SDValue OrigStore, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ MaskedStoreSDNode *ST = cast<MaskedStoreSDNode>(OrigStore);
+ assert(ST->getOffset().isUndef() &&
+ "Masked store is already a indexed store!");
+ return getMaskedStore(ST->getChain(), dl, ST->getValue(), Base, Offset,
+ ST->getMask(), ST->getMemoryVT(), ST->getMemOperand(),
+ AM, ST->isTruncatingStore(), ST->isCompressingStore());
+}
+
SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops,
MachineMemOperand *MMO,
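
A caller-side sketch of the widened getMaskedLoad signature (this mirrors the SelectionDAGBuilder change further down; the helper itself is hypothetical and its arguments are assumed to be supplied by the caller). Unindexed nodes pass an undef offset plus ISD::UNINDEXED; indexed variants are produced later by the DAG combiner through getIndexedMaskedLoad/getIndexedMaskedStore:

    #include "llvm/CodeGen/SelectionDAG.h"
    using namespace llvm;

    static SDValue buildUnindexedMaskedLoad(SelectionDAG &DAG, const SDLoc &dl,
                                            SDValue Chain, SDValue Ptr,
                                            SDValue Mask, SDValue PassThru,
                                            EVT VT, MachineMemOperand *MMO) {
      // No offset operand is meaningful for an unindexed masked load.
      SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
      return DAG.getMaskedLoad(VT, dl, Chain, Ptr, Offset, Mask, PassThru, VT,
                               MMO, ISD::UNINDEXED, ISD::NON_EXTLOAD,
                               /*IsExpanding=*/false);
    }
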
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1ed0dc2c979..0aeb3c14aa3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4295,6 +4295,7 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
SDValue Ptr = getValue(PtrOperand);
SDValue Src0 = getValue(Src0Operand);
SDValue Mask = getValue(MaskOperand);
+ SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
EVT VT = Src0.getValueType();
if (!Alignment)
@@ -4311,9 +4312,9 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
// vectors.
VT.getStoreSize().getKnownMinSize(),
Alignment, AAInfo);
- SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
- MMO, false /* Truncating */,
- IsCompressing);
+ SDValue StoreNode =
+ DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO,
+ ISD::UNINDEXED, false /* Truncating */, IsCompressing);
DAG.setRoot(StoreNode);
setValue(&I, StoreNode);
}
@@ -4461,6 +4462,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
SDValue Ptr = getValue(PtrOperand);
SDValue Src0 = getValue(Src0Operand);
SDValue Mask = getValue(MaskOperand);
+ SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
EVT VT = Src0.getValueType();
if (!Alignment)
@@ -4491,8 +4493,9 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
VT.getStoreSize().getKnownMinSize(),
Alignment, AAInfo, Ranges);
- SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
- ISD::NON_EXTLOAD, IsExpanding);
+ SDValue Load =
+ DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO,
+ ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding);
if (AddToChain)
PendingLoads.push_back(Load.getValue(1));
setValue(&I, Load);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index bc10f762123..f863d987648 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -685,6 +685,10 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
if (doExt)
OS << " from " << MLd->getMemoryVT().getEVTString();
+ const char *AM = getIndexedModeName(MLd->getAddressingMode());
+ if (*AM)
+ OS << ", " << AM;
+
if (MLd->isExpandingLoad())
OS << ", expanding";
@@ -696,6 +700,10 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
if (MSt->isTruncatingStore())
OS << ", trunc to " << MSt->getMemoryVT().getEVTString();
+ const char *AM = getIndexedModeName(MSt->getAddressingMode());
+ if (*AM)
+ OS << ", " << AM;
+
if (MSt->isCompressingStore())
OS << ", compressing";
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index af7dc432eae..cc436fcc4f6 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -633,6 +633,8 @@ void TargetLoweringBase::initActions() {
IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) {
setIndexedLoadAction(IM, VT, Expand);
setIndexedStoreAction(IM, VT, Expand);
+ setIndexedMaskedLoadAction(IM, VT, Expand);
+ setIndexedMaskedStoreAction(IM, VT, Expand);
}
// Most backends expect to see the node which just returns the value loaded.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 80cf31ff3d5..ec84c1efbaf 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -262,15 +262,17 @@ def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4,
// non-extending masked load fragment.
def nonext_masked_load :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
- (masked_ld node:$ptr, node:$pred, node:$def), [{
- return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+ (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed();
}]>;
// sign extending masked load fragments.
def asext_masked_load :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
- (masked_ld node:$ptr, node:$pred, node:$def),[{
- return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD ||
- cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
+ (masked_ld node:$ptr, undef, node:$pred, node:$def),[{
+ return (cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD ||
+ cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD) &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed();
}]>;
def asext_masked_load_i8 :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
@@ -290,8 +292,9 @@ def asext_masked_load_i32 :
// zero extending masked load fragments.
def zext_masked_load :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
- (masked_ld node:$ptr, node:$pred, node:$def), [{
- return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD;
+ (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed();
}]>;
def zext_masked_load_i8 :
PatFrag<(ops node:$ptr, node:$pred, node:$def),
@@ -312,14 +315,16 @@ def zext_masked_load_i32 :
// non-truncating masked store fragment.
def nontrunc_masked_store :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
- (masked_st node:$val, node:$ptr, node:$pred), [{
- return !cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+ (masked_st node:$val, node:$ptr, undef, node:$pred), [{
+ return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
}]>;
// truncating masked store fragments.
def trunc_masked_store :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
- (masked_st node:$val, node:$ptr, node:$pred), [{
- return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+ (masked_st node:$val, node:$ptr, undef, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
}]>;
def trunc_masked_store_i8 :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 46a2560e167..a6b334938e1 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1351,11 +1351,27 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N,
SDValue &OffImm,
unsigned Shift) {
unsigned Opcode = Op->getOpcode();
- ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
- ? cast<LoadSDNode>(Op)->getAddressingMode()
- : cast<StoreSDNode>(Op)->getAddressingMode();
+ ISD::MemIndexedMode AM;
+ switch (Opcode) {
+ case ISD::LOAD:
+ AM = cast<LoadSDNode>(Op)->getAddressingMode();
+ break;
+ case ISD::STORE:
+ AM = cast<StoreSDNode>(Op)->getAddressingMode();
+ break;
+ case ISD::MLOAD:
+ AM = cast<MaskedLoadSDNode>(Op)->getAddressingMode();
+ break;
+ case ISD::MSTORE:
+ AM = cast<MaskedStoreSDNode>(Op)->getAddressingMode();
+ break;
+ default:
+ llvm_unreachable("Unexpected Opcode for Imm7Offset");
+ }
+
int RHSC;
- if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { // 7 bits.
+ // 7 bit constant, shifted by Shift.
+ if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) {
OffImm =
((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
? CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32)
@@ -1625,58 +1641,93 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
}
bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
- LoadSDNode *LD = cast<LoadSDNode>(N);
- ISD::MemIndexedMode AM = LD->getAddressingMode();
- if (AM == ISD::UNINDEXED)
- return false;
- EVT LoadedVT = LD->getMemoryVT();
- if (!LoadedVT.isVector())
- return false;
- bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
- SDValue Offset;
- bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+ EVT LoadedVT;
unsigned Opcode = 0;
- unsigned Align = LD->getAlignment();
- bool IsLE = Subtarget->isLittle();
+ bool isSExtLd, isPre;
+ unsigned Align;
+ ARMVCC::VPTCodes Pred;
+ SDValue PredReg;
+ SDValue Chain, Base, Offset;
+
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM == ISD::UNINDEXED)
+ return false;
+ LoadedVT = LD->getMemoryVT();
+ if (!LoadedVT.isVector())
+ return false;
+
+ Chain = LD->getChain();
+ Base = LD->getBasePtr();
+ Offset = LD->getOffset();
+ Align = LD->getAlignment();
+ isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
+ isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+ Pred = ARMVCC::None;
+ PredReg = CurDAG->getRegister(0, MVT::i32);
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ if (AM == ISD::UNINDEXED)
+ return false;
+ LoadedVT = LD->getMemoryVT();
+ if (!LoadedVT.isVector())
+ return false;
+ Chain = LD->getChain();
+ Base = LD->getBasePtr();
+ Offset = LD->getOffset();
+ Align = LD->getAlignment();
+ isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
+ isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+ Pred = ARMVCC::Then;
+ PredReg = LD->getMask();
+ } else
+ llvm_unreachable("Expected a Load or a Masked Load!");
+
+ // We allow LE non-masked loads to change the type (for example use a vldrb.8
+ // as opposed to a vldrw.32). This can allow extra addressing modes or
+ // alignments for what is otherwise an equivalent instruction.
+ bool CanChangeType = Subtarget->isLittle() && !isa<MaskedLoadSDNode>(N);
+
+ SDValue NewOffset;
if (Align >= 2 && LoadedVT == MVT::v4i16 &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) {
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) {
if (isSExtLd)
Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post;
else
Opcode = isPre ? ARM::MVE_VLDRHU32_pre : ARM::MVE_VLDRHU32_post;
} else if (LoadedVT == MVT::v8i8 &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) {
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) {
if (isSExtLd)
Opcode = isPre ? ARM::MVE_VLDRBS16_pre : ARM::MVE_VLDRBS16_post;
else
Opcode = isPre ? ARM::MVE_VLDRBU16_pre : ARM::MVE_VLDRBU16_post;
} else if (LoadedVT == MVT::v4i8 &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) {
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) {
if (isSExtLd)
Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post;
else
Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post;
} else if (Align >= 4 &&
- (IsLE || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 2))
+ (CanChangeType || LoadedVT == MVT::v4i32 ||
+ LoadedVT == MVT::v4f32) &&
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 2))
Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post;
else if (Align >= 2 &&
- (IsLE || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1))
+ (CanChangeType || LoadedVT == MVT::v8i16 ||
+ LoadedVT == MVT::v8f16) &&
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1))
Opcode = isPre ? ARM::MVE_VLDRHU16_pre : ARM::MVE_VLDRHU16_post;
- else if ((IsLE || LoadedVT == MVT::v16i8) &&
- SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0))
+ else if ((CanChangeType || LoadedVT == MVT::v16i8) &&
+ SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0))
Opcode = isPre ? ARM::MVE_VLDRBU8_pre : ARM::MVE_VLDRBU8_post;
else
return false;
- SDValue Chain = LD->getChain();
- SDValue Base = LD->getBasePtr();
- SDValue Ops[] = {Base, Offset,
- CurDAG->getTargetConstant(ARMVCC::None, SDLoc(N), MVT::i32),
- CurDAG->getRegister(0, MVT::i32), Chain};
- SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), LD->getValueType(0),
+ SDValue Ops[] = {Base, NewOffset,
+ CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32), PredReg,
+ Chain};
+ SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), N->getValueType(0),
MVT::i32, MVT::Other, Ops);
transferMemOperands(N, New);
ReplaceUses(SDValue(N, 0), SDValue(New, 1));
@@ -3292,6 +3343,11 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
// Other cases are autogenerated.
break;
}
+ case ISD::MLOAD:
+ if (Subtarget->hasMVEIntegerOps() && tryMVEIndexedLoad(N))
+ return;
+ // Other cases are autogenerated.
+ break;
case ARMISD::WLS:
case ARMISD::LE: {
SDValue Ops[] = { N->getOperand(1),
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index e359756b7bf..c153e786e2d 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -296,6 +296,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, VT, Legal);
setIndexedStoreAction(im, VT, Legal);
+ setIndexedMaskedLoadAction(im, VT, Legal);
+ setIndexedMaskedStoreAction(im, VT, Legal);
}
}
@@ -322,6 +324,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, VT, Legal);
setIndexedStoreAction(im, VT, Legal);
+ setIndexedMaskedLoadAction(im, VT, Legal);
+ setIndexedMaskedStoreAction(im, VT, Legal);
}
if (HasMVEFP) {
@@ -374,12 +378,12 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
// Pre and Post inc on these are legal, given the correct extends
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
- setIndexedLoadAction(im, MVT::v8i8, Legal);
- setIndexedStoreAction(im, MVT::v8i8, Legal);
- setIndexedLoadAction(im, MVT::v4i8, Legal);
- setIndexedStoreAction(im, MVT::v4i8, Legal);
- setIndexedLoadAction(im, MVT::v4i16, Legal);
- setIndexedStoreAction(im, MVT::v4i16, Legal);
+ for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
+ setIndexedLoadAction(im, VT, Legal);
+ setIndexedStoreAction(im, VT, Legal);
+ setIndexedMaskedLoadAction(im, VT, Legal);
+ setIndexedMaskedStoreAction(im, VT, Legal);
+ }
}
// Predicate types
@@ -9013,8 +9017,9 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
DAG.getTargetConstant(0, dl, MVT::i32));
SDValue NewLoad = DAG.getMaskedLoad(
- VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
- N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
+ VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
+ N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
+ N->getExtensionType(), N->isExpandingLoad());
SDValue Combo = NewLoad;
if (!PassThru.isUndef() &&
(PassThru.getOpcode() != ISD::BITCAST ||
@@ -15192,14 +15197,19 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
}
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
- bool isSEXTLoad, bool isLE, SDValue &Base,
- SDValue &Offset, bool &isInc,
- SelectionDAG &DAG) {
+ bool isSEXTLoad, bool IsMasked, bool isLE,
+ SDValue &Base, SDValue &Offset,
+ bool &isInc, SelectionDAG &DAG) {
if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
return false;
if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
return false;
+ // We allow LE non-masked loads to change the type (for example use a vldrb.8
+ // as opposed to a vldrw.32). This can allow extra addressing modes or
+ // alignments for what is otherwise an equivalent instruction.
+ bool CanChangeType = isLE && !IsMasked;
+
ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
int RHSC = (int)RHS->getZExtValue();
@@ -15218,7 +15228,7 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
};
// Try to find a matching instruction based on s/zext, Alignment, Offset and
- // (in BE) type.
+ // (in BE/masked) type.
Base = Ptr->getOperand(0);
if (VT == MVT::v4i16) {
if (Align >= 2 && IsInRange(RHSC, 0x80, 2))
@@ -15226,13 +15236,15 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
} else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
if (IsInRange(RHSC, 0x80, 1))
return true;
- } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) &&
+ } else if (Align >= 4 &&
+ (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
IsInRange(RHSC, 0x80, 4))
return true;
- else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) &&
+ else if (Align >= 2 &&
+ (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
IsInRange(RHSC, 0x80, 2))
return true;
- else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
+ else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
return true;
return false;
}
@@ -15252,6 +15264,7 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue Ptr;
unsigned Align;
bool isSEXTLoad = false;
+ bool IsMasked = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
Ptr = LD->getBasePtr();
VT = LD->getMemoryVT();
@@ -15261,6 +15274,17 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
Ptr = ST->getBasePtr();
VT = ST->getMemoryVT();
Align = ST->getAlignment();
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
+ Ptr = LD->getBasePtr();
+ VT = LD->getMemoryVT();
+ Align = LD->getAlignment();
+ isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+ IsMasked = true;
+ } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
+ Ptr = ST->getBasePtr();
+ VT = ST->getMemoryVT();
+ Align = ST->getAlignment();
+ IsMasked = true;
} else
return false;
@@ -15269,8 +15293,8 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
if (VT.isVector())
isLegal = Subtarget->hasMVEIntegerOps() &&
getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
- Subtarget->isLittle(), Base, Offset,
- isInc, DAG);
+ IsMasked, Subtarget->isLittle(), Base,
+ Offset, isInc, DAG);
else {
if (Subtarget->isThumb2())
isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
@@ -15298,6 +15322,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDValue Ptr;
unsigned Align;
bool isSEXTLoad = false, isNonExt;
+ bool IsMasked = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
@@ -15309,6 +15334,19 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
Ptr = ST->getBasePtr();
Align = ST->getAlignment();
isNonExt = !ST->isTruncatingStore();
+ } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
+ VT = LD->getMemoryVT();
+ Ptr = LD->getBasePtr();
+ Align = LD->getAlignment();
+ isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+ isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
+ IsMasked = true;
+ } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
+ VT = ST->getMemoryVT();
+ Ptr = ST->getBasePtr();
+ Align = ST->getAlignment();
+ isNonExt = !ST->isTruncatingStore();
+ IsMasked = true;
} else
return false;
@@ -15332,7 +15370,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
bool isLegal = false;
if (VT.isVector())
isLegal = Subtarget->hasMVEIntegerOps() &&
- getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad,
+ getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked,
Subtarget->isLittle(), Base, Offset,
isInc, DAG);
else {
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 429d0a1cf1b..dd8c032dae4 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -5332,6 +5332,10 @@ class MVE_vector_offset_store_typed<ValueType Ty, Instruction Opcode,
PatFrag StoreKind, int shift>
: Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr),
(Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr)>;
+class MVE_vector_offset_maskedstore_typed<ValueType Ty, Instruction Opcode,
+ PatFrag StoreKind, int shift>
+ : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<shift>:$addr, VCCR:$pred),
+ (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<shift>:$addr, (i32 1), VCCR:$pred)>;
multiclass MVE_vector_offset_store<Instruction RegImmInst, PatFrag StoreKind,
int shift> {
@@ -5363,7 +5367,7 @@ def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
return Ld->getMemoryVT().getScalarType() == MVT::i8;
}]>;
@@ -5382,7 +5386,7 @@ def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
}]>;
def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
EVT ScalarVT = Ld->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2;
@@ -5402,14 +5406,14 @@ def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD;
}]>;
def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
- (masked_ld node:$ptr, node:$pred, node:$passthru), [{
+ (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{
auto *Ld = cast<MaskedLoadSDNode>(N);
EVT ScalarVT = Ld->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4;
}]>;
def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
- (masked_st node:$val, node:$ptr, node:$pred), [{
+ (masked_st node:$val, node:$ptr, undef, node:$pred), [{
return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
@@ -5417,7 +5421,7 @@ def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
}]>;
def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
- (masked_st node:$val, node:$ptr, node:$pred), [{
+ (masked_st node:$val, node:$ptr, undef, node:$pred), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
@@ -5428,12 +5432,41 @@ def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
}]>;
def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
- (masked_st node:$val, node:$ptr, node:$pred), [{
+ (masked_st node:$val, node:$ptr, undef, node:$pred), [{
auto *St = cast<MaskedStoreSDNode>(N);
EVT ScalarVT = St->getMemoryVT().getScalarType();
return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4;
}]>;
+
+def pre_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask),
+ (masked_st node:$val, node:$base, node:$offset, node:$mask), [{
+ ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
+ return AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
+}]>;
+def post_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask),
+ (masked_st node:$val, node:$base, node:$offset, node:$mask), [{
+ ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
+ return AM == ISD::POST_INC || AM == ISD::POST_DEC;
+}]>;
+def aligned32_pre_maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
+ (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
+ return cast<MaskedStoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned32_post_maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
+ (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
+ return cast<MaskedStoreSDNode>(N)->getAlignment() >= 4;
+}]>;
+def aligned16_pre_maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
+ (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
+ return cast<MaskedStoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+def aligned16_post_maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask),
+ (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{
+ return cast<MaskedStoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+
+
let Predicates = [HasMVEInt, IsLE] in {
// Stores
defm : MVE_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
@@ -5515,19 +5548,26 @@ let Predicates = [HasMVEInt] in {
def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, maskedstore16, 1>;
def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, maskedstore32, 2>;
def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, maskedstore32, 2>;
- // Truncating stores
- def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
- (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
- def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
- (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
- def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred),
- (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>;
+
+ // Pre/Post inc masked stores
+ def : MVE_vector_offset_maskedstore_typed<v16i8, MVE_VSTRBU8_pre, pre_maskedstore, 0>;
+ def : MVE_vector_offset_maskedstore_typed<v16i8, MVE_VSTRBU8_post, post_maskedstore, 0>;
+ def : MVE_vector_offset_maskedstore_typed<v8i16, MVE_VSTRHU16_pre, aligned16_pre_maskedstore, 1>;
+ def : MVE_vector_offset_maskedstore_typed<v8i16, MVE_VSTRHU16_post, aligned16_post_maskedstore, 1>;
+ def : MVE_vector_offset_maskedstore_typed<v8f16, MVE_VSTRHU16_pre, aligned16_pre_maskedstore, 1>;
+ def : MVE_vector_offset_maskedstore_typed<v8f16, MVE_VSTRHU16_post, aligned16_post_maskedstore, 1>;
+ def : MVE_vector_offset_maskedstore_typed<v4i32, MVE_VSTRWU32_pre, aligned32_pre_maskedstore, 2>;
+ def : MVE_vector_offset_maskedstore_typed<v4i32, MVE_VSTRWU32_post, aligned32_post_maskedstore, 2>;
+ def : MVE_vector_offset_maskedstore_typed<v4f32, MVE_VSTRWU32_pre, aligned32_pre_maskedstore, 2>;
+ def : MVE_vector_offset_maskedstore_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_maskedstore, 2>;
+
// Aligned masked loads
def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload8, 0>;
def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>;
def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>;
def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>;
def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>;
+
// Extending masked loads.
def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred,
(v8i16 NEONimmAllZerosV))),
@@ -5569,6 +5609,37 @@ let MinAlignment = 2 in {
(pre_truncstvi16 node:$val, node:$base, node:$offset)>;
}
+def pre_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred),
+ (masked_st node:$val, node:$base, node:$offset, node:$pred), [{
+ ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
+ return AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
+}]>;
+def pre_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred),
+ (pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def pre_truncmaskedstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred),
+ (pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
+}]>;
+def post_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd),
+ (masked_st node:$val, node:$base, node:$offset, node:$postd), [{
+ ISD::MemIndexedMode AM = cast<MaskedStoreSDNode>(N)->getAddressingMode();
+ return AM == ISD::POST_INC || AM == ISD::POST_DEC;
+}]>;
+def post_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd),
+ (post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def post_truncmaskedstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd),
+ (post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{
+ auto *St = cast<MaskedStoreSDNode>(N);
+ EVT ScalarVT = St->getMemoryVT().getScalarType();
+ return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2;
+}]>;
+
let Predicates = [HasMVEInt] in {
def : Pat<(truncstorevi8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr),
(MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr)>;
@@ -5590,6 +5661,27 @@ let Predicates = [HasMVEInt] in {
(MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>;
def : Pat<(pre_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr),
(MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>;
+
+ def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr, VCCR:$pred),
+ (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+ def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), taddrmode_imm7<0>:$addr, VCCR:$pred),
+ (MVE_VSTRB32 MQPR:$val, taddrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
+ def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), taddrmode_imm7<1>:$addr, VCCR:$pred),
+ (MVE_VSTRH32 MQPR:$val, taddrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>;
+
+ def : Pat<(post_truncmaskedstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr, VCCR:$pred),
+ (MVE_VSTRB16_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr, (i32 1), VCCR:$pred)>;
+ def : Pat<(post_truncmaskedstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr, VCCR:$pred),
+ (MVE_VSTRB32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr, (i32 1), VCCR:$pred)>;
+ def : Pat<(post_truncmaskedstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr, VCCR:$pred),
+ (MVE_VSTRH32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr, (i32 1), VCCR:$pred)>;
+
+ def : Pat<(pre_truncmaskedstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr, VCCR:$pred),
+ (MVE_VSTRB16_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr, (i32 1), VCCR:$pred)>;
+ def : Pat<(pre_truncmaskedstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr, VCCR:$pred),
+ (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr, (i32 1), VCCR:$pred)>;
+ def : Pat<(pre_truncmaskedstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr, VCCR:$pred),
+ (MVE_VSTRH32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr, (i32 1), VCCR:$pred)>;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c3861adf091..32072df268d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24280,9 +24280,11 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ SDValue Offset = DAG.getUNDEF(VMask.getValueType());
- return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
- MemIntr->getMemOperand(), true /* truncating */);
+ return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
+ MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
+ true /* truncating */);
}
case X86ISD::VTRUNCUS:
case X86ISD::VTRUNCS: {
@@ -27593,12 +27595,11 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
return Op;
- SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
- N->getBasePtr(), Mask,
- getZeroVector(VT, Subtarget, DAG, dl),
- N->getMemoryVT(), N->getMemOperand(),
- N->getExtensionType(),
- N->isExpandingLoad());
+ SDValue NewLoad = DAG.getMaskedLoad(
+ VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
+ N->isExpandingLoad());
// Emit a blend.
SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
PassThru);
@@ -27632,11 +27633,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
- N->getBasePtr(), Mask, PassThru,
- N->getMemoryVT(), N->getMemOperand(),
- N->getExtensionType(),
- N->isExpandingLoad());
+ SDValue NewLoad = DAG.getMaskedLoad(
+ WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
+ N->getExtensionType(), N->isExpandingLoad());
SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
NewLoad.getValue(0),
@@ -27682,7 +27682,8 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
- Mask, N->getMemoryVT(), N->getMemOperand(),
+ N->getOffset(), Mask, N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(),
N->isTruncatingStore(), N->isCompressingStore());
}
@@ -40453,6 +40454,7 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ assert(ML->isUnindexed() && "Unexpected indexed masked load!");
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
@@ -40480,6 +40482,7 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ assert(ML->isUnindexed() && "Unexpected indexed masked load!");
if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
return SDValue();
@@ -40515,10 +40518,10 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
- SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
- ML->getMask(), DAG.getUNDEF(VT),
- ML->getMemoryVT(), ML->getMemOperand(),
- ML->getExtensionType());
+ SDValue NewML = DAG.getMaskedLoad(
+ VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
+ DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
+ ML->getAddressingMode(), ML->getExtensionType());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
ML->getPassThru());
@@ -40604,8 +40607,9 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
Mst->getMemoryVT())) {
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
- Mst->getBasePtr(), Mask,
- Mst->getMemoryVT(), Mst->getMemOperand(), true);
+ Mst->getBasePtr(), Mst->getOffset(), Mask,
+ Mst->getMemoryVT(), Mst->getMemOperand(),
+ Mst->getAddressingMode(), true);
}
return SDValue();
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index de6f8a81dff..1a4f7e1e6bb 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -706,6 +706,10 @@ def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>;
def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>;
def X86GF2P8mulb : SDNode<"X86ISD::GF2P8MULB", SDTIntBinOp>;
+def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store
+ SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>
+]>;
+
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
//===----------------------------------------------------------------------===//
@@ -1040,9 +1044,10 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
INSERT_get_vinsert256_imm>;
def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_ld node:$src1, node:$src2, node:$src3), [{
+ (masked_ld node:$src1, undef, node:$src2, node:$src3), [{
return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
- cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+ cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed();
}]>;
def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -1055,17 +1060,19 @@ def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
}]>;
def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_ld node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedLoadSDNode>(N)->isExpandingLoad();
+ (masked_ld node:$src1, undef, node:$src2, node:$src3), [{
+ return cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed();
}]>;
// Masked store fragments.
// X86mstore can't be implemented in core DAG files because some targets
// do not support vector types (llvm-tblgen will fail).
def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_st node:$src1, node:$src2, node:$src3), [{
- return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
- (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
+ (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+ return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
}]>;
def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -1078,16 +1085,18 @@ def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
}]>;
def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_st node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedStoreSDNode>(N)->isCompressingStore();
+ (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->isCompressingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
}]>;
// Masked truncstore fragments
// X86mtruncstore can't be implemented in core DAG files because some targets
// do not support vector types (llvm-tblgen will fail)
def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_st node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+ (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
}]>;
def masked_truncstorevi8 :
PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -1111,10 +1120,10 @@ def X86TruncSStore : SDNode<"X86ISD::VTRUNCSTORES", SDTStore,
def X86TruncUSStore : SDNode<"X86ISD::VTRUNCSTOREUS", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTMaskedStore,
+def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTX86MaskedStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTMaskedStore,
+def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTX86MaskedStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr),
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
index 5900dd9ac66..0b50b9a1db4 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -154,11 +154,11 @@ for.cond.cleanup: ; preds = %middle.block, %entr
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK: sub{{.*}} [[ELEMS]],{{.*}}#4
; CHECK: vpsttt
; CHECK-NEXT: vcmpt.i32 eq, {{.*}}, zr
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
-; CHECK: sub{{.*}} [[ELEMS]],{{.*}}#4
; CHECK: le lr, [[LOOP]]
; CHECK: vctp.32 [[ELEMS_OUT]]
; CHECK: vpsel
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index f285b445cf3..f7c9236c6e6 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -39,14 +39,11 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB0_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: vmul.f32 q0, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [r0]
-; CHECK-NEXT: adds r1, #16
-; CHECK-NEXT: adds r2, #16
-; CHECK-NEXT: adds r0, #16
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
+; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: vmul.f32 q0, q1, q0
+; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB0_5
; CHECK-NEXT: b .LBB0_11
; CHECK-NEXT: .LBB0_6: @ %for.body.preheader.new
@@ -236,13 +233,11 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r2
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vldrwt.u32 q2, [r0]
-; CHECK-NEXT: vldrwt.u32 q3, [r1]
; CHECK-NEXT: mov r3, r2
-; CHECK-NEXT: adds r0, #16
-; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
+; CHECK-NEXT: vldrwt.u32 q3, [r1], #16
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vfma.f32 q0, q3, q2
; CHECK-NEXT: le lr, .LBB1_2
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index 21be95e1fcc..23c44728429 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -88,10 +88,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.s32 q2, [r1]
; CHECK-NEXT: mov r3, r2
-; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vldrh.s32 q2, [r1], #8
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: letp lr, .LBB1_1
@@ -229,10 +228,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u32 q2, [r1]
; CHECK-NEXT: mov r3, r2
-; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vldrh.u32 q2, [r1], #8
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: letp lr, .LBB3_1
@@ -295,10 +293,9 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q2, [r1]
; CHECK-NEXT: mov r3, r2
-; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: letp lr, .LBB4_1
@@ -390,11 +387,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
; CHECK-NEXT: adds r5, r1, r4
; CHECK-NEXT: vldrb.u32 q1, [r5]
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: vadd.i32 q0, q0, r2
-; CHECK-NEXT: vstrw.32 q0, [r3]
-; CHECK-NEXT: adds r3, #16
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: vadd.i32 q0, q0, r2
+; CHECK-NEXT: vstrw.32 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB5_5
; CHECK-NEXT: b .LBB5_12
; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new
@@ -594,15 +590,12 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.s32 q0, [r0]
-; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0], #8
+; CHECK-NEXT: vldrh.s32 q1, [r1], #8
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: adds r0, #8
-; CHECK-NEXT: vadd.i32 q0, q0, r2
-; CHECK-NEXT: vstrw.32 q0, [r3]
-; CHECK-NEXT: adds r1, #8
-; CHECK-NEXT: adds r3, #16
; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: vadd.i32 q0, q0, r2
+; CHECK-NEXT: vstrw.32 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB6_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
@@ -691,11 +684,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
; CHECK-NEXT: adds r5, r1, r4
; CHECK-NEXT: vldrb.u32 q1, [r5]
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: vadd.i32 q0, q0, r2
-; CHECK-NEXT: vstrw.32 q0, [r3]
-; CHECK-NEXT: adds r3, #16
; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: vadd.i32 q0, q0, r2
+; CHECK-NEXT: vstrw.32 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB7_5
; CHECK-NEXT: b .LBB7_12
; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new
@@ -895,15 +887,12 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u32 q0, [r0]
-; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0], #8
+; CHECK-NEXT: vldrh.u32 q1, [r1], #8
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: adds r0, #8
-; CHECK-NEXT: vadd.i32 q0, q0, r2
-; CHECK-NEXT: vstrw.32 q0, [r3]
-; CHECK-NEXT: adds r1, #8
-; CHECK-NEXT: adds r3, #16
; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: vadd.i32 q0, q0, r2
+; CHECK-NEXT: vstrw.32 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
@@ -988,15 +977,12 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB9_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vmul.i32 q0, q1, q0
-; CHECK-NEXT: adds r0, #16
-; CHECK-NEXT: vadd.i32 q0, q0, r2
-; CHECK-NEXT: vstrw.32 q0, [r3]
-; CHECK-NEXT: adds r1, #16
-; CHECK-NEXT: adds r3, #16
; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: vadd.i32 q0, q0, r2
+; CHECK-NEXT: vstrw.32 q0, [r3], #16
; CHECK-NEXT: letp lr, .LBB9_5
; CHECK-NEXT: b .LBB9_11
; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new
@@ -1189,12 +1175,11 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture
; CHECK-NEXT: add.w r4, r1, r12
; CHECK-NEXT: vldrb.u16 q0, [r4]
; CHECK-NEXT: add.w r4, r2, r12
-; CHECK-NEXT: vldrb.u16 q1, [r4]
-; CHECK-NEXT: vmul.i16 q0, q1, q0
-; CHECK-NEXT: vstrh.16 q0, [r0]
-; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: add.w r12, r12, #8
; CHECK-NEXT: subs r3, #8
+; CHECK-NEXT: vldrb.u16 q1, [r4]
+; CHECK-NEXT: vmul.i16 q0, q1, q0
+; CHECK-NEXT: vstrh.16 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB10_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, pc}
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
index fdf04db8220..04f408d78ac 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -9,23 +9,21 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no
; CHECK-NEXT: moveq r0, #0
; CHECK-NEXT: bxeq lr
; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: mov r3, r2
-; CHECK-NEXT: vmul.i32 q0, q2, q0
-; CHECK-NEXT: adds r0, #16
-; CHECK-NEXT: adds r1, #16
+; CHECK-NEXT: vmul.i32 q1, q2, q1
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: letp lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
; CHECK-NEXT: vctp.32 r3
-; CHECK-NEXT: vpsel q0, q0, q1
+; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
entry:
@@ -83,8 +81,7 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov r1, r2
; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: adds r0, #16
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: letp lr, .LBB1_1
@@ -144,8 +141,7 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: mov r1, r2
; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: adds r0, #16
+; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.i32 q0, q0, q1
; CHECK-NEXT: letp lr, .LBB2_1
@@ -201,12 +197,10 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vmul.i32 q0, q0, r2
-; CHECK-NEXT: vstrw.32 q0, [r0]
-; CHECK-NEXT: adds r1, #16
-; CHECK-NEXT: adds r0, #16
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: vmul.i32 q0, q0, r2
+; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -255,12 +249,10 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias
; CHECK-NEXT: dlstp.32 lr, r3
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r2
-; CHECK-NEXT: vstrw.32 q0, [r0]
-; CHECK-NEXT: adds r1, #16
-; CHECK-NEXT: adds r0, #16
+; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: vadd.i32 q0, q0, r2
+; CHECK-NEXT: vstrw.32 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -369,14 +361,11 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt
; CHECK-NEXT: dlstp.16 lr, r3
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vldrh.u16 q1, [r2]
-; CHECK-NEXT: vmul.i16 q0, q1, q0
-; CHECK-NEXT: vstrh.16 q0, [r0]
-; CHECK-NEXT: adds r1, #16
-; CHECK-NEXT: adds r2, #16
-; CHECK-NEXT: adds r0, #16
+; CHECK-NEXT: vldrh.u16 q0, [r1], #16
+; CHECK-NEXT: vldrh.u16 q1, [r2], #16
; CHECK-NEXT: subs r3, #8
+; CHECK-NEXT: vmul.i16 q0, q1, q0
+; CHECK-NEXT: vstrh.16 q0, [r0], #16
; CHECK-NEXT: letp lr, .LBB6_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll
index 69286c8777c..0951589eaa1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll
@@ -7,8 +7,7 @@ define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrwt.u32 q0, [r0], #4
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -67,8 +66,7 @@ define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vldrwt.u32 q0, [r0], #508
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -107,8 +105,7 @@ define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vldrwt.u32 q0, [r0], #-508
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -147,8 +144,7 @@ define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.u32 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrht.u32 q0, [r0], #4
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -189,8 +185,7 @@ define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.u32 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrht.u32 q0, [r0], #2
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -210,8 +205,7 @@ define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.u32 q0, [r0]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vldrht.u32 q0, [r0], #254
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -252,8 +246,7 @@ define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.u32 q0, [r0]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vldrht.u32 q0, [r0], #-254
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -294,8 +287,7 @@ define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.s32 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrht.s32 q0, [r0], #4
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -336,8 +328,7 @@ define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.s32 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrht.s32 q0, [r0], #2
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -357,8 +348,7 @@ define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.s32 q0, [r0]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vldrht.s32 q0, [r0], #254
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -399,8 +389,7 @@ define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.s32 q0, [r0]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vldrht.s32 q0, [r0], #-254
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -441,8 +430,7 @@ define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrht.u16 q0, [r0], #4
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -481,8 +469,7 @@ define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrht.u16 q0, [r0], #2
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -501,8 +488,7 @@ define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vldrht.u16 q0, [r0], #254
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -541,8 +527,7 @@ define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vldrht.u16 q0, [r0], #-254
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -581,8 +566,7 @@ define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.u32 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrbt.u32 q0, [r0], #4
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -602,8 +586,7 @@ define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.u32 q0, [r0]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrbt.u32 q0, [r0], #3
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -623,8 +606,7 @@ define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.u32 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrbt.u32 q0, [r0], #2
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -644,8 +626,7 @@ define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.u32 q0, [r0]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrbt.u32 q0, [r0], #127
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -686,8 +667,7 @@ define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.u32 q0, [r0]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vldrbt.u32 q0, [r0], #-127
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -728,8 +708,7 @@ define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.s32 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrbt.s32 q0, [r0], #4
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -749,8 +728,7 @@ define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.s32 q0, [r0]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrbt.s32 q0, [r0], #3
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -770,8 +748,7 @@ define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.s32 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrbt.s32 q0, [r0], #2
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -791,8 +768,7 @@ define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.s32 q0, [r0]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrbt.s32 q0, [r0], #127
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -833,8 +809,7 @@ define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.s32 q0, [r0]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vldrbt.s32 q0, [r0], #-127
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -875,8 +850,7 @@ define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.u16 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrbt.u16 q0, [r0], #4
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -896,8 +870,7 @@ define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.u16 q0, [r0]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrbt.u16 q0, [r0], #3
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -917,8 +890,7 @@ define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.u16 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrbt.u16 q0, [r0], #2
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -938,8 +910,7 @@ define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.u16 q0, [r0]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrbt.u16 q0, [r0], #127
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -980,8 +951,7 @@ define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.u16 q0, [r0]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vldrbt.u16 q0, [r0], #-127
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1022,8 +992,7 @@ define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.s16 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrbt.s16 q0, [r0], #4
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1043,8 +1012,7 @@ define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.s16 q0, [r0]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrbt.s16 q0, [r0], #3
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1064,8 +1032,7 @@ define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.s16 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrbt.s16 q0, [r0], #2
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1085,8 +1052,7 @@ define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.s16 q0, [r0]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrbt.s16 q0, [r0], #127
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1127,8 +1093,7 @@ define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.s16 q0, [r0]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vldrbt.s16 q0, [r0], #-127
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1169,8 +1134,7 @@ define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
-; CHECK-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrbt.u8 q0, [r0], #4
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1189,8 +1153,7 @@ define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
-; CHECK-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrbt.u8 q0, [r0], #3
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1209,8 +1172,7 @@ define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
-; CHECK-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrbt.u8 q0, [r0], #2
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1229,8 +1191,7 @@ define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
-; CHECK-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrbt.u8 q0, [r0], #127
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1269,8 +1230,7 @@ define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
-; CHECK-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vldrbt.u8 q0, [r0], #-127
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1309,8 +1269,7 @@ define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrwt.u32 q0, [r0], #4
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1369,8 +1328,7 @@ define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vldrwt.u32 q0, [r0], #508
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1409,8 +1367,7 @@ define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vldrwt.u32 q0, [r0], #-508
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1449,8 +1406,7 @@ define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrht.u16 q0, [r0], #4
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1489,8 +1445,7 @@ define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrht.u16 q0, [r0], #2
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1509,8 +1464,7 @@ define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vldrht.u16 q0, [r0], #254
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1549,8 +1503,7 @@ define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vldrht.u16 q0, [r0], #-254
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1593,8 +1546,7 @@ define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrwt.32 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -1653,8 +1605,7 @@ define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0]
-; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vstrwt.32 q0, [r0], #508
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 508
@@ -1693,8 +1644,7 @@ define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0]
-; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vstrwt.32 q0, [r0], #-508
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -508
@@ -1733,8 +1683,7 @@ define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrht.32 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrht.32 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -1773,8 +1722,7 @@ define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrht.32 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrht.32 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -1793,8 +1741,7 @@ define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrht.32 q0, [r0]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrht.32 q0, [r0], #254
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 254
@@ -1833,8 +1780,7 @@ define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrht.32 q0, [r0]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrht.32 q0, [r0], #-254
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -254
@@ -1873,8 +1819,7 @@ define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrht.16 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -1913,8 +1858,7 @@ define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrht.16 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -1933,8 +1877,7 @@ define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrht.16 q0, [r0], #254
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 254
@@ -1973,8 +1916,7 @@ define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrht.16 q0, [r0], #-254
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -254
@@ -2013,8 +1955,7 @@ define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrbt.32 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrbt.32 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -2033,8 +1974,7 @@ define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrbt.32 q0, [r0]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrbt.32 q0, [r0], #3
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 3
@@ -2053,8 +1993,7 @@ define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrbt.32 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrbt.32 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -2073,8 +2012,7 @@ define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrbt.32 q0, [r0]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrbt.32 q0, [r0], #127
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 127
@@ -2113,8 +2051,7 @@ define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrbt.32 q0, [r0]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrbt.32 q0, [r0], #-127
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -127
@@ -2153,8 +2090,7 @@ define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrbt.16 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrbt.16 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -2173,8 +2109,7 @@ define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrbt.16 q0, [r0]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrbt.16 q0, [r0], #3
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 3
@@ -2193,8 +2128,7 @@ define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrbt.16 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrbt.16 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -2213,8 +2147,7 @@ define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrbt.16 q0, [r0]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrbt.16 q0, [r0], #127
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 127
@@ -2253,8 +2186,7 @@ define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrbt.16 q0, [r0]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrbt.16 q0, [r0], #-127
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -127
@@ -2293,8 +2225,7 @@ define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
-; CHECK-NEXT: vstrbt.8 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrbt.8 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -2313,8 +2244,7 @@ define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
-; CHECK-NEXT: vstrbt.8 q0, [r0]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrbt.8 q0, [r0], #3
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 3
@@ -2333,8 +2263,7 @@ define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
-; CHECK-NEXT: vstrbt.8 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrbt.8 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -2353,8 +2282,7 @@ define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
-; CHECK-NEXT: vstrbt.8 q0, [r0]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrbt.8 q0, [r0], #127
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 127
@@ -2393,8 +2321,7 @@ define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
-; CHECK-NEXT: vstrbt.8 q0, [r0]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrbt.8 q0, [r0], #-127
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -127
@@ -2433,8 +2360,7 @@ define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrwt.32 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -2493,8 +2419,7 @@ define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0]
-; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vstrwt.32 q0, [r0], #508
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 508
@@ -2533,8 +2458,7 @@ define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0]
-; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vstrwt.32 q0, [r0], #-508
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -508
@@ -2573,8 +2497,7 @@ define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrht.16 q0, [r0], #4
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -2613,8 +2536,7 @@ define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrht.16 q0, [r0], #2
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -2633,8 +2555,7 @@ define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrht.16 q0, [r0], #254
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 254
@@ -2673,8 +2594,7 @@ define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrht.16 q0, [r0], #-254
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -254
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll
index 287446963ce..beb5aae6341 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll
@@ -7,8 +7,7 @@ define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -67,8 +66,7 @@ define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]
-; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -107,8 +105,7 @@ define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]
-; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -147,8 +144,7 @@ define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.u32 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrht.u32 q0, [r0, #4]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -189,8 +185,7 @@ define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.u32 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrht.u32 q0, [r0, #2]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -210,8 +205,7 @@ define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.u32 q0, [r0, #254]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vldrht.u32 q0, [r0, #254]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -252,8 +246,7 @@ define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.u32 q0, [r0, #-254]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vldrht.u32 q0, [r0, #-254]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -294,8 +287,7 @@ define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.s32 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrht.s32 q0, [r0, #4]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -336,8 +328,7 @@ define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.s32 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrht.s32 q0, [r0, #2]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -357,8 +348,7 @@ define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.s32 q0, [r0, #254]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vldrht.s32 q0, [r0, #254]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -399,8 +389,7 @@ define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrht.s32 q0, [r0, #-254]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vldrht.s32 q0, [r0, #-254]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -441,8 +430,7 @@ define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrht.u16 q0, [r0, #4]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -481,8 +469,7 @@ define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrht.u16 q0, [r0, #2]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -501,8 +488,7 @@ define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0, #254]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vldrht.u16 q0, [r0, #254]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -541,8 +527,7 @@ define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -581,8 +566,7 @@ define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.u32 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrbt.u32 q0, [r0, #4]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -602,8 +586,7 @@ define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.u32 q0, [r0, #3]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrbt.u32 q0, [r0, #3]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -623,8 +606,7 @@ define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.u32 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrbt.u32 q0, [r0, #2]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -644,8 +626,7 @@ define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.u32 q0, [r0, #127]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrbt.u32 q0, [r0, #127]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -686,8 +667,7 @@ define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.u32 q0, [r0, #-127]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vldrbt.u32 q0, [r0, #-127]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -728,8 +708,7 @@ define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.s32 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrbt.s32 q0, [r0, #4]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -749,8 +728,7 @@ define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.s32 q0, [r0, #3]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrbt.s32 q0, [r0, #3]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -770,8 +748,7 @@ define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.s32 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrbt.s32 q0, [r0, #2]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -791,8 +768,7 @@ define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.s32 q0, [r0, #127]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrbt.s32 q0, [r0, #127]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -833,8 +809,7 @@ define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrbt.s32 q0, [r0, #-127]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vldrbt.s32 q0, [r0, #-127]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -875,8 +850,7 @@ define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.u16 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrbt.u16 q0, [r0, #4]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -896,8 +870,7 @@ define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.u16 q0, [r0, #3]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrbt.u16 q0, [r0, #3]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -917,8 +890,7 @@ define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.u16 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrbt.u16 q0, [r0, #2]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -938,8 +910,7 @@ define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.u16 q0, [r0, #127]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrbt.u16 q0, [r0, #127]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -980,8 +951,7 @@ define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.u16 q0, [r0, #-127]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vldrbt.u16 q0, [r0, #-127]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1022,8 +992,7 @@ define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.s16 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrbt.s16 q0, [r0, #4]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1043,8 +1012,7 @@ define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.s16 q0, [r0, #3]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrbt.s16 q0, [r0, #3]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1064,8 +1032,7 @@ define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.s16 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrbt.s16 q0, [r0, #2]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1085,8 +1052,7 @@ define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.s16 q0, [r0, #127]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrbt.s16 q0, [r0, #127]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1127,8 +1093,7 @@ define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrbt.s16 q0, [r0, #-127]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vldrbt.s16 q0, [r0, #-127]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1169,8 +1134,7 @@ define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
-; CHECK-NEXT: vldrbt.u8 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrbt.u8 q0, [r0, #4]!
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1189,8 +1153,7 @@ define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
-; CHECK-NEXT: vldrbt.u8 q0, [r0, #3]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrbt.u8 q0, [r0, #3]!
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1209,8 +1172,7 @@ define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
-; CHECK-NEXT: vldrbt.u8 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrbt.u8 q0, [r0, #2]!
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1229,8 +1191,7 @@ define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
-; CHECK-NEXT: vldrbt.u8 q0, [r0, #127]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrbt.u8 q0, [r0, #127]!
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1269,8 +1230,7 @@ define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u8 q0, [r2]
; CHECK-NEXT: vpt.i8 ne, q0, zr
-; CHECK-NEXT: vldrbt.u8 q0, [r0, #-127]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vldrbt.u8 q0, [r0, #-127]!
; CHECK-NEXT: vstrb.8 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1309,8 +1269,7 @@ define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1369,8 +1328,7 @@ define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]
-; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1409,8 +1367,7 @@ define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]
-; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]!
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1449,8 +1406,7 @@ define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vldrht.u16 q0, [r0, #4]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1489,8 +1445,7 @@ define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrht.u16 q0, [r0, #2]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1509,8 +1464,7 @@ define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0, #254]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vldrht.u16 q0, [r0, #254]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1549,8 +1503,7 @@ define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrh.u16 q0, [r2]
; CHECK-NEXT: vpt.i16 ne, q0, zr
-; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]!
; CHECK-NEXT: vstrh.16 q0, [r1]
; CHECK-NEXT: bx lr
entry:
@@ -1593,8 +1546,7 @@ define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrwt.32 q0, [r0, #4]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -1653,8 +1605,7 @@ define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0, #508]
-; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vstrwt.32 q0, [r0, #508]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 508
@@ -1693,8 +1644,7 @@ define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]
-; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -508
@@ -1733,8 +1683,7 @@ define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrht.32 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrht.32 q0, [r0, #4]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -1773,8 +1722,7 @@ define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrht.32 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrht.32 q0, [r0, #2]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -1793,8 +1741,7 @@ define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrht.32 q0, [r0, #254]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrht.32 q0, [r0, #254]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 254
@@ -1833,8 +1780,7 @@ define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrh.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrht.32 q0, [r0, #-254]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrht.32 q0, [r0, #-254]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -254
@@ -1873,8 +1819,7 @@ define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrht.16 q0, [r0, #4]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -1913,8 +1858,7 @@ define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrht.16 q0, [r0, #2]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -1933,8 +1877,7 @@ define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0, #254]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrht.16 q0, [r0, #254]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 254
@@ -1973,8 +1916,7 @@ define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0, #-254]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrht.16 q0, [r0, #-254]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -254
@@ -2013,8 +1955,7 @@ define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrbt.32 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrbt.32 q0, [r0, #4]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -2033,8 +1974,7 @@ define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrbt.32 q0, [r0, #3]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrbt.32 q0, [r0, #3]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 3
@@ -2053,8 +1993,7 @@ define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrbt.32 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrbt.32 q0, [r0, #2]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -2073,8 +2012,7 @@ define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrbt.32 q0, [r0, #127]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrbt.32 q0, [r0, #127]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 127
@@ -2113,8 +2051,7 @@ define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrb.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrbt.32 q0, [r0, #-127]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrbt.32 q0, [r0, #-127]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -127
@@ -2153,8 +2090,7 @@ define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrbt.16 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrbt.16 q0, [r0, #4]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -2173,8 +2109,7 @@ define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrbt.16 q0, [r0, #3]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrbt.16 q0, [r0, #3]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 3
@@ -2193,8 +2128,7 @@ define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrbt.16 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrbt.16 q0, [r0, #2]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -2213,8 +2147,7 @@ define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrbt.16 q0, [r0, #127]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrbt.16 q0, [r0, #127]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 127
@@ -2253,8 +2186,7 @@ define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrbt.16 q0, [r0, #-127]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrbt.16 q0, [r0, #-127]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -127
@@ -2293,8 +2225,7 @@ define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
-; CHECK-NEXT: vstrbt.8 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrbt.8 q0, [r0, #4]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -2313,8 +2244,7 @@ define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
-; CHECK-NEXT: vstrbt.8 q0, [r0, #3]
-; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrbt.8 q0, [r0, #3]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 3
@@ -2333,8 +2263,7 @@ define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
-; CHECK-NEXT: vstrbt.8 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrbt.8 q0, [r0, #2]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -2353,8 +2282,7 @@ define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
-; CHECK-NEXT: vstrbt.8 q0, [r0, #127]
-; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrbt.8 q0, [r0, #127]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 127
@@ -2393,8 +2321,7 @@ define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) {
; CHECK-NEXT: vldrb.u8 q0, [r1]
; CHECK-NEXT: vldrb.u8 q1, [r2]
; CHECK-NEXT: vpt.i8 ne, q1, zr
-; CHECK-NEXT: vstrbt.8 q0, [r0, #-127]
-; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrbt.8 q0, [r0, #-127]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -127
@@ -2433,8 +2360,7 @@ define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrwt.32 q0, [r0, #4]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -2493,8 +2419,7 @@ define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0, #508]
-; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vstrwt.32 q0, [r0, #508]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 508
@@ -2533,8 +2458,7 @@ define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: vpt.i32 ne, q1, zr
-; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]
-; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -508
@@ -2573,8 +2497,7 @@ define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0, #4]
-; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrht.16 q0, [r0, #4]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -2613,8 +2536,7 @@ define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0, #2]
-; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrht.16 q0, [r0, #2]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 2
@@ -2633,8 +2555,7 @@ define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0, #254]
-; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrht.16 q0, [r0, #254]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 254
@@ -2673,8 +2594,7 @@ define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
; CHECK-NEXT: vldrh.u16 q0, [r1]
; CHECK-NEXT: vldrh.u16 q1, [r2]
; CHECK-NEXT: vpt.i16 ne, q1, zr
-; CHECK-NEXT: vstrht.16 q0, [r0, #-254]
-; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrht.16 q0, [r0, #-254]!
; CHECK-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 -254
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
index e75e07604e8..54a94b8981c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
@@ -468,8 +468,7 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_preinc(i8* %x, i8* %y, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i32_preinc:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vldrwt.u32 q0, [r0, #4]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vldrwt.u32 q0, [r0, #4]!
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
@@ -477,8 +476,7 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_preinc(i8* %x, i8* %y, <4 x i32> %a) {
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
-; CHECK-BE-NEXT: vldrwt.u32 q0, [r0, #4]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vldrwt.u32 q0, [r0, #4]!
; CHECK-BE-NEXT: vstrw.32 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
@@ -495,8 +493,7 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_postinc(i8* %x, i8* %y, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4i32_postinc:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vldrwt.u32 q0, [r0], #4
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
@@ -504,8 +501,7 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_postinc(i8* %x, i8* %y, <4 x i32> %a) {
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
-; CHECK-BE-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vldrwt.u32 q0, [r0], #4
; CHECK-BE-NEXT: vstrw.32 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
@@ -1032,8 +1028,7 @@ define i8* @masked_v8i16_preinc(i8* %x, i8* %y, <8 x i16> %a) {
; CHECK-LE-NEXT: vldr d1, [sp]
; CHECK-LE-NEXT: vmov d0, r2, r3
; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4]!
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
@@ -1043,8 +1038,7 @@ define i8* @masked_v8i16_preinc(i8* %x, i8* %y, <8 x i16> %a) {
; CHECK-BE-NEXT: vmov d0, r3, r2
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
-; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4]!
; CHECK-BE-NEXT: vstrh.16 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
@@ -1061,8 +1055,7 @@ define arm_aapcs_vfpcc i8* @masked_v8i16_postinc(i8* %x, i8* %y, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8i16_postinc:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vldrht.u16 q0, [r0]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vldrht.u16 q0, [r0], #4
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
@@ -1070,8 +1063,7 @@ define arm_aapcs_vfpcc i8* @masked_v8i16_postinc(i8* %x, i8* %y, <8 x i16> %a) {
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
-; CHECK-BE-NEXT: vldrht.u16 q0, [r0]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vldrht.u16 q0, [r0], #4
; CHECK-BE-NEXT: vstrh.16 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
@@ -1151,8 +1143,7 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_preinc(i8* %x, i8* %y, <16 x i8> %a) {
; CHECK-LE-LABEL: masked_v16i8_preinc:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vpt.s8 gt, q0, zr
-; CHECK-LE-NEXT: vldrbt.u8 q0, [r0, #4]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vldrbt.u8 q0, [r0, #4]!
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
@@ -1160,8 +1151,7 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_preinc(i8* %x, i8* %y, <16 x i8> %a) {
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.8 q1, q0
; CHECK-BE-NEXT: vpt.s8 gt, q1, zr
-; CHECK-BE-NEXT: vldrbt.u8 q0, [r0, #4]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vldrbt.u8 q0, [r0, #4]!
; CHECK-BE-NEXT: vstrb.8 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
@@ -1178,8 +1168,7 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_postinc(i8* %x, i8* %y, <16 x i8> %a) {
; CHECK-LE-LABEL: masked_v16i8_postinc:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vpt.s8 gt, q0, zr
-; CHECK-LE-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vldrbt.u8 q0, [r0], #4
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
@@ -1187,8 +1176,7 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_postinc(i8* %x, i8* %y, <16 x i8> %a) {
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.8 q1, q0
; CHECK-BE-NEXT: vpt.s8 gt, q1, zr
-; CHECK-BE-NEXT: vldrbt.u8 q0, [r0]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vldrbt.u8 q0, [r0], #4
; CHECK-BE-NEXT: vstrb.8 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
@@ -1355,8 +1343,7 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_preinc(i8* %x, i8* %y, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4f32_preinc:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vldrwt.u32 q0, [r0, #4]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vldrwt.u32 q0, [r0, #4]!
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
@@ -1364,8 +1351,7 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_preinc(i8* %x, i8* %y, <4 x i32> %a) {
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
-; CHECK-BE-NEXT: vldrwt.u32 q0, [r0, #4]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vldrwt.u32 q0, [r0, #4]!
; CHECK-BE-NEXT: vstrw.32 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
@@ -1382,8 +1368,7 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_postinc(i8* %x, i8* %y, <4 x i32> %a) {
; CHECK-LE-LABEL: masked_v4f32_postinc:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vldrwt.u32 q0, [r0], #4
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
@@ -1391,8 +1376,7 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_postinc(i8* %x, i8* %y, <4 x i32> %a) {
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vpt.s32 gt, q1, zr
-; CHECK-BE-NEXT: vldrwt.u32 q0, [r0]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vldrwt.u32 q0, [r0], #4
; CHECK-BE-NEXT: vstrw.32 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
@@ -1724,8 +1708,7 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_preinc(i8* %x, i8* %y, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8f16_preinc:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4]!
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
@@ -1733,8 +1716,7 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_preinc(i8* %x, i8* %y, <8 x i16> %a) {
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
-; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4]!
; CHECK-BE-NEXT: vstrh.16 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
@@ -1751,8 +1733,7 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_postinc(i8* %x, i8* %y, <8 x i16> %a) {
; CHECK-LE-LABEL: masked_v8f16_postinc:
; CHECK-LE: @ %bb.0: @ %entry
; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vldrht.u16 q0, [r0]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vldrht.u16 q0, [r0], #4
; CHECK-LE-NEXT: vstrw.32 q0, [r1]
; CHECK-LE-NEXT: bx lr
;
@@ -1760,8 +1741,7 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_postinc(i8* %x, i8* %y, <8 x i16> %a) {
; CHECK-BE: @ %bb.0: @ %entry
; CHECK-BE-NEXT: vrev64.16 q1, q0
; CHECK-BE-NEXT: vpt.s16 gt, q1, zr
-; CHECK-BE-NEXT: vldrht.u16 q0, [r0]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vldrht.u16 q0, [r0], #4
; CHECK-BE-NEXT: vstrh.16 q0, [r1]
; CHECK-BE-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
index 1fc9793fd50..425162721ac 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll
@@ -111,8 +111,7 @@ define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-LE-NEXT: vldrw.u32 q1, [r1]
; CHECK-LE-NEXT: vmov d0, r2, r3
; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]!
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4i32_pre:
@@ -122,8 +121,7 @@ define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-BE-NEXT: vmov d0, r3, r2
; CHECK-BE-NEXT: vrev64.32 q2, q0
; CHECK-BE-NEXT: vpt.s32 gt, q2, zr
-; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]!
; CHECK-BE-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -142,8 +140,7 @@ define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-LE-NEXT: vldrw.u32 q1, [r1]
; CHECK-LE-NEXT: vmov d0, r2, r3
; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vstrwt.32 q1, [r0]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vstrwt.32 q1, [r0], #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4i32_post:
@@ -153,8 +150,7 @@ define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-BE-NEXT: vmov d0, r3, r2
; CHECK-BE-NEXT: vrev64.32 q2, q0
; CHECK-BE-NEXT: vpt.s32 gt, q2, zr
-; CHECK-BE-NEXT: vstrwt.32 q1, [r0]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vstrwt.32 q1, [r0], #4
; CHECK-BE-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -334,8 +330,7 @@ define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-LE-NEXT: vldrw.u32 q1, [r1]
; CHECK-LE-NEXT: vmov d0, r2, r3
; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4]!
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8i16_pre:
@@ -345,8 +340,7 @@ define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-BE-NEXT: vmov d0, r3, r2
; CHECK-BE-NEXT: vrev64.16 q2, q0
; CHECK-BE-NEXT: vpt.s16 gt, q2, zr
-; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4]!
; CHECK-BE-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -365,8 +359,7 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-LE-NEXT: vldrw.u32 q1, [r1]
; CHECK-LE-NEXT: vmov d0, r2, r3
; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vstrht.16 q1, [r0]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vstrht.16 q1, [r0], #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8i16_post:
@@ -376,8 +369,7 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-BE-NEXT: vmov d0, r3, r2
; CHECK-BE-NEXT: vrev64.16 q2, q0
; CHECK-BE-NEXT: vpt.s16 gt, q2, zr
-; CHECK-BE-NEXT: vstrht.16 q1, [r0]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vstrht.16 q1, [r0], #4
; CHECK-BE-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -416,8 +408,7 @@ define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) {
; CHECK-LE-NEXT: vldrw.u32 q1, [r1]
; CHECK-LE-NEXT: vmov d0, r2, r3
; CHECK-LE-NEXT: vpt.s8 gt, q0, zr
-; CHECK-LE-NEXT: vstrbt.8 q1, [r0, #4]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vstrbt.8 q1, [r0, #4]!
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v16i8_pre:
@@ -427,8 +418,7 @@ define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) {
; CHECK-BE-NEXT: vmov d0, r3, r2
; CHECK-BE-NEXT: vrev64.8 q2, q0
; CHECK-BE-NEXT: vpt.s8 gt, q2, zr
-; CHECK-BE-NEXT: vstrbt.8 q1, [r0, #4]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vstrbt.8 q1, [r0, #4]!
; CHECK-BE-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -447,8 +437,7 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) {
; CHECK-LE-NEXT: vldrw.u32 q1, [r1]
; CHECK-LE-NEXT: vmov d0, r2, r3
; CHECK-LE-NEXT: vpt.s8 gt, q0, zr
-; CHECK-LE-NEXT: vstrbt.8 q1, [r0]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vstrbt.8 q1, [r0], #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v16i8_post:
@@ -458,8 +447,7 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) {
; CHECK-BE-NEXT: vmov d0, r3, r2
; CHECK-BE-NEXT: vrev64.8 q2, q0
; CHECK-BE-NEXT: vpt.s8 gt, q2, zr
-; CHECK-BE-NEXT: vstrbt.8 q1, [r0]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vstrbt.8 q1, [r0], #4
; CHECK-BE-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -591,8 +579,7 @@ define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-LE-NEXT: vldrw.u32 q1, [r1]
; CHECK-LE-NEXT: vmov d0, r2, r3
; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]!
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4f32_pre:
@@ -602,8 +589,7 @@ define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-BE-NEXT: vmov d0, r3, r2
; CHECK-BE-NEXT: vrev64.32 q2, q0
; CHECK-BE-NEXT: vpt.s32 gt, q2, zr
-; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]!
; CHECK-BE-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -622,8 +608,7 @@ define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-LE-NEXT: vldrw.u32 q1, [r1]
; CHECK-LE-NEXT: vmov d0, r2, r3
; CHECK-LE-NEXT: vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT: vstrwt.32 q1, [r0]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vstrwt.32 q1, [r0], #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v4f32_post:
@@ -633,8 +618,7 @@ define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) {
; CHECK-BE-NEXT: vmov d0, r3, r2
; CHECK-BE-NEXT: vrev64.32 q2, q0
; CHECK-BE-NEXT: vpt.s32 gt, q2, zr
-; CHECK-BE-NEXT: vstrwt.32 q1, [r0]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vstrwt.32 q1, [r0], #4
; CHECK-BE-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -904,8 +888,7 @@ define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-LE-NEXT: vldrw.u32 q1, [r1]
; CHECK-LE-NEXT: vmov d0, r2, r3
; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4]!
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8f16_pre:
@@ -915,8 +898,7 @@ define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-BE-NEXT: vmov d0, r3, r2
; CHECK-BE-NEXT: vrev64.16 q2, q0
; CHECK-BE-NEXT: vpt.s16 gt, q2, zr
-; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4]!
; CHECK-BE-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4
@@ -935,8 +917,7 @@ define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-LE-NEXT: vldrw.u32 q1, [r1]
; CHECK-LE-NEXT: vmov d0, r2, r3
; CHECK-LE-NEXT: vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT: vstrht.16 q1, [r0]
-; CHECK-LE-NEXT: adds r0, #4
+; CHECK-LE-NEXT: vstrht.16 q1, [r0], #4
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: masked_v8f16_post:
@@ -946,8 +927,7 @@ define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) {
; CHECK-BE-NEXT: vmov d0, r3, r2
; CHECK-BE-NEXT: vrev64.16 q2, q0
; CHECK-BE-NEXT: vpt.s16 gt, q2, zr
-; CHECK-BE-NEXT: vstrht.16 q1, [r0]
-; CHECK-BE-NEXT: adds r0, #4
+; CHECK-BE-NEXT: vstrht.16 q1, [r0], #4
; CHECK-BE-NEXT: bx lr
entry:
%z = getelementptr inbounds i8, i8* %y, i32 4