Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td      |   3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp    |   2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp    | 111
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h      |  14
-rw-r--r--  llvm/lib/Target/AMDGPU/BUFInstructions.td        |   9
-rw-r--r--  llvm/lib/Target/AMDGPU/FLATInstructions.td       |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp        |  77
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp           |  25
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstructions.td         |  21
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp        |  13
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.td         |  50
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp  |   2
12 files changed, 279 insertions, 52 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index b89ea1d5ea4..597d23a9c1a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -110,9 +110,10 @@ def CC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
+ CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+ CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
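The two hunks above give v3 types both a VGPR-tuple path and a 12-byte stack slot. A minimal standalone sketch of the stack size/alignment table follows; the enum and helper are hypothetical illustrations, not LLVM's calling-convention machinery:

```cpp
#include <cstdio>
#include <utility>

enum class VT { v2i32, v3i32, v4i32, v8i32, v16i32 };

// {size, align} in bytes, mirroring the CCAssignToStack entries above.
std::pair<unsigned, unsigned> stackSlotFor(VT Ty) {
  switch (Ty) {
  case VT::v2i32:  return {8, 4};
  case VT::v3i32:  return {12, 4}; // new v3 case: three dwords, dword-aligned
  case VT::v4i32:  return {16, 4};
  case VT::v8i32:  return {32, 4};
  case VT::v16i32: return {64, 4};
  }
  return {0, 0};
}

int main() {
  auto S = stackSlotFor(VT::v3i32);
  std::printf("v3i32 stack slot: %u bytes, align %u\n", S.first, S.second);
}
```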
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index ec55c2700e6..113880e1495 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -540,6 +540,8 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
return AMDGPU::SReg_32_XM0RegClassID;
case 2:
return AMDGPU::SReg_64RegClassID;
+ case 3:
+ return AMDGPU::SGPR_96RegClassID;
case 4:
return AMDGPU::SReg_128RegClassID;
case 8:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9fcc3355775..c8bff58126a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -150,6 +150,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
+
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
@@ -235,6 +238,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::v3f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
+
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
@@ -325,12 +331,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// Expand to fneg + fadd.
setOperationAction(ISD::FSUB, MVT::f64, Expand);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
@@ -392,7 +402,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
static const MVT::SimpleValueType VectorIntTypes[] = {
- MVT::v2i32, MVT::v4i32
+ MVT::v2i32, MVT::v3i32, MVT::v4i32
};
for (MVT VT : VectorIntTypes) {
@@ -434,7 +444,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
- MVT::v2f32, MVT::v4f32
+ MVT::v2f32, MVT::v3f32, MVT::v4f32
};
for (MVT VT : FloatVectorTypes) {
@@ -476,6 +486,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
+ setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
+
setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
@@ -1372,6 +1385,41 @@ SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}
+// Split a vector type into two parts. The first part is a power of two vector.
+// The second part is whatever is left over, and is a scalar if it would
+// otherwise be a 1-vector.
+std::pair<EVT, EVT>
+AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
+ EVT LoVT, HiVT;
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
+ LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
+ HiVT = NumElts - LoNumElts == 1
+ ? EltVT
+ : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
+ return std::make_pair(LoVT, HiVT);
+}
+
+// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
+// scalar.
+std::pair<SDValue, SDValue>
+AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
+ const EVT &LoVT, const EVT &HiVT,
+ SelectionDAG &DAG) const {
+ assert(LoVT.getVectorNumElements() +
+ (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
+ N.getValueType().getVectorNumElements() &&
+ "More vector elements requested than available!");
+ auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
+ DAG.getConstant(0, DL, IdxTy));
+ SDValue Hi = DAG.getNode(
+ HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
+ HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
+ return std::make_pair(Lo, Hi);
+}
+
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
SelectionDAG &DAG) const {
LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -1393,9 +1441,9 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
EVT LoMemVT, HiMemVT;
SDValue Lo, Hi;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
- std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
+ std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
+ std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
+ std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
unsigned Size = LoMemVT.getStoreSize();
unsigned BaseAlign = Load->getAlignment();
@@ -1410,15 +1458,52 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
- SDValue Ops[] = {
- DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
- DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
- LoLoad.getValue(1), HiLoad.getValue(1))
- };
+ auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
+ SDValue Join;
+ if (LoVT == HiVT) {
+ // This is the case that the vector is power of two so was evenly split.
+ Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
+ } else {
+ Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
+ DAG.getConstant(0, SL, IdxTy));
+ Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
+ : ISD::INSERT_VECTOR_ELT,
+ SL, VT, Join, HiLoad,
+ DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
+ }
+
+ SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
+ LoLoad.getValue(1), HiLoad.getValue(1))};
return DAG.getMergeValues(Ops, SL);
}
+// Widen a vector load from vec3 to vec4.
+SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
+ SelectionDAG &DAG) const {
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ EVT VT = Op.getValueType();
+ assert(VT.getVectorNumElements() == 3);
+ SDValue BasePtr = Load->getBasePtr();
+ EVT MemVT = Load->getMemoryVT();
+ SDLoc SL(Op);
+ const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
+ unsigned BaseAlign = Load->getAlignment();
+
+ EVT WideVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
+ EVT WideMemVT =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
+ SDValue WideLoad = DAG.getExtLoad(
+ Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
+ WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
+ return DAG.getMergeValues(
+ {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
+ DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
+ WideLoad.getValue(1)},
+ SL);
+}
+
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -1439,9 +1524,9 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
EVT LoMemVT, HiMemVT;
SDValue Lo, Hi;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
- std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
+ std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
+ std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
+ std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
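The getSplitDestVTs/splitVector pair above replaces the generic even split (DAG.GetSplitDestVTs/SplitVector) so that non-power-of-two vectors peel off a power-of-two low part. A standalone sketch of the arithmetic, with a local stand-in for llvm::PowerOf2Ceil:

```cpp
#include <cstdio>

// Stand-in for llvm::PowerOf2Ceil, kept local so the sketch is standalone.
unsigned powerOf2Ceil(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

// Mirrors getSplitDestVTs: the low part gets PowerOf2Ceil((N + 1) / 2)
// elements; the remainder is the high part, a scalar when one element is left.
void splitCounts(unsigned NumElts) {
  unsigned Lo = powerOf2Ceil((NumElts + 1) / 2);
  unsigned Hi = NumElts - Lo;
  if (Hi == 1)
    std::printf("v%u -> lo v%u + scalar\n", NumElts, Lo);
  else
    std::printf("v%u -> lo v%u + hi v%u\n", NumElts, Lo, Hi);
}

int main() {
  splitCounts(3);  // v3  -> lo v2 + scalar
  splitCounts(5);  // v5  -> lo v4 + scalar
  splitCounts(16); // v16 -> lo v8 + hi v8 (even power-of-two split)
}
```

For v3 this yields a v2 low half plus a scalar high element, which is why SplitVectorLoad rejoins the pieces with INSERT_SUBVECTOR/INSERT_VECTOR_ELT instead of a plain CONCAT_VECTORS.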
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fadbcc63845..0f79e019300 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -110,9 +110,23 @@ protected:
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
+ /// Split a vector type into two parts. The first part is a power of two
+ /// vector. The second part is whatever is left over, and is a scalar if it
+ /// would otherwise be a 1-vector.
+ std::pair<EVT, EVT> getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const;
+
+ /// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
+ /// scalar.
+ std::pair<SDValue, SDValue> splitVector(const SDValue &N, const SDLoc &DL,
+ const EVT &LoVT, const EVT &HighVT,
+ SelectionDAG &DAG) const;
+
/// Split a vector load into 2 loads of half the vector.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+ /// Widen a vector load from vec3 to vec4.
+ SDValue WidenVectorLoad(SDValue Op, SelectionDAG &DAG) const;
+
/// Split a vector store into 2 stores of half the vector.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 408c15a7b48..d720eb9d46f 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -821,7 +821,7 @@ defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
"buffer_load_dwordx2", VReg_64, v2i32, mubuf_load
>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx3", VReg_96, untyped, mubuf_load
+ "buffer_load_dwordx3", VReg_96, v3i32, mubuf_load
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
"buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
@@ -855,7 +855,7 @@ defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores <
"buffer_store_dwordx2", VReg_64, v2i32, store_global
>;
defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx3", VReg_96, untyped, store_global
+ "buffer_store_dwordx3", VReg_96, v3i32, store_global
>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
"buffer_store_dwordx4", VReg_128, v4i32, store_global
@@ -1045,9 +1045,6 @@ let SubtargetPredicate = isCIVI in {
//===----------------------------------------------------------------------===//
// Instruction definitions for CI and newer.
//===----------------------------------------------------------------------===//
-// Remaining instructions:
-// BUFFER_LOAD_DWORDX3
-// BUFFER_STORE_DWORDX3
def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
int_amdgcn_buffer_wbinvl1_vol>;
@@ -1405,6 +1402,7 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET,
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
let OtherPredicates = [D16PreservesUnusedBits] in {
@@ -1477,6 +1475,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET,
defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OFFSET, v3i32, store_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index cefcf90d0d2..52ce89c345d 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -730,6 +730,7 @@ def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, v2i32>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, v4i32>;
def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_load_flat, i32>;
@@ -739,6 +740,7 @@ def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
def : FlatStorePat <FLAT_STORE_DWORD, store_flat, i32>;
def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32>;
def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32>;
def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat, i32>;
@@ -808,6 +810,7 @@ def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, load_global, i16>;
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, v2i32>;
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX3, load_global, v3i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX4, load_global, v4i32>;
def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORD, atomic_load_global, i32>;
@@ -819,6 +822,7 @@ def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX3, store_global, v3i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>;
let OtherPredicates = [D16PreservesUnusedBits] in {
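The BUF and FLAT pattern additions above let a v3i32 access select the existing DWORDX3 machine instructions. A hedged sketch of the mapping from 32-bit element count to mnemonic (the helper below is illustrative, not a real selection hook):

```cpp
#include <cstdio>

const char *flatLoadFor(unsigned NumDwords) {
  switch (NumDwords) {
  case 1: return "flat_load_dword";
  case 2: return "flat_load_dwordx2";
  case 3: return "flat_load_dwordx3"; // newly selectable for v3i32
  case 4: return "flat_load_dwordx4";
  default: return nullptr; // wider vectors are split before selection
  }
}

int main() {
  for (unsigned N = 1; N <= 4; ++N)
    std::printf("v%ui32 -> %s\n", N, flatLoadFor(N));
}
```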
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a708cedf488..f029b3468e6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -123,6 +123,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+ addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
+ addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+
addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
@@ -150,6 +153,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
@@ -157,6 +161,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v3i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
@@ -325,6 +330,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
+ // Deal with vec3 vector operations when widened to vec4.
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Expand);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Expand);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Expand);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Expand);
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
@@ -1328,6 +1339,17 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Val,
bool Signed,
const ISD::InputArg *Arg) const {
+ // First, if it is a widened vector, narrow it.
+ if (VT.isVector() &&
+ VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
+ EVT NarrowedVT =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
+ VT.getVectorNumElements());
+ Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
+ DAG.getConstant(0, SL, MVT::i32));
+ }
+
+ // Then convert the vector elements or scalar value.
if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
VT.bitsLT(MemVT)) {
unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
@@ -6546,8 +6568,25 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
BasePtr, RealMemVT, MMO);
+ if (!MemVT.isVector()) {
+ SDValue Ops[] = {
+ DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+ NewLD.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ SmallVector<SDValue, 3> Elts;
+ for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
+ SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
+ DAG.getConstant(I, DL, MVT::i32));
+
+ Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
+ }
+
SDValue Ops[] = {
- DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+ DAG.getBuildVector(MemVT, DL, Elts),
NewLD.getValue(1)
};
@@ -6581,8 +6620,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
- if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
- return SDValue();
+ if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
+ if (MemVT.isPow2VectorType())
+ return SDValue();
+ if (NumElements == 3)
+ return WidenVectorLoad(Op, DAG);
+ return SplitVectorLoad(Op, DAG);
+ }
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
@@ -6594,8 +6638,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
!Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
- Alignment >= 4 && NumElements < 32)
- return SDValue();
+ Alignment >= 4 && NumElements < 32) {
+ if (MemVT.isPow2VectorType())
+ return SDValue();
+ if (NumElements == 3)
+ return WidenVectorLoad(Op, DAG);
+ return SplitVectorLoad(Op, DAG);
+ }
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
@@ -6607,7 +6656,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
- // v4 loads are supported for private and global memory.
+ // v3 loads not supported on SI.
+ if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
+ return WidenVectorLoad(Op, DAG);
+ // v3 and v4 loads are supported for private and global memory.
return SDValue();
}
if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
@@ -6625,6 +6677,9 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// Same as global/flat
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
+ // v3 loads not supported on SI.
+ if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
+ return WidenVectorLoad(Op, DAG);
return SDValue();
default:
llvm_unreachable("unsupported private_element_size");
@@ -7026,6 +7081,9 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorStore(Op, DAG);
+ // v3 stores not supported on SI.
+ if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
+ return SplitVectorStore(Op, DAG);
return SDValue();
} else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
switch (Subtarget->getMaxPrivateElementSize()) {
@@ -7036,7 +7094,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorStore(Op, DAG);
return SDValue();
case 16:
- if (NumElements > 4)
+ if (NumElements > 4 || NumElements == 3)
return SplitVectorStore(Op, DAG);
return SDValue();
default:
@@ -7045,7 +7103,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
} else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
// Use ds_write_b128 if possible.
if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
- VT.getStoreSize() == 16)
+ VT.getStoreSize() == 16 && NumElements != 3)
return SDValue();
if (NumElements > 2)
@@ -9624,6 +9682,9 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 64:
RC = &AMDGPU::SGPR_64RegClass;
break;
+ case 96:
+ RC = &AMDGPU::SReg_96RegClass;
+ break;
case 128:
RC = &AMDGPU::SReg_128RegClass;
break;
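The LowerLOAD changes above pick one of three outcomes for uniform constant/global vector loads. A standalone model of just that inner branch; the surrounding guard (!divergent, alignment >= 4, fewer than 32 elements) is omitted here:

```cpp
#include <cstdio>

enum class Action { Keep, Widen, Split };

// Mirrors the branch added to LowerLOAD: power-of-two vectors stay as-is,
// vec3 is widened to vec4, everything else is split.
Action legalizeUniformLoad(unsigned NumElements, bool IsPow2) {
  if (IsPow2)
    return Action::Keep;  // selectable directly (e.g. s_load_dwordxN)
  if (NumElements == 3)
    return Action::Widen; // WidenVectorLoad: vec3 load widened to vec4
  return Action::Split;   // SplitVectorLoad
}

int main() {
  std::printf("v3i32: %s\n",
              legalizeUniformLoad(3, false) == Action::Widen ? "widen" : "?");
  std::printf("v4i32: %s\n",
              legalizeUniformLoad(4, true) == Action::Keep ? "keep" : "?");
}
```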
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 878b456e1b5..8a35ef01457 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -562,7 +562,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isSGPRClass(RC)) {
- if (RI.getRegSizeInBits(*RC) > 32) {
+ // TODO: Copy vec3/vec5 with s_mov_b64s then final s_mov_b32.
+ if (!(RI.getRegSizeInBits(*RC) % 64)) {
Opcode = AMDGPU::S_MOV_B64;
EltSize = 8;
} else {
@@ -840,6 +841,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S32_SAVE;
case 8:
return AMDGPU::SI_SPILL_S64_SAVE;
+ case 12:
+ return AMDGPU::SI_SPILL_S96_SAVE;
case 16:
return AMDGPU::SI_SPILL_S128_SAVE;
case 32:
@@ -942,6 +945,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S32_RESTORE;
case 8:
return AMDGPU::SI_SPILL_S64_RESTORE;
+ case 12:
+ return AMDGPU::SI_SPILL_S96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_S128_RESTORE;
case 32:
@@ -1916,14 +1921,18 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
const int16_t *SubIndices = Sub0_15;
int NElts = DstSize / 32;
- // 64-bit select is only avaialble for SALU.
+ // 64-bit select is only available for SALU.
+ // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
if (Pred == SCC_TRUE) {
- SelOp = AMDGPU::S_CSELECT_B64;
- EltRC = &AMDGPU::SGPR_64RegClass;
- SubIndices = Sub0_15_64;
-
- assert(NElts % 2 == 0);
- NElts /= 2;
+ if (NElts % 2) {
+ SelOp = AMDGPU::S_CSELECT_B32;
+ EltRC = &AMDGPU::SGPR_32RegClass;
+ } else {
+ SelOp = AMDGPU::S_CSELECT_B64;
+ EltRC = &AMDGPU::SGPR_64RegClass;
+ SubIndices = Sub0_15_64;
+ NElts /= 2;
+ }
}
MachineInstrBuilder MIB = BuildMI(
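The insertSelect change above keeps the 64-bit SALU select for even element counts but falls back to per-element 32-bit selects when the count is odd, as the 96-bit case now requires (the TODO notes a 64+32 split would be better). A standalone sketch of that choice:

```cpp
#include <cstdio>

// Models the SALU (SCC_TRUE) path: even dword counts use s_cselect_b64 over
// register pairs; odd counts use one s_cselect_b32 per dword.
void planSelect(unsigned DstSizeBits) {
  unsigned NElts = DstSizeBits / 32;
  if (NElts % 2)
    std::printf("%u-bit: %u x s_cselect_b32\n", DstSizeBits, NElts);
  else
    std::printf("%u-bit: %u x s_cselect_b64\n", DstSizeBits, NElts / 2);
}

int main() {
  planSelect(64);  // 1 x s_cselect_b64
  planSelect(96);  // 3 x s_cselect_b32 (new odd-count path)
  planSelect(128); // 2 x s_cselect_b64
}
```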
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index a2b6f76da26..9ab755c6785 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -479,6 +479,7 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
// SI_SPILL_32_* instructions.
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
+defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
@@ -738,6 +739,22 @@ foreach Index = 0-2 in {
>;
}
+foreach Index = 0-2 in {
+ def Extract_Element_v3i32_#Index : Extract_Element <
+ i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v3i32_#Index : Insert_Element <
+ i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+
+ def Extract_Element_v3f32_#Index : Extract_Element <
+ f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+ def Insert_Element_v3f32_#Index : Insert_Element <
+ f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
+ >;
+}
+
foreach Index = 0-3 in {
def Extract_Element_v4i32_#Index : Extract_Element <
i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -869,6 +886,10 @@ def : BitConvert <i64, v4f16, VReg_64>;
def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;
+// 96-bit bitcast
+def : BitConvert <v3i32, v3f32, SGPR_96>;
+def : BitConvert <v3f32, v3i32, SGPR_96>;
+
// 128-bit bitcast
def : BitConvert <v2i64, v4i32, SReg_128>;
def : BitConvert <v4i32, v2i64, SReg_128>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index c6af4528236..294aa7b8b25 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -415,6 +415,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V128_SAVE:
case AMDGPU::SI_SPILL_V128_RESTORE:
return 4;
+ case AMDGPU::SI_SPILL_S96_SAVE:
+ case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_V96_SAVE:
case AMDGPU::SI_SPILL_V96_RESTORE:
return 3;
@@ -978,12 +980,14 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE:
return spillSGPR(MI, FI, RS, true);
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE:
return restoreSGPR(MI, FI, RS, true);
@@ -1012,6 +1016,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE: {
spillSGPR(MI, Index, RS);
@@ -1022,6 +1027,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE: {
restoreSGPR(MI, Index, RS);
@@ -1242,6 +1248,7 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
&AMDGPU::VReg_64RegClass,
&AMDGPU::SReg_64RegClass,
&AMDGPU::VReg_96RegClass,
+ &AMDGPU::SReg_96RegClass,
&AMDGPU::VReg_128RegClass,
&AMDGPU::SReg_128RegClass,
&AMDGPU::VReg_256RegClass,
@@ -1312,6 +1319,8 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
return &AMDGPU::SGPR_32RegClass;
case 64:
return &AMDGPU::SReg_64RegClass;
+ case 96:
+ return &AMDGPU::SReg_96RegClass;
case 128:
return &AMDGPU::SReg_128RegClass;
case 256:
@@ -1336,6 +1345,8 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return &AMDGPU::SGPR_32RegClass;
case 2:
return &AMDGPU::SReg_64RegClass;
+ case 3:
+ return &AMDGPU::SReg_96RegClass;
case 4:
return &AMDGPU::SReg_128RegClass;
case 8:
@@ -1603,7 +1614,7 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
&AMDGPU::SReg_64_XEXECRegClass;
case 96:
return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
- nullptr;
+ &AMDGPU::SReg_96RegClass;
case 128:
return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
&AMDGPU::SReg_128RegClass;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 551ffcbd4d9..7707e5af1b9 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -177,6 +177,12 @@ def SGPR_64Regs : RegisterTuples<getSubRegs<2>.ret,
[(add (decimate SGPR_32, 2)),
(add (decimate (shl SGPR_32, 1), 2))]>;
// SGPR 96-bit registers. No operations use these, but they are defined for symmetry with 96-bit VGPRs.
+def SGPR_96Regs : RegisterTuples<getSubRegs<3>.ret,
+ [(add (decimate SGPR_32, 3)),
+ (add (decimate (shl SGPR_32, 1), 3)),
+ (add (decimate (shl SGPR_32, 2), 3))]>;
+
// SGPR 128-bit registers
def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,
[(add (decimate SGPR_32, 4)),
@@ -424,23 +430,23 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID)> {
- let AllocationPriority = 7;
+ let AllocationPriority = 8;
}
def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
- let AllocationPriority = 7;
+ let AllocationPriority = 8;
}
def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
- let AllocationPriority = 7;
+ let AllocationPriority = 8;
}
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
- let AllocationPriority = 7;
+ let AllocationPriority = 8;
}
def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
@@ -450,7 +456,7 @@ def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 3
def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
let CopyCost = 1;
- let AllocationPriority = 8;
+ let AllocationPriority = 9;
}
def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> {
@@ -460,35 +466,47 @@ def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add
def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
- let AllocationPriority = 8;
+ let AllocationPriority = 9;
}
def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
(add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
- let AllocationPriority = 8;
+ let AllocationPriority = 9;
}
// Requires 2 s_mov_b64 to copy
let CopyCost = 2 in {
-def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add SGPR_128Regs)> {
+// There are no 3-component scalar instructions, but this is needed
+// for symmetry with VGPRs.
+def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
+ (add SGPR_96Regs)> {
let AllocationPriority = 10;
}
+def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
+ (add SGPR_96)> {
+ let AllocationPriority = 10;
+}
+
+def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add SGPR_128Regs)> {
+ let AllocationPriority = 11;
+}
+
def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add TTMP_128Regs)> {
let isAllocatable = 0;
}
def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
(add SGPR_128, TTMP_128)> {
- let AllocationPriority = 10;
+ let AllocationPriority = 11;
}
} // End CopyCost = 2
def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> {
- let AllocationPriority = 11;
+ let AllocationPriority = 13;
}
def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
@@ -499,11 +517,11 @@ def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
(add SGPR_256, TTMP_256)> {
// Requires 4 s_mov_b64 to copy
let CopyCost = 4;
- let AllocationPriority = 11;
+ let AllocationPriority = 13;
}
def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512Regs)> {
- let AllocationPriority = 12;
+ let AllocationPriority = 14;
}
def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add TTMP_512Regs)> {
@@ -514,7 +532,7 @@ def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
(add SGPR_512, TTMP_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
- let AllocationPriority = 12;
+ let AllocationPriority = 14;
}
def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
@@ -531,7 +549,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32
let AllocationPriority = 2;
}
-def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> {
+def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> {
let Size = 96;
// Requires 3 v_mov_b32 to copy
@@ -550,13 +568,13 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VG
def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> {
let Size = 256;
let CopyCost = 8;
- let AllocationPriority = 5;
+ let AllocationPriority = 6;
}
def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> {
let Size = 512;
let CopyCost = 16;
- let AllocationPriority = 6;
+ let AllocationPriority = 7;
}
def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
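The SGPR_96Regs definition above builds its tuples with decimate/shl: each operand list keeps every third register, shifted by 0, 1, and 2, so the tuples are triples of consecutive SGPRs starting at every third register. A sketch of the expansion, assuming an illustrative SGPR_32 count (exact bounds depend on the subtarget):

```cpp
#include <cstdio>

int main() {
  const unsigned NumSGPR32 = 104; // illustrative count of 32-bit SGPRs
  // decimate(SGPR_32, 3) -> s0, s3, s6, ...; the shl variants supply the
  // +1 and +2 lanes, yielding (s0,s1,s2), (s3,s4,s5), ...
  for (unsigned Base = 0; Base + 2 < NumSGPR32; Base += 3)
    std::printf("s[%u:%u] = (s%u, s%u, s%u)\n", Base, Base + 2, Base,
                Base + 1, Base + 2);
}
```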
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 96128749081..3e1cf68e265 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -814,6 +814,8 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::VReg_64RegClassID:
case AMDGPU::SReg_64_XEXECRegClassID:
return 64;
+ case AMDGPU::SGPR_96RegClassID:
+ case AMDGPU::SReg_96RegClassID:
case AMDGPU::VReg_96RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID: