Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructions.td          |  8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h              |  6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp  |  8
-rw-r--r--  llvm/lib/Target/AMDGPU/DSInstructions.td              |  1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp             | 16
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td                 |  4
6 files changed, 33 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 4f28d6fa430..e719933b052 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -248,6 +248,10 @@ class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
   return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
 }]>;
 
+class Aligned16Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
+  return cast<MemSDNode>(N)->getAlignment() >= 16;
+}]>;
+
 class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>;
 
 class StoreFrag<SDPatternOperator op> : PatFrag <
@@ -371,6 +375,10 @@ def load_align8_local : Aligned8Bytes <
   (ops node:$ptr), (load_local node:$ptr)
 >;
 
+def load_align16_local : Aligned16Bytes <
+  (ops node:$ptr), (load_local node:$ptr)
+>;
+
 def store_align8_local : Aligned8Bytes <
   (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
 >;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 830c0770706..0006636432f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -414,6 +414,12 @@ public:
     return FlatForGlobal;
   }
 
+  /// \returns If target supports ds_read/write_b128 and user enables generation
+  /// of ds_read/write_b128.
+  bool useDS128(bool UserEnable) const {
+    return CIInsts && UserEnable;
+  }
+
   /// \returns If MUBUF instructions always perform range checking, even for
   /// buffer resources used for private memory access.
   bool privateMemoryResourceIsRangeChecked() const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 4292575c601..a22c1803c23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -265,11 +265,11 @@ unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
     return 512;
   }
 
-  if (AddrSpace == AS.FLAT_ADDRESS)
-    return 128;
-  if (AddrSpace == AS.LOCAL_ADDRESS ||
+  if (AddrSpace == AS.FLAT_ADDRESS ||
+      AddrSpace == AS.LOCAL_ADDRESS ||
       AddrSpace == AS.REGION_ADDRESS)
-    return 64;
+    return 128;
+
 
   if (AddrSpace == AS.PRIVATE_ADDRESS)
     return 8 * ST->getMaxPrivateElementSize();
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index ec85da72bfa..88484c06218 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -649,6 +649,7 @@ defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
 
 let AddedComplexity = 100 in {
 
 defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">;
+defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
 
 } // End AddedComplexity = 100
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 68a45cb8817..8463b22d60a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -94,6 +94,11 @@ static cl::opt<bool> EnableVGPRIndexMode(
   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
   cl::init(false));
 
+static cl::opt<bool> EnableDS128(
+  "amdgpu-ds128",
+  cl::desc("Use DS_read/write_b128"),
+  cl::init(false));
+
 static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
   "amdgpu-frame-index-zero-bits",
   cl::desc("High bits of frame index assumed to be zero"),
@@ -5425,14 +5430,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
       llvm_unreachable("unsupported private_element_size");
     }
   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
-    if (NumElements > 2)
-      return SplitVectorLoad(Op, DAG);
-
-    if (NumElements == 2)
+    // Use ds_read_b128 if possible.
+    if (Subtarget->useDS128(EnableDS128) && Load->getAlignment() >= 16 &&
+        MemVT.getStoreSize() == 16)
       return SDValue();
 
-    // If properly aligned, if we split we might be able to use ds_read_b64.
-    return SplitVectorLoad(Op, DAG);
+    if (NumElements > 2)
+      return SplitVectorLoad(Op, DAG);
   }
   return SDValue();
 }
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 10f5c3bae3f..fb46d174bff 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -410,6 +410,9 @@ def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{
 
 def load_glue_align8 : Aligned8Bytes <
   (ops node:$ptr), (load_glue node:$ptr)
 >;
+def load_glue_align16 : Aligned16Bytes <
+  (ops node:$ptr), (load_glue node:$ptr)
+>;
 
 def load_local_m0 : LoadFrag<load_glue>, LocalAddress;
@@ -418,6 +421,7 @@ def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress;
 def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress;
 def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress;
 def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress;
+def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress;
 
 def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore,
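Illustrative note (not part of the commit): taken together, these changes let a 16-byte-aligned 128-bit load from LDS select ds_read_b128 on CI and newer subtargets when the new -amdgpu-ds128 option is passed to llc. A minimal IR sketch of the case the new patterns match — the kernel name and exact llc invocation are assumptions, not taken from the patch:

; llc -march=amdgcn -mcpu=tonga -amdgpu-ds128 < ds128.ll
define amdgpu_kernel void @load_v4i32_local(<4 x i32> addrspace(1)* %out,
                                            <4 x i32> addrspace(3)* %in) {
  ; align 16 satisfies both the Aligned16Bytes PatFrag and the LowerLOAD
  ; check (getAlignment() >= 16 with a 16-byte store size), so the load
  ; is left intact instead of being split.
  %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
  store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
  ret void
}
; Expected: a single ds_read_b128 rather than two ds_read_b64 or four
; ds_read_b32. With the flag omitted (its cl::init default is false) or a
; pre-CI subtarget, useDS128() is false and the old splitting path is kept.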