diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 34 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 |
3 files changed, 44 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6d6ab084ee6..4292575c601 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -234,12 +234,38 @@ unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const { return 32; } +unsigned AMDGPUTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const { + unsigned VecRegBitWidth = VF * LoadSize; + if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32) + // TODO: Support element-size less than 32bit? + return 128 / LoadSize; + + return VF; +} + +unsigned AMDGPUTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const { + unsigned VecRegBitWidth = VF * StoreSize; + if (VecRegBitWidth > 128) + return 128 / StoreSize; + + return VF; +} + unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { AMDGPUAS AS = ST->getAMDGPUAS(); if (AddrSpace == AS.GLOBAL_ADDRESS || AddrSpace == AS.CONSTANT_ADDRESS || - AddrSpace == AS.CONSTANT_ADDRESS_32BIT || - AddrSpace == AS.FLAT_ADDRESS) + AddrSpace == AS.CONSTANT_ADDRESS_32BIT) { + if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + return 128; + return 512; + } + + if (AddrSpace == AS.FLAT_ADDRESS) return 128; if (AddrSpace == AS.LOCAL_ADDRESS || AddrSpace == AS.REGION_ADDRESS) @@ -250,8 +276,8 @@ unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && (AddrSpace == AS.PARAM_D_ADDRESS || AddrSpace == AS.PARAM_I_ADDRESS || - (AddrSpace >= AS.CONSTANT_BUFFER_0 && - AddrSpace <= AS.CONSTANT_BUFFER_15))) + (AddrSpace >= AS.CONSTANT_BUFFER_0 && + AddrSpace <= AS.CONSTANT_BUFFER_15))) return 128; llvm_unreachable("unhandled address space"); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 8899d2c6da8..a112757173d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -118,6 +118,12 @@ public: unsigned getNumberOfRegisters(bool Vector) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getMinVectorRegisterBitWidth() const; + unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const; + unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 54aef36333d..68a45cb8817 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3454,6 +3454,10 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { return false; } +static bool isDwordAligned(unsigned Alignment) { + return Alignment % 4 == 0; +} + //===----------------------------------------------------------------------===// // Custom DAG Lowering Operations //===----------------------------------------------------------------------===// @@ -5353,9 +5357,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); + unsigned Alignment = Load->getAlignment(); unsigned AS = Load->getAddressSpace(); if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - AS, Load->getAlignment())) { + AS, Alignment)) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, DL); @@ -5383,7 +5388,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || AS == AMDGPUASI.GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && - !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) + !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && + isDwordAligned(Alignment)) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private |