| author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-07-01 23:03:44 +0000 |
|---|---|---|
| committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-07-01 23:03:44 +0000 |
| commit | 7f681ac7a983c8edcce930e4e4ac7c5f71c75310 | |
| tree | 3c6bcaa990cb00dbf383bb7975873775ac216e28 | |
| parent | 8af47a09e5e91e7ea8845d9403f06766f05b5395 | |
AMDGPU: Add feature for unaligned access
llvm-svn: 274398
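
The patch plumbs a new `unaligned-buffer-access` subtarget feature through the usual layers: a TableGen `SubtargetFeature` definition, a backing `UnalignedBufferAccess` flag in `AMDGPUSubtarget`, a `hasUnalignedBufferAccess()` accessor, and a default-on entry in the feature string for amdhsa targets; elsewhere it would have to be requested explicitly through the target feature string (e.g. `-mattr`). Below is a minimal, self-contained sketch of that flow, not LLVM's actual API: `FakeSubtarget`, `parseFeatures`, and `buildDefaultFeatures` are hypothetical stand-ins for `AMDGPUSubtarget` and `initializeSubtargetDependencies`.

```cpp
#include <iostream>
#include <string>

// Hypothetical stand-in for AMDGPUSubtarget; the real class gets this bit
// set by the TableGen-generated ParseSubtargetFeatures(), not string search.
struct FakeSubtarget {
  bool UnalignedBufferAccess = false;  // matches the new default in the initializer list

  void parseFeatures(const std::string &FS) {
    if (FS.find("+unaligned-buffer-access") != std::string::npos)
      UnalignedBufferAccess = true;
  }

  bool hasUnalignedBufferAccess() const { return UnalignedBufferAccess; }
};

// Hypothetical helper mirroring initializeSubtargetDependencies(): amdhsa
// targets get the feature appended to the default feature string.
std::string buildDefaultFeatures(bool IsAmdHsaOS, const std::string &UserFS) {
  std::string FullFS = "+promote-alloca,+fp64-denormals,+load-store-opt,";
  if (IsAmdHsaOS)
    FullFS += "+flat-for-global,+unaligned-buffer-access,";
  return FullFS + UserFS;
}

int main() {
  FakeSubtarget ST;
  ST.parseFeatures(buildDefaultFeatures(/*IsAmdHsaOS=*/true, ""));
  std::cout << ST.hasUnalignedBufferAccess() << '\n';  // 1 on HSA, 0 otherwise
}
```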
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 6 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 5 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 22 |
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.td | 6 |
5 files changed, 32 insertions, 12 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 607e8d9bfdd..72c45535441 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -61,6 +61,12 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
   "Support flat address space"
 >;
 
+def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
+  "UnalignedBufferAccess",
+  "true",
+  "Support unaligned global loads and stores"
+>;
+
 def FeatureXNACK : SubtargetFeature<"xnack",
   "EnableXNACK",
   "true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 9843ddf590a..10fa9cf4673 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -47,7 +47,7 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
 
   SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
-    FullFS += "+flat-for-global,";
+    FullFS += "+flat-for-global,+unaligned-buffer-access,";
   FullFS += FS;
 
   ParseSubtargetFeatures(GPU, FullFS);
@@ -85,6 +85,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     FP64Denormals(false),
     FPExceptions(false),
     FlatForGlobal(false),
+    UnalignedBufferAccess(false),
+
     EnableXNACK(false),
     DebuggerInsertNops(false),
     DebuggerReserveRegs(false),
@@ -114,7 +116,6 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     TexVTXClauseSize(0),
 
     FeatureDisable(false),
-
     InstrItins(getInstrItineraryForCPU(GPU)) {
   initializeSubtargetDependencies(TT, GPU, FS);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 44560f30327..3fe61aa449e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -74,6 +74,7 @@ protected:
   bool FP64Denormals;
   bool FPExceptions;
   bool FlatForGlobal;
+  bool UnalignedBufferAccess;
   bool EnableXNACK;
   bool DebuggerInsertNops;
   bool DebuggerReserveRegs;
@@ -254,6 +255,10 @@ public:
     return FlatForGlobal;
   }
 
+  bool hasUnalignedBufferAccess() const {
+    return UnalignedBufferAccess;
+  }
+
   bool isXNACKEnabled() const {
     return EnableXNACK;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a20f8d00e82..8f36aaa2f45 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -438,24 +438,30 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   if (!VT.isSimple() || VT == MVT::Other)
     return false;
 
-  // TODO - CI+ supports unaligned memory accesses, but this requires driver
-  // support.
-
-  // XXX - The only mention I see of this in the ISA manual is for LDS direct
-  // reads the "byte address and must be dword aligned". Is it also true for the
-  // normal loads and stores?
-  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
+  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
     // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
     // aligned, 8 byte access in a single operation using ds_read2/write2_b32
     // with adjacent offsets.
     bool AlignedBy4 = (Align % 4 == 0);
     if (IsFast)
       *IsFast = AlignedBy4;
+
     return AlignedBy4;
   }
 
+  if (Subtarget->hasUnalignedBufferAccess()) {
+    // If we have an uniform constant load, it still requires using a slow
+    // buffer instruction if unaligned.
+    if (IsFast) {
+      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
+        (Align % 4 == 0) : true;
+    }
+
+    return true;
+  }
+
   // Smaller than dword value must be aligned.
-  // FIXME: This should be allowed on CI+
   if (VT.bitsLT(MVT::i32))
     return false;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index e706dfa4124..a113ca2a25a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -183,8 +183,10 @@ def mubuf_load_atomic : PatFrag <(ops node:$ptr), (atomic_load node:$ptr), [{
 }]>;
 
 def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
-  return isConstantLoad(cast<LoadSDNode>(N), -1) &&
-         static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
+  auto Ld = cast<LoadSDNode>(N);
+  return Ld->getAlignment() >= 4 &&
+         isConstantLoad(Ld, -1) &&
+         static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
 }]>;
 
 //===----------------------------------------------------------------------===//
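
For reference, here is a rough standalone sketch of the alignment policy the SIISelLowering.cpp hunk ends up with: LDS/region accesses are legal when 4-byte aligned (split into `ds_read2`/`write2_b32` pairs), everything else is legal when the subtarget reports unaligned buffer access, and constant-address loads are only reported as fast when dword aligned (an unaligned one falls back to a slower buffer instruction). The `AddrSpace` enum and the `allowsMisaligned` signature below are illustrative stand-ins, not LLVM's real types or hook.

```cpp
// Illustrative stand-ins for the AMDGPU address spaces referenced in the hunk.
enum class AddrSpace { Global, Constant, Local, Region, Other };

// Simplified model of SITargetLowering::allowsMisalignedMemoryAccesses after
// this patch (assumed signature; the real hook takes an EVT and more context).
bool allowsMisaligned(AddrSpace AS, unsigned SizeInBits, unsigned Align,
                      bool HasUnalignedBufferAccess, bool *IsFast) {
  bool AlignedBy4 = (Align % 4 == 0);

  if (AS == AddrSpace::Local || AS == AddrSpace::Region) {
    // ds_read/write_b64 need 8-byte alignment, but a 4-byte aligned 8-byte
    // access can still be done with ds_read2/write2_b32 at adjacent offsets.
    if (IsFast)
      *IsFast = AlignedBy4;
    return AlignedBy4;
  }

  if (HasUnalignedBufferAccess) {
    // Unaligned uniform/constant loads require a slower buffer instruction,
    // so only dword-aligned constant loads are reported as fast.
    if (IsFast)
      *IsFast = (AS == AddrSpace::Constant) ? AlignedBy4 : true;
    return true;
  }

  // Without the feature, sub-dword values must be naturally aligned.
  if (SizeInBits < 32)
    return false;

  // (Remaining cases follow the pre-existing code and are omitted here.)
  return AlignedBy4;
}

int main() {
  bool Fast = false;
  bool OK = allowsMisaligned(AddrSpace::Constant, 64, 2,
                             /*HasUnalignedBufferAccess=*/true, &Fast);
  // OK == true (access is legal), Fast == false (misaligned constant load is slow).
  return (OK && !Fast) ? 0 : 1;
}
```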

