diff options
 -rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp                               |  6
 -rw-r--r--  llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll                           | 37
 -rw-r--r--  llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll | 26
3 files changed, 34 insertions(+), 35 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3de6a546a4f..1bc430b3e06 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1069,7 +1069,11 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (!Subtarget->hasUnalignedScratchAccess() && (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || AddrSpace == AMDGPUAS::FLAT_ADDRESS)) { - return false; + bool AlignedBy4 = Align >= 4; + if (IsFast) + *IsFast = AlignedBy4; + + return AlignedBy4; } if (Subtarget->hasUnalignedBufferAccess()) { diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll index b0268dc61ee..bf2f5d38bab 100644 --- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -78,6 +78,16 @@ entry: ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24 +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28 +; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32 +; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36 + +; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:20 +; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:24 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:28 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:32 + ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8 ; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12 ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16 @@ -90,15 +100,6 @@ entry: ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:12 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:16 -; GCN: buffer_load_dword 
[[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36 - -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:28 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:32 ; GCN: s_swappc_b64 ; GCN-NOT: v_readlane_b32 s32 @@ -271,6 +272,16 @@ entry: ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24 +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28 +; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32 +; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36 + +; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24 +; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36 + ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8 ; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12 ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16 @@ -283,15 +294,7 @@ entry: ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20 -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36 -; GCN-DAG: 
buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36 ; GCN: s_swappc_b64 ; GCN-NOT: v_readlane_b32 s32 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll index a44c1321fd9..4292cbcec85 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll @@ -8,13 +8,13 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32 -; ALIGNED: store i32 -; ALIGNED: store i32 -; ALIGNED: store i32 -; ALIGNED: store i32 +; ELT4-ALIGNED: store i32 +; ELT4-ALIGNED: store i32 +; ELT4-ALIGNED: store i32 +; ELT4-ALIGNED: store i32 -; ELT8-UNALIGNED: store <2 x i32> -; ELT8-UNALIGNED: store <2 x i32> +; ELT8: store <2 x i32> +; ELT8: store <2 x i32> ; ELT16-UNALIGNED: store <4 x i32> define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 { @@ -167,18 +167,10 @@ define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8( ; ELT4: store i32 ; ELT4: store i32 -; ELT8-ALIGNED: store i32 -; ELT8-ALIGNED: store i32 -; ELT8-ALIGNED: store i32 +; ELT8: store <2 x i32> +; ELT8: store i32 -; ELT8-UNALIGNED: store <2 x i32> -; ELT8-UNALIGNED: store i32 - -; ELT16-ALIGNED: store i32 -; ELT16-ALIGNED: store i32 -; ELT16-ALIGNED: store i32 - -; ELT16-UNALIGNED: store <3 x i32> +; ELT16: store <3 x i32> define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 { %out.gep.1 = getelementptr 
i32, i32 addrspace(5)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2

