| author | Jan Vesely <jan.vesely@rutgers.edu> | 2016-09-02 19:07:06 +0000 |
|---|---|---|
| committer | Jan Vesely <jan.vesely@rutgers.edu> | 2016-09-02 19:07:06 +0000 |
| commit | 00864886f4683e95dc38e4060701ec52a5ee6ff8 (patch) | |
| tree | ba974d0a6856e6a25d4cf834eca9c9a6db46e8db /llvm | |
| parent | 341e825eae615f8019c667f15ff62ebc4ade005a (diff) | |
| download | bcm5719-llvm-00864886f4683e95dc38e4060701ec52a5ee6ff8.tar.gz bcm5719-llvm-00864886f4683e95dc38e4060701ec52a5ee6ff8.zip | |
AMDGPU/R600: Expand unaligned writes to local and global AS
Only the LOCAL and GLOBAL address spaces are handled here; PRIVATE needs special treatment.
Differential Revision: https://reviews.llvm.org/D23971
llvm-svn: 280526
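
For reference, the shape of IR this change targets is a store whose alignment is below the natural alignment of its memory type. A minimal example in the spirit of the store_v4i8_unaligned test added below (the function name here is illustrative, not from the patch):

```llvm
; A byte-aligned <4 x i8> store to the global address space (addrspace(1)).
; The new tests expect this to become four MEM_RAT MSKOR writes on
; Evergreen/Cayman and four buffer_store_byte instructions on SI.
define void @unaligned_global_store(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
entry:
  %trunc = trunc <4 x i32> %in to <4 x i8>
  store <4 x i8> %trunc, <4 x i8> addrspace(1)* %out, align 1
  ret void
}
```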
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 13 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/store-global.ll | 135 |
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/store-local.ll | 49 |
3 files changed, 189 insertions, 8 deletions
```diff
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 5e525fbb522..8c252e8c725 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1120,26 +1120,36 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   unsigned AS = StoreNode->getAddressSpace();
   SDValue Value = StoreNode->getValue();
   EVT ValueVT = Value.getValueType();
+  EVT MemVT = StoreNode->getMemoryVT();
+  unsigned Align = StoreNode->getAlignment();
 
   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
       ValueVT.isVector()) {
     return SplitVectorStore(Op, DAG);
   }
 
+  // Private AS needs special fixes
+  if (Align < MemVT.getStoreSize() && (AS != AMDGPUAS::PRIVATE_ADDRESS) &&
+      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, NULL)) {
+    return expandUnalignedStore(StoreNode, DAG);
+  }
+
   SDLoc DL(Op);
   SDValue Chain = StoreNode->getChain();
   SDValue Ptr = StoreNode->getBasePtr();
 
   if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
+    // It is beneficial to create MSKOR here instead of combiner to avoid
+    // artificial dependencies introduced by RMW
     if (StoreNode->isTruncatingStore()) {
       EVT VT = Value.getValueType();
       assert(VT.bitsLE(MVT::i32));
-      EVT MemVT = StoreNode->getMemoryVT();
       SDValue MaskConstant;
       if (MemVT == MVT::i8) {
         MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
       } else {
         assert(MemVT == MVT::i16);
+        assert(StoreNode->getAlignment() >= 2);
         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
       }
       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
@@ -1183,7 +1193,6 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   if (AS != AMDGPUAS::PRIVATE_ADDRESS)
     return SDValue();
 
-  EVT MemVT = StoreNode->getMemoryVT();
   if (MemVT.bitsLT(MVT::i32))
     return lowerPrivateTruncStore(StoreNode, DAG);
 
diff --git a/llvm/test/CodeGen/AMDGPU/store-global.ll b/llvm/test/CodeGen/AMDGPU/store-global.ll
index 3604a97131e..659ea8718d7 100644
--- a/llvm/test/CodeGen/AMDGPU/store-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-global.ll
@@ -5,6 +5,11 @@
 
 ; FUNC-LABEL: {{^}}store_i1:
 ; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+
 ; GCN: buffer_store_byte
 define void @store_i1(i1 addrspace(1)* %out) {
 entry:
@@ -15,6 +20,7 @@ entry:
 ; i8 store
 ; FUNC-LABEL: {{^}}store_i8:
 ; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+; EG-NOT: MEM_RAT MSKOR
 
 ; IG 0: Get the byte index and truncate the value
 ; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
@@ -45,6 +51,7 @@ entry:
 ; i16 store
 ; FUNC-LABEL: {{^}}store_i16:
 ; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+; EG-NOT: MEM_RAT MSKOR
 
 ; IG 0: Get the byte index and truncate the value
 
@@ -78,6 +85,9 @@ entry:
 ; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
 ; GCN-DAG: buffer_store_byte
 ; GCN-DAG: buffer_store_short
+
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
 define void @store_i24(i24 addrspace(1)* %out, i24 %in) {
 entry:
   store i24 %in, i24 addrspace(1)* %out
@@ -88,6 +98,12 @@ entry:
 ; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
 ; GCN: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
 ; GCN: buffer_store_dword [[VAND]]
+
+; EG: MEM_RAT_CACHELESS STORE_RAW
+; EG-NOT: MEM_RAT
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+; CM-NOT: MEM_RAT
 define void @store_i25(i25 addrspace(1)* %out, i25 %in) {
 entry:
   store i25 %in, i25 addrspace(1)* %out
@@ -95,9 +111,13 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}store_v2i8:
+; v2i8 is naturally 2B aligned
 ; EG: MEM_RAT MSKOR
 ; EG-NOT: MEM_RAT MSKOR
 
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+
 ; GCN: buffer_store_short
 define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
 entry:
@@ -106,6 +126,23 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}store_v2i8_unaligned:
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+
+; SI: buffer_store_byte
+define void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+  %0 = trunc <2 x i32> %in to <2 x i8>
+  store <2 x i8> %0, <2 x i8> addrspace(1)* %out, align 1
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}store_v2i16:
 ; EG: MEM_RAT_CACHELESS STORE_RAW
 
@@ -120,6 +157,26 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}store_v2i16_unaligned:
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_short
+; SI: buffer_store_short
+define void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+  %0 = trunc <2 x i32> %in to <2 x i16>
+  store <2 x i16> %0, <2 x i16> addrspace(1)* %out, align 2
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}store_v4i8:
 ; EG: MEM_RAT_CACHELESS STORE_RAW
 
@@ -133,6 +190,54 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}store_v4i8_unaligned:
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI-NOT: buffer_store_dword
+define void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+  %0 = trunc <4 x i32> %in to <4 x i8>
+  store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 1
+  ret void
+}
+
+; FUNC-LABEL: {{^}}store_v4i8_halfaligned:
+; EG: MEM_RAT MSKOR
+; EG: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT MSKOR
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT MSKOR
+; CM: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT MSKOR
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI-NOT: buffer_store_dword
+define void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+  %0 = trunc <4 x i32> %in to <4 x i8>
+  store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 2
+  ret void
+}
+
 ; floating-point store
 ; FUNC-LABEL: {{^}}store_f32:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
@@ -147,7 +252,9 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
 }
 
 ; FUNC-LABEL: {{^}}store_v4i16:
-; MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}
 
 ; GCN: buffer_store_dwordx2
 define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
@@ -198,6 +305,20 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}store_v4i32_unaligned:
+; EG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XYZW}}
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
+
+; CM: MEM_RAT_CACHELESS STORE_DWORD
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
+
+; SI: buffer_store_dwordx4
+define void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
+  ret void
+}
+
 ; v4f32 store
 ; FUNC-LABEL: {{^}}store_v4f32:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
@@ -215,6 +336,9 @@ define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1
 
 ; FUNC-LABEL: {{^}}store_i64_i8:
 ; EG: MEM_RAT MSKOR
+
+; CM: MEM_RAT MSKOR
+
 ; GCN: buffer_store_byte
 define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
 entry:
@@ -234,16 +358,15 @@ entry:
 }
 
 ; The stores in this function are combined by the optimizer to create a
-; 64-bit store with 32-bit alignment. This is legal for GCN and the legalizer
+; 64-bit store with 32-bit alignment. This is legal and the legalizer
 ; should not try to split the 64-bit store back into 2 32-bit stores.
-;
-; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
-; be two 32-bit stores.
 
 ; FUNC-LABEL: {{^}}vecload2:
-; EG: MEM_RAT_CACHELESS STORE_RAW
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XY, T[0-9]+\.X}}, 1
+; EG-NOT: MEM_RAT_CACHELESS STORE_RAW
 
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
+; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dwordx2
 define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/store-local.ll b/llvm/test/CodeGen/AMDGPU/store-local.ll
index 857be849ef5..03fd30ca9a2 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.ll
@@ -5,6 +5,9 @@
 
 ; FUNC-LABEL: {{^}}store_local_i1:
 ; EG: LDS_BYTE_WRITE
+
+; CM: LDS_BYTE_WRITE
+
 ; GCN: ds_write_b8
 define void @store_local_i1(i1 addrspace(3)* %out) {
 entry:
@@ -15,6 +18,8 @@ entry:
 ; FUNC-LABEL: {{^}}store_local_i8:
 ; EG: LDS_BYTE_WRITE
 
+; CM: LDS_BYTE_WRITE
+
 ; GCN: ds_write_b8
 define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
   store i8 %in, i8 addrspace(3)* %out
@@ -24,6 +29,8 @@ define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
 ; FUNC-LABEL: {{^}}store_local_i16:
 ; EG: LDS_SHORT_WRITE
 
+; CM: LDS_SHORT_WRITE
+
 ; GCN: ds_write_b16
 define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
   store i16 %in, i16 addrspace(3)* %out
@@ -54,12 +61,54 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}store_local_v4i8_unaligned:
+; EG: LDS_BYTE_WRITE
+; EG: LDS_BYTE_WRITE
+; EG: LDS_BYTE_WRITE
+; EG: LDS_BYTE_WRITE
+; EG-NOT: LDS_WRITE
+
+; CM: LDS_BYTE_WRITE
+; CM: LDS_BYTE_WRITE
+; CM: LDS_BYTE_WRITE
+; CM: LDS_BYTE_WRITE
+; CM-NOT: LDS_WRITE
+
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+; GCN: ds_write_b8
+define void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+entry:
+  store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 1
+  ret void
+}
+
+; FUNC-LABEL: {{^}}store_local_v4i8_halfaligned:
+; EG: LDS_SHORT_WRITE
+; EG: LDS_SHORT_WRITE
+; EG-NOT: LDS_WRITE
+
+; CM: LDS_SHORT_WRITE
+; CM: LDS_SHORT_WRITE
+; CM-NOT: LDS_WRITE
+
+; GCN: ds_write_b16
+; GCN: ds_write_b16
+define void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+entry:
+  store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 2
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}store_local_v2i32:
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
+; EG-NOT: LDS_WRITE
 
 ; CM: LDS_WRITE
 ; CM: LDS_WRITE
+; CM-NOT: LDS_WRITE
 
 ; GCN: ds_write_b64
 define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
```
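
The crux of the lowering change is the new early-out in R600TargetLowering::LowerSTORE: when a LOCAL or GLOBAL store is less aligned than its memory type's store size and allowsMisalignedMemoryAccesses rejects the access, the store is handed to the target-independent expandUnalignedStore. The expansion runs on SelectionDAG nodes, but its effect can be pictured at the IR level as a shift/truncate split into naturally aligned narrower stores. A rough sketch under that reading (function names and the exact decomposition are illustrative, not taken from the patch):

```llvm
; An i16 store that is only byte-aligned...
define void @store_i16_align1(i16 addrspace(1)* %p, i16 %v) {
entry:
  store i16 %v, i16 addrspace(1)* %p, align 1
  ret void
}

; ...conceptually becomes two naturally aligned i8 stores
; (low byte at the base address; R600 is little-endian).
define void @store_i16_align1_expanded(i16 addrspace(1)* %p, i16 %v) {
entry:
  %lo = trunc i16 %v to i8
  %shr = lshr i16 %v, 8
  %hi = trunc i16 %shr to i8
  %base = bitcast i16 addrspace(1)* %p to i8 addrspace(1)*
  %p1 = getelementptr i8, i8 addrspace(1)* %base, i32 1
  store i8 %lo, i8 addrspace(1)* %base, align 1
  store i8 %hi, i8 addrspace(1)* %p1, align 1
  ret void
}
```

PRIVATE is excluded from the new check, presumably because private stores already go through their own read-modify-write path (lowerPrivateTruncStore for sub-dword types), as the code below the new check shows.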

