diff options
| author | Nicolai Haehnle <nhaehnle@gmail.com> | 2019-06-25 11:52:30 +0000 |
|---|---|---|
| committer | Nicolai Haehnle <nhaehnle@gmail.com> | 2019-06-25 11:52:30 +0000 |
| commit | 2710171a15e8eef351c53e3c3555fa89476268f0 (patch) | |
| tree | ac67d18f75381204115f372d595d2c4a60965533 /llvm/test | |
| parent | 08e8cb576021ce493329bbc9fa29e31cb77bfbda (diff) | |
| download | bcm5719-llvm-2710171a15e8eef351c53e3c3555fa89476268f0.tar.gz bcm5719-llvm-2710171a15e8eef351c53e3c3555fa89476268f0.zip | |
AMDGPU: Write LDS objects out as global symbols in code generation
Summary:
The symbols use the processor-specific SHN_AMDGPU_LDS section index
introduced with a previous change. The linker is then expected to resolve
relocations, which are also emitted.
Initially disabled for HSA and PAL environments until they have caught up
in terms of linker and runtime loader.
Some notes:
- The llvm.amdgcn.groupstaticsize intrinsics can no longer be lowered
to a constant at compile times, which means some tests can no longer
be applied.
The current "solution" is a terrible hack, but the intrinsic isn't
used by Mesa, so we can keep it for now.
- We no longer know the full LDS size per kernel at compile time, which
means that we can no longer generate a relevant error message at
compile time. It would be possible to add a check for the size of
individual variables, but ultimately the linker will have to perform
the final check.
Change-Id: If66dbf33fccfbf3609aefefa2558ac0850d42275
Reviewers: arsenm, rampitec, t-tye, b-sumner, jsjodin
Subscribers: qcolombet, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61494
llvm-svn: 364297
Diffstat (limited to 'llvm/test')
20 files changed, 180 insertions, 88 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll index ca661cf9a71..72e62f8dbbf 100644 --- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll @@ -81,8 +81,8 @@ define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] a @g_lds = addrspace(3) global float undef, align 4 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset: -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: ds_read_b32 v{{[0-9]+}}, [[REG]] +; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds@abs32@lo +; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]] define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) { %val = load float, float addrspace(3)* @g_lds store float %val, float addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 1314311c9cd..7487cd98e0a 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -7,8 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}write_ds_sub0_offset0_global: ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0 -; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]] -; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], 0, [[SHL]] +; GCN: v_sub_{{[iu]}}32_e32 [[BASEPTR:v[0-9]+]], {{(vcc, )?}}lds.obj@abs32@lo, [[SHL]] ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12 define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index f933dc05701..9991eb3fcbe 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -355,7 +355,8 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %ou ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}} +; GCN-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 3, {{v[0-9]+}} +; GCN-DAG: v_add_{{[iu]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VOFS]] ; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} @@ -441,8 +442,8 @@ define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, doubl ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1 define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 @@ -455,8 +456,8 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:2 define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 @@ -471,9 +472,9 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}} +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1 +; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 @@ -488,10 +489,13 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000 -; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 -; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 +; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo +; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}} +; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}} +; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]] +; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]] +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE0]] offset1:1 +; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE1]] offset1:1 ; GCN: s_endpgm define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 03436f6b3a3..a1610d44e2c 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -103,10 +103,17 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace( ; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} ; CI-DAG: s_mov_b32 m0 -; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} - -; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} -; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} +; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}} +; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]] +; +; TODO: This should be an s_mov_b32. The v_mov_b32 gets introduced by an +; early legalization of the constant bus constraint on the v_lshl_add_u32, +; and then SIFoldOperands folds in an unlucky order. +; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], {{v[0-9]+}}, 2, [[VBASE]] + +; GFX9-DAG: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} +; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; GCN: s_endpgm @@ -131,7 +138,12 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspa ; GFX9-NOT: m0 ; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} + +; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}} +; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]] +; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]] + ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { @@ -153,7 +165,12 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* ; GFX9-NOT: m0 ; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}} -; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} + +; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}} +; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]] +; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]] + ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; GCN: s_endpgm define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { @@ -389,8 +406,8 @@ define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, do ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}} +; GCN: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 define amdgpu_kernel void @store_constant_adjacent_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 @@ -402,8 +419,8 @@ define amdgpu_kernel void @store_constant_adjacent_offsets() { ; GFX9-NOT: m0 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}} +; GCN: ds_write2_b32 [[PTR]], [[VAL]], [[VAL]] offset1:2 define amdgpu_kernel void @store_constant_disjoint_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 @@ -416,9 +433,9 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() { ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} -; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 +; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}} +; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 ; GCN: s_endpgm define amdgpu_kernel void @store_misaligned64_constant_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 @@ -432,10 +449,13 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() { ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}} -; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}} -; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo +; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}} +; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}} +; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]{{$}} +; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]{{$}} +; GCN-DAG: ds_write2_b32 [[VBASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 +; GCN-DAG: ds_write2_b32 [[VBASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; GCN: s_endpgm define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 diff --git a/llvm/test/CodeGen/AMDGPU/lds-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-initializer.ll index 5ced7100596..7cfe013b845 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-initializer.ll @@ -1,7 +1,7 @@ ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s -; CHECK: in function load_init_lds_global{{.*}}: unsupported initializer for address space +; CHECK: lds: unsupported initializer for address space @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8] diff --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll new file mode 100644 index 00000000000..63e3dd880ba --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll @@ -0,0 +1,63 @@ +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s | llvm-readobj -r -t | FileCheck -check-prefixes=ELF %s + +@lds.external = external unnamed_addr addrspace(3) global [0 x i32] +@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8 + +; ELF: Relocations [ +; ELF-NEXT: Section (3) .rel.text { +; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.external 0x0 +; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.defined 0x0 +; ELF-NEXT: } +; ELF-NEXT: ] + +; ELF: Symbol { +; ELF: Name: lds.defined +; ELF-NEXT: Value: 0x8 +; ELF-NEXT: Size: 32 +; ELF-NEXT: Binding: Global (0x1) +; ELF-NEXT: Type: Object (0x1) +; ELF-NEXT: Other: 0 +; ELF-NEXT: Section: Processor Specific (0xFF00) +; ELF-NEXT: } + +; ELF: Symbol { +; ELF: Name: lds.external +; ELF-NEXT: Value: 0x4 +; ELF-NEXT: Size: 0 +; ELF-NEXT: Binding: Global (0x1) +; ELF-NEXT: Type: Object (0x1) +; ELF-NEXT: Other: 0 +; ELF-NEXT: Section: Processor Specific (0xFF00) +; ELF-NEXT: } + +; GCN-LABEL: {{^}}test_basic: +; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A] +; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}} +; +; GCN: s_add_i32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x81,A,A,A,A] +; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}} +; +; GCN: .globl lds.external +; GCN: .amdgpu_lds lds.external, 0, 4 +; GCN: .globl lds.defined +; GCN: .amdgpu_lds lds.defined, 32, 8 +define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 { +main_body: + %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1 + %tmp = load i32, i32 addrspace(3)* %gep0 + + %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0) + %mask.32 = trunc i64 %mask to i32 + %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave + store i32 %mask.32, i32 addrspace(3)* %gep1 + + %r = bitcast i32 %tmp to float + ret float %r +} + +; Function Attrs: convergent nounwind readnone +declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) #4 + +attributes #0 = { "no-signed-zeros-fp-math"="true" } +attributes #4 = { convergent nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/lds-size.ll b/llvm/test/CodeGen/AMDGPU/lds-size.ll index ff78c3bcb18..a7f3d778755 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-size.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-size.ll @@ -1,4 +1,3 @@ -; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=ALL -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll index a3b3d7b341f..367dd173f78 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -1,7 +1,7 @@ ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s -; CHECK: in function load_zeroinit_lds_global{{.*}}: unsupported initializer for address space +; CHECK: lds: unsupported initializer for address space @lds = addrspace(3) global [256 x i32] zeroinitializer diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index c4e631590d5..4d49d87c67a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -268,7 +268,11 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0 ; CIVI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}} +; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]] +; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]] + ; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -412,7 +416,11 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa ; CIVI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} +; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}} +; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]] +; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]] + ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index 689e2b6300f..111f131177b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -131,7 +131,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa @lds0 = addrspace(3) global [512 x i32] undef, align 4 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}} +; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]] +; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]] ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -325,7 +328,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 @lds1 = addrspace(3) global [512 x i64] undef, align 8 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} +; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}} +; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]] +; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo +; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]] ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll index d26fab4cebe..3224d8a3594 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,NOHSA %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s @lds0 = addrspace(3) global [512 x float] undef, align 4 @lds1 = addrspace(3) global [256 x float] undef, align 4 @@ -8,7 +8,8 @@ @large = addrspace(3) global [4096 x i32] undef, align 4 ; CHECK-LABEL: {{^}}groupstaticsize_test0: -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}} +; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo +; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}} define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 64 @@ -22,7 +23,8 @@ define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 a } ; CHECK-LABEL: {{^}}groupstaticsize_test1: -; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}} +; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo +; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}} define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { entry: %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 @@ -50,7 +52,8 @@ endif: ; preds = %else, %if ; Exceeds 16-bit simm limit of s_movk_i32 ; CHECK-LABEL: {{^}}large_groupstaticsize: -; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} +; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo +; HSA: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 { %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx store volatile i32 0, i32 addrspace(3)* %gep diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index 47b6558241b..a070488a4bc 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -3,12 +3,6 @@ @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4 -; Check that the LDS size emitted correctly -; SI: .long 47180 -; SI-NEXT: .long 65668 -; CI: .long 47180 -; CI-NEXT: .long 32900 - ; GCN-LABEL: {{^}}local_memory: ; GCN-NOT: s_wqm_b64 @@ -57,6 +51,7 @@ entry: ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]] ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 + define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.ll b/llvm/test/CodeGen/AMDGPU/local-memory.ll index 6124237d763..5e820792aee 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.ll @@ -10,8 +10,8 @@ ; not an immediate. ; FUNC-LABEL: {{^}}load_i32_local_const_ptr: -; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0 -; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4 +; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo +; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4 ; R600: LDS_READ_RET define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll index 1252a5c0c02..c384eb5b425 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-store-crash.ll @@ -7,7 +7,8 @@ @tess_lds = external addrspace(3) global [8192 x i32] ; CHECK-LABEL: {{^}}main: -; CHECK: ds_write2_b32 +; CHECK: ds_write_b32 +; CHECK: ds_write_b32 ; CHECK: v_mov_b32_e32 v1, v0 ; CHECK: tbuffer_store_format_xyzw v[0:3], define amdgpu_vs void @main(i32 inreg %arg) { diff --git a/llvm/test/CodeGen/AMDGPU/over-max-lds-size.ll b/llvm/test/CodeGen/AMDGPU/over-max-lds-size.ll deleted file mode 100644 index 57777e783c5..00000000000 --- a/llvm/test/CodeGen/AMDGPU/over-max-lds-size.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -check-prefix=ERROR %s -; RUN: not llc -march=amdgcn -mcpu=hawaii < %s 2>&1 | FileCheck -check-prefix=ERROR %s -; RUN: not llc -march=amdgcn -mcpu=fiji < %s 2>&1 | FileCheck -check-prefix=ERROR %s - -; ERROR: error: local memory limit exceeded (400000) in use_huge_lds - -@huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4 - -define amdgpu_kernel void @use_huge_lds() { -entry: - %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0 - store i32 0, i32 addrspace(3)* %v0 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll index 4403d1f2306..4fd9c01a57f 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll @@ -8,7 +8,8 @@ ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { ; IR: alloca [10 x i32] ; ASM-LABEL: {{^}}promote_alloca_size_256: -; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only) +; ASM: .amdgpu_lds global_array0, 30000, 4 +; ASM: .amdgpu_lds global_array1, 30000, 4 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll index 379dcd62302..6c8891d28d6 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -33,7 +33,11 @@ define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 add ; remaining add use goes through the normal shl + add constant fold. ; GCN-LABEL: {{^}}load_shl_base_lds_1: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}} + +; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation +; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]] + ; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8 ; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}} ; GCN-DAG: buffer_store_dword [[RESULT]] @@ -68,10 +72,18 @@ define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i ; The two globals are placed adjacent in memory, so the same base ; pointer can be used with an offset into the second one. +; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints + ; GCN-LABEL: {{^}}load_shl_base_lds_2: -; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} +; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}} +; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]] +; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]] ; GCN: s_mov_b32 m0, -1 -; GCN-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 + +; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256 +; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256 +; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9 + ; GCN: s_endpgm define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll index 9a731c6bbbc..54b4b4d2586 100644 --- a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -4,16 +4,11 @@ ; These tests check that the compiler won't crash when it needs to spill ; SGPRs. -@ddxy_lds = external addrspace(3) global [64 x i32] - ; GCN-LABEL: {{^}}main: ; GCN: s_wqm ; Make sure not emitting unused scratch resource descriptor setup ; GCN-NOT: s_mov_b32 -; GCN-NOT: s_mov_b32 -; GCN-NOT: s_mov_b32 -; GCN-NOT: s_mov_b32 ; GCN: s_mov_b32 m0 @@ -26,6 +21,7 @@ ; TOVGPR: ScratchSize: 0{{$}} define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { main_body: + %lds = inttoptr i32 0 to [64 x i32] addrspace(3)* %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0 %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0 %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0) @@ -203,18 +199,18 @@ main_body: %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0 %mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0) - %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109 + %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp109 %tmp111 = bitcast float %p2.i to i32 store i32 %tmp111, i32 addrspace(3)* %tmp110 %tmp112 = bitcast float %p2.i96 to i32 store i32 %tmp112, i32 addrspace(3)* %tmp110 %mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1) - %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp113 + %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp113 %tmp115 = and i32 %tmp113, -4 - %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115 + %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp115 %tmp117 = add i32 %tmp115, 1 - %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117 + %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp117 %tmp119 = bitcast float %p2.i to i32 store i32 %tmp119, i32 addrspace(3)* %tmp114 %tmp120 = load i32, i32 addrspace(3)* %tmp116 @@ -241,7 +237,7 @@ main_body: %tmp140 = fmul float %tmp59, %p2.i96 %mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2) - %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141 + %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp141 %tmp143 = bitcast float %tmp137 to i32 store i32 %tmp143, i32 addrspace(3)* %tmp142 %tmp144 = bitcast float %tmp138 to i32 @@ -252,11 +248,11 @@ main_body: store i32 %tmp146, i32 addrspace(3)* %tmp142 %mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3) - %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp147 + %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp147 %tmp149 = and i32 %tmp147, -4 - %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp149 + %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp149 %tmp151 = add i32 %tmp149, 2 - %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp151 + %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp151 %tmp153 = bitcast float %tmp137 to i32 store i32 %tmp153, i32 addrspace(3)* %tmp148 %tmp154 = load i32, i32 addrspace(3)* %tmp150 diff --git a/llvm/test/CodeGen/AMDGPU/target-cpu.ll b/llvm/test/CodeGen/AMDGPU/target-cpu.ll index cc4c98bb678..4750cf2020d 100644 --- a/llvm/test/CodeGen/AMDGPU/target-cpu.ll +++ b/llvm/test/CodeGen/AMDGPU/target-cpu.ll @@ -78,7 +78,6 @@ define amdgpu_kernel void @target_fiji() #4 { ; CHECK-LABEL: {{^}}promote_alloca_enabled: ; CHECK: ds_read_b32 -; CHECK: ; LDSByteSize: 5120 define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index df004562d49..4e233495f5f 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -10,7 +10,7 @@ ; CHECK: machineFunctionInfo: ; CHECK-NEXT: explicitKernArgSize: 128 ; CHECK-NEXT: maxKernArgAlign: 64 -; CHECK-NEXT: ldsSize: 2048 +; CHECK-NEXT: ldsSize: 0 ; CHECK-NEXT: isEntryFunction: true ; CHECK-NEXT: noSignedZerosFPMath: false ; CHECK-NEXT: memoryBound: false |

