author    Matt Arsenault <Matthew.Arsenault@amd.com>  2016-07-01 22:47:50 +0000
committer Matt Arsenault <Matthew.Arsenault@amd.com>  2016-07-01 22:47:50 +0000
commit    327bb5ad82328bb92907bb2acf87e2282593e1e4 (patch)
tree      96f20ec1942050bdab64dc965a2f9f5e292994ce /llvm/test/CodeGen
parent    591ff8376bbf3ee0cc6f6006300878ae96475a49 (diff)
AMDGPU: Improve load/store of illegal types.
There was previously a combine that handled only the simple copy case.
Split this into handling loads and stores separately.
We might want to change how this handles some of the vector
extloads, since this can result in large code size increases.
llvm-svn: 274394
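The effect of the change is visible directly in the updated tests: a copy of an illegal type such as <4 x i8> used to be scalarized into four byte loads and four byte stores, and now compiles to a single dword load/store pair. A minimal sketch of the pattern being exercised (it mirrors test_copy_v4i8 from copy-illegal-type.ll below; the function name here is illustrative):

    define void @copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
      ; <4 x i8> is not a legal type on SI; with this change the copy is
      ; expected to lower to one buffer_load_dword + one buffer_store_dword
      ; instead of four buffer_load_ubyte / buffer_store_byte pairs.
      %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
      store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
      ret void
    }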
Diffstat (limited to 'llvm/test/CodeGen')
27 files changed, 398 insertions, 1341 deletions
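A recurring pattern in the new CHECK lines is repacking sub-dword elements into a full dword before a single wide store. A hedged sketch of that idiom, assembled from the check lines below (register numbers and buffer operands are illustrative):

    v_lshlrev_b32_e32 v1, 16, v1          ; move the high 16-bit element into bits 31:16
    v_or_b32_e32 v0, v0, v1               ; pack both halves into one 32-bit register
    buffer_store_dword v0, off, s[0:3], 0 ; one dword store replaces two short stores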
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index f3086fe2c71..f37247361ec 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -123,12 +123,11 @@ entry:
 ; SI: s_add_u32
 ; SI: s_addc_u32
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-DAG: ADDC_UINT
 ; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
 define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
@@ -145,12 +144,11 @@ entry:
 ; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
 ; SI-NOT: v_addc_u32_e32 s
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-DAG: ADDC_UINT
 ; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
 define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
@@ -165,12 +163,11 @@ entry:
 ; SI: s_add_u32
 ; SI: s_addc_u32
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: ADD_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-DAG: ADDC_UINT
 ; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}[[HI]]
+; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
 define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 7da8bde441b..00d2257f4ad 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -54,31 +54,12 @@ define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
 }
 ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: v_add
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-; SI-DAG: buffer_store_byte
-
-; After scalarizing v4i8 loads is fixed.
-; XSI: buffer_load_dword -; XSI: V_BFE -; XSI: V_ADD -; XSI: V_ADD -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI: buffer_store_dword +; SI: buffer_load_dword +; SI-DAG: v_lshrrev_b32 +; SI: v_and_b32 +; SI: v_or_b32 +; SI-DAG: buffer_store_dword +; SI-DAG: buffer_store_dword ; SI: s_endpgm define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { @@ -90,34 +71,14 @@ define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> add } ; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: v_add -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte -; SI-DAG: buffer_store_byte - -; XSI: buffer_load_dword -; XSI: BFE -; XSI: buffer_store_dword -; XSI: V_ADD -; XSI: buffer_store_dword -; XSI-NEXT: buffer_store_dword - +; SI: buffer_load_dword +; SI-DAG: v_lshrrev_b32 +; SI-DAG: v_add_i32 +; SI-DAG: v_and_b32 +; SI-DAG: v_or_b32 +; SI-DAG: buffer_store_dword +; SI: buffer_store_dword +; SI: buffer_store_dword ; SI: s_endpgm define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 @@ -128,21 +89,50 @@ define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> ret void } -; FUNC-LABEL: {{^}}test_copy_v3i8: -; SI-NOT: bfe -; SI-NOT: bfi +; FUNC-LABEL: {{^}}test_copy_v3i8_align4: +; SI: buffer_load_dword +; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; SI: s_endpgm -define void @test_copy_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 ret void } +; FUNC-LABEL: {{^}}test_copy_v3i8_align2: +; SI-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; SI-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; SI-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; SI: s_endpgm +define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { + %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2 + store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2 + ret void +} + +; FUNC-LABEL: {{^}}test_copy_v3i8_align1: +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte + +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: buffer_store_byte +; SI: s_endpgm +define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { + %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1 + store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1 + ret void +} + ; FUNC-LABEL: 
{{^}}test_copy_v4i8_volatile_load: ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte ; SI: buffer_load_ubyte +; SI: buffer_store_dword ; SI: s_endpgm define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index dca0cc1edb4..d0976b7d45b 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -170,16 +170,15 @@ define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind { ; FIXME: Should not have extra add ; FUNC-LABEL: {{^}}v_ctpop_i128: -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VAL2:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} +; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v[[VAL2]], 0 -; GCN: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] +; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 +; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 -; GCN: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v[[VAL1]], [[MIDRESULT2]] +; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 +; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]] -; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT2]], [[MIDRESULT1]] +; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT1]], [[MIDRESULT2]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index f85e4e9c598..447d0743368 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -15,12 +15,9 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n } ; SI-LABEL: {{^}}load_v2i8_to_v2f32: -; SI: buffer_load_ushort [[LOADREG:v[0-9]+]], -; SI-NOT: bfe -; SI-NOT: lshr -; SI-NOT: and -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]] -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]] +; SI: buffer_load_ushort [[LD:v[0-9]+]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]] ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 @@ -30,11 +27,11 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> } ; SI-LABEL: {{^}}load_v3i8_to_v3f32: -; SI-NOT: bfe +; SI: buffer_load_dword [[VAL:v[0-9]+]] ; SI-NOT: v_cvt_f32_ubyte3_e32 -; SI-DAG: v_cvt_f32_ubyte2_e32 -; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], -; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], +; SI-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]] +; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] +; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, define void @load_v3i8_to_v3f32(<3 
x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 @@ -83,26 +80,25 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out ret void } -; XXX - This should really still be able to use the v_cvt_f32_ubyte0 -; for each component, but computeKnownBits doesn't handle vectors very -; well. - +; Instructions still emitted to repack bytes for add use. ; SI-LABEL: {{^}}load_v4i8_to_v4f32_2_uses: -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: buffer_load_ubyte -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 -; SI: v_cvt_f32_ubyte0_e32 - -; XXX - replace with this when v4i8 loads aren't scalarized anymore. -; XSI: buffer_load_dword -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 -; XSI: v_cvt_f32_u32_e32 +; SI: buffer_load_dword +; SI-DAG: v_cvt_f32_ubyte0_e32 +; SI-DAG: v_cvt_f32_ubyte1_e32 +; SI-DAG: v_cvt_f32_ubyte2_e32 +; SI-DAG: v_cvt_f32_ubyte3_e32 + +; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24 +; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16 +; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 +; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8 +; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, +; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00, +; SI-DAG: v_add_i32 + +; SI: buffer_store_dwordx4 +; SI: buffer_store_dword + ; SI: s_endpgm define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 73b8879d1b9..d21d66176a1 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; half args should be promoted to float @@ -15,8 +15,9 @@ define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { ; GCN-LABEL: {{^}}load_v2f16_arg: ; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 ; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 -; GCN-DAG: buffer_store_short [[V0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_store_short [[V1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] +; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] +; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: s_endpgm define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { store <2 x half> %arg, <2 x half> addrspace(1)* %out @@ -42,10 +43,7 @@ define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: buffer_store_dwordx2 ; GCN: s_endpgm define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { store <4 x half> %arg, <4 x half> addrspace(1)* %out @@ -280,11 +278,11 @@ define void 
@global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace( } ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: -; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]] -; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} +; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; GCN: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] +; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] +; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} ; GCN: s_endpgm define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in @@ -318,22 +316,8 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x } ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 @@ -378,10 +362,10 @@ define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace } ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: -; GCN-DAG: buffer_load_ushort [[LOAD0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: buffer_load_ushort [[LOAD1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD0]] -; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD1]] +; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} @@ -455,8 +439,9 @@ define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspa ; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] -; GCN-DAG: buffer_store_short [[CVT0]] -; GCN-DAG: buffer_store_short [[CVT1]] +; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] +; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]] +; GCN-DAG: buffer_store_dword [[PACKED]] ; GCN: s_endpgm define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { %val = load <2 x float>, <2 x float> addrspace(1)* %in @@ -487,10 +472,7 @@ define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: 
buffer_store_short -; GCN: buffer_store_short +; GCN: buffer_store_dwordx2 ; GCN: s_endpgm define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %val = load <4 x float>, <4 x float> addrspace(1)* %in @@ -510,14 +492,7 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; GCN: buffer_store_dwordx4 ; GCN: s_endpgm define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { %val = load <8 x float>, <8 x float> addrspace(1)* %in @@ -547,22 +522,8 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 ; GCN-DAG: v_cvt_f16_f32_e32 ; GCN-DAG: v_cvt_f16_f32_e32 ; GCN-DAG: v_cvt_f16_f32_e32 -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short -; GCN-DAG: buffer_store_short +; GCN-DAG: buffer_store_dwordx4 +; GCN-DAG: buffer_store_dwordx4 ; GCN: s_endpgm define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { %val = load <16 x float>, <16 x float> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 2d1b7f0efa2..93001e4c139 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -208,10 +208,7 @@ define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort -; GCN: buffer_store_short v{{[0-9]+}}, off -; GCN: buffer_store_short v{{[0-9]+}}, off -; GCN: buffer_store_short v{{[0-9]+}}, off -; GCN: buffer_store_short v{{[0-9]+}}, off +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { %vecins = insertelement <4 x i16> %a, i16 5, i32 %b store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8 @@ -230,8 +227,7 @@ define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ; GCN: buffer_load_ubyte ; GCN: buffer_load_ubyte -; GCN: buffer_store_byte v{{[0-9]+}}, off -; GCN: buffer_store_byte v{{[0-9]+}}, off +; GCN: buffer_store_short v{{[0-9]+}}, off define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { %vecins = insertelement <2 x i8> %a, i8 5, i32 %b store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 @@ -279,10 +275,7 @@ define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a ; GCN: buffer_load_ubyte ; GCN: buffer_load_ubyte -; GCN: buffer_store_byte v{{[0-9]+}}, off -; GCN: buffer_store_byte v{{[0-9]+}}, off -; GCN: buffer_store_byte v{{[0-9]+}}, off -; GCN: buffer_store_byte v{{[0-9]+}}, off +; GCN: buffer_store_dword v{{[0-9]+}}, off define void 
@dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { %vecins = insertelement <4 x i8> %a, i8 5, i32 %b store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 0eeaef20b5c..ef9791d8f7a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -125,10 +125,9 @@ define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x } ; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN: s_load_dword s +; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xffff{{$}} +; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = zext <2 x i16> %load to <2 x i32> @@ -137,11 +136,9 @@ define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x } ; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort - -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN: s_load_dword s +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i16 ; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -177,15 +174,9 @@ entry: } ; FUNC-LABEL: {{^}}constant_constant_zextload_v4i16_to_v4i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN: s_load_dwordx2 +; GCN-DAG: s_and_b32 +; GCN-DAG: s_lshr_b32 ; EG: VTX_READ_16 ; EG: VTX_READ_16 @@ -199,15 +190,9 @@ define void @constant_constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* % } ; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort - -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN: s_load_dwordx2 +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i16 ; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -229,23 +214,9 @@ define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x } ; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN: s_load_dwordx4 +; GCN-DAG: s_and_b32 +; GCN-DAG: s_lshr_b32 define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = zext <8 x i16> %load to <8 x i32> @@ -254,23 +225,9 @@ define void 
@constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x } ; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort - -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN: s_load_dwordx4 +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i16 define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = sext <8 x i16> %load to <8 x i32> @@ -279,39 +236,9 @@ define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x } ; FUNC-LABEL: {{^}}constant_zextload_v16i16_to_v16i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN: s_load_dwordx8 +; GCN-DAG: s_and_b32 +; GCN-DAG: s_lshr_b32 define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = zext <16 x i16> %load to <16 x i32> @@ -320,6 +247,9 @@ define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, < } ; FUNC-LABEL: {{^}}constant_sextload_v16i16_to_v16i32: +; GCN: s_load_dwordx8 +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i16 define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = sext <16 x i16> %load to <16 x i32> @@ -328,71 +258,10 @@ define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, < } ; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: 
buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-DAG: s_load_dwordx16 +; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]] +; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16 define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = zext <32 x i16> %load to <32 x i32> @@ -401,71 +270,9 @@ define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, < } ; FUNC-LABEL: {{^}}constant_sextload_v32i16_to_v32i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort - -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: 
flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN: s_load_dwordx16 +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i16 define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = sext <32 x i16> %load to <32 x i32> @@ -474,135 +281,8 @@ define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, < } ; FUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: 
flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN: s_load_dwordx16 +; GCN: s_load_dwordx16 define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(2)* %in %ext = zext <64 x i16> %load to <64 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index 5f6ec652874..40c29be6054 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -78,8 +78,9 @@ entry: ; GCN-DAG: v_mov_b32_e32 v[[SHI:[0-9]+]], 0{{$}} ; GCN: store_dwordx2 -; EG: MEM_RAT -; EG: MEM_RAT +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG: CF_END +; EG: VTX_READ_32 define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { %ld = load i32, i32 addrspace(2)* %in %ext = zext i32 %ld to i64 @@ -92,9 +93,10 @@ define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace( ; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[SLO]], 31 ; GCN: store_dwordx2 -; EG: MEM_RAT -; EG: MEM_RAT -; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x +; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY +; EG: CF_END +; EG: VTX_READ_32 +; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal. 
; EG: 31 define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { %ld = load i32, i32 addrspace(2)* %in diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 6697fb0e54f..e4656a2b2ac 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -25,16 +25,10 @@ entry: } ; FUNC-LABEL: {{^}}constant_load_v3i64: -; GCN-DAG: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; SI-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4{{$}} -; VI-DAG: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x10{{$}} +; GCN: s_load_dwordx8 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 -; EG-DAG: VTX_READ_32 +; EG-DAG: VTX_READ_128 +; EG-DAG: VTX_READ_128 define void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(2)* %in diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index c0550c1ef38..87828982a98 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -115,10 +115,8 @@ define void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x } ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i32: -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte +; GCN-NOHSA: buffer_load_ushort +; GCN-HSA: flat_load_ushort ; EG: VTX_READ_8 ; EG: VTX_READ_8 define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { @@ -129,10 +127,12 @@ define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x } ; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i32: -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte +; GCN-NOHSA: buffer_load_ushort + +; GCN-HSA: flat_load_ushort + +; GCN: v_bfe_i32 +; GCN: v_bfe_i32 ; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -176,14 +176,9 @@ entry: } ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i32: -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte +; GCN: s_load_dword s +; GCN-DAG: s_and_b32 +; GCN-DAG: s_lshr_b32 ; EG: VTX_READ_8 ; EG: VTX_READ_8 @@ -197,14 +192,9 @@ define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x } ; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i32: -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte +; GCN: s_load_dword s +; GCN-DAG: s_sext_i32_i8 +; GCN-DAG: s_ashr_i32 ; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -226,23 +216,9 @@ define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x } ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i32: -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: 
buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte -; GCN-NOHSA: buffer_load_ubyte - -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte -; GCN-HSA: flat_load_ubyte +; GCN: s_load_dwordx2 +; GCN-DAG: s_and_b32 +; GCN-DAG: s_lshr_b32 define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = zext <8 x i8> %load to <8 x i32> @@ -251,23 +227,9 @@ define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x } ; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i32: -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte -; GCN-NOHSA: buffer_load_sbyte - -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte -; GCN-HSA: flat_load_sbyte +; GCN: s_load_dwordx2 +; GCN-DAG: s_ashr_i32 +; GCN-DAG: s_sext_i32_i8 define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = sext <8 x i8> %load to <8 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll index cd6b6f6848d..a86cc5a6d3d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll @@ -14,21 +14,21 @@ define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in ret void } -; FUNC-LABEL: {{^}}global_load_v2i64: +; FUNC-LABEL: {{^}}global_load_v2f64: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 { +define void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { entry: - %ld = load <2 x i64>, <2 x i64> addrspace(1)* %in - store <2 x i64> %ld, <2 x i64> addrspace(1)* %out + %ld = load <2 x double>, <2 x double> addrspace(1)* %in + store <2 x double> %ld, <2 x double> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v3f64: -; GCN-NOHSA-DAG: buffer_load_dwordx4 -; GCN-NOHSA-DAG: buffer_load_dwordx2 -; GCN-HSA-DAG: flat_load_dwordx4 -; GCN-HSA-DAG: flat_load_dwordx2 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { entry: %ld = load <3 x double>, <3 x double> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index d621c815b0d..11e6b10c38f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -136,10 +136,8 @@ define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i } ; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-NOHSA: 
buffer_load_dword +; GCN-HSA: flat_load_dword define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i32> @@ -148,11 +146,9 @@ define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i } ; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort +; GCN-NOHSA: buffer_load_dword -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN-HSA: flat_load_dword ; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -190,15 +186,9 @@ entry: } ; FUNC-LABEL: {{^}}global_global_zextload_v4i16_to_v4i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort +; GCN-NOHSA: buffer_load_dwordx2 -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-HSA: flat_load_dwordx2 ; EG: VTX_READ_16 ; EG: VTX_READ_16 @@ -212,15 +202,9 @@ define void @global_global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, } ; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort +; GCN-NOHSA: buffer_load_dwordx2 -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN-HSA: flat_load_dwordx2 ; EG-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]] ; EG-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]] @@ -242,23 +226,8 @@ define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i } ; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort - -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i32> @@ -267,23 +236,8 @@ define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i } ; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort - -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x 
i32> @@ -292,39 +246,11 @@ define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i } ; FUNC-LABEL: {{^}}global_zextload_v16i16_to_v16i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i32> @@ -341,71 +267,15 @@ define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}global_zextload_v32i16_to_v32i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: 
flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i32> @@ -414,71 +284,15 @@ define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}global_sextload_v32i16_to_v32i32: -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort -; GCN-NOHSA: buffer_load_sshort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort -; GCN-HSA: flat_load_sshort +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 +; GCN-HSA: flat_load_dwordx4 define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i32> @@ -487,135 +301,23 @@ define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i32: -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; 
GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort -; GCN-NOHSA: buffer_load_ushort +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 +; GCN-NOHSA: buffer_load_dwordx4 -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: flat_load_ushort -; GCN-HSA: 
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
 define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = zext <64 x i16> %load to <64 x i32>
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index c25470b1b78..5e1171a69be 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1,9 +1,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

 ; FUNC-LABEL: {{^}}global_load_i32:
@@ -99,8 +97,7 @@ entry:
 ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
 ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]

-; EG: MEM_RAT
-; EG: MEM_RAT
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
 define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %ld = load i32, i32 addrspace(1)* %in
   %ext = zext i32 %ld to i64
@@ -115,9 +112,10 @@ define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
 ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}

+
 ; EG: MEM_RAT
-; EG: MEM_RAT
-; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x
+; EG: VTX_READ_32
+; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x
 ; EG: 31
 define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %ld = load i32, i32 addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
index b5367319ec0..305b954c78f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -32,17 +32,14 @@ entry:
 }

 ; FUNC-LABEL: {{^}}global_load_v3i64:
-; GCN-NOHSA-DAG: buffer_load_dwordx4
-; GCN-NOHSA-DAG: buffer_load_dwordx2
-; GCN-HSA-DAG: flat_load_dwordx4
-; GCN-HSA-DAG: flat_load_dwordx2
-
-; EG-DAG: VTX_READ_32
-; EG-DAG: VTX_READ_32
-; EG-DAG: VTX_READ_32
-; EG-DAG: VTX_READ_32
-; EG-DAG: VTX_READ_32
-; EG-DAG: VTX_READ_32
+; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA: buffer_load_dwordx4
+
+; GCN-HSA: flat_load_dwordx4
+; GCN-HSA: flat_load_dwordx4
+
+; EG: VTX_READ_128
+; EG: VTX_READ_128
 define void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x i64>, <3 x i64> addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index c58c3bfa73a..b697967f1a2 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -121,10 +121,9 @@ define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8
 }

 ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i32:
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort
+
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
@@ -135,10 +134,8 @@ define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8
 }

 ; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i32:
-; GCN-NOHSA: buffer_load_sbyte
-; GCN-NOHSA: buffer_load_sbyte
-; GCN-HSA: flat_load_sbyte
-; GCN-HSA: flat_load_sbyte
+; GCN-NOHSA: buffer_load_ushort
+; GCN-HSA: flat_load_ushort

 ; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
@@ -184,14 +181,8 @@ entry:
 }

 ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i32:
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-NOHSA: buffer_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
-; GCN-HSA: flat_load_ubyte
+; GCN-NOHSA: buffer_load_dword
+; GCN-HSA: flat_load_dword

 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
@@ -205,14 +196,8 @@ define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8
 }

 ; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i32:
-; GCN-NOHSA: buffer_load_sbyte
-; GCN-NOHSA: buffer_load_sbyte
-; GCN-NOHSA: buffer_load_sbyte
-; GCN-NOHSA: buffer_load_sbyte
-; GCN-HSA: flat_load_sbyte
-; GCN-HSA: flat_load_sbyte
-; GCN-HSA: flat_load_sbyte
-; GCN-HSA: flat_load_sbyte
+; GCN-NOHSA: buffer_load_dword
+; GCN-HSA: flat_load_dword

 ; EG-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; EG-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
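An illustrative aside (not part of this commit; the function name below is made up): the common shape behind these global-load hunks is a small vector load whose elements are only consumed by zext/sext. When the access is dword-sized and sufficiently aligned, it can be selected as one 32-bit load with the fields unpacked in registers, which is what the new buffer_load_dword/flat_load_dword checks describe. A minimal sketch of that shape in IR:

  ; Hypothetical reduction of the v4i8 cases above: a 4-byte aligned
  ; <4 x i8> load is dword-sized, so it can be one 32-bit load plus
  ; in-register extracts instead of four separate byte loads.
  define void @sketch_widen_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
    %v = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
    %e = zext <4 x i8> %v to <4 x i32>
    store <4 x i32> %e, <4 x i32> addrspace(1)* %out, align 16
    ret void
  }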
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
index 17c09d149ac..77b5e3cf3ae 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
@@ -27,9 +27,10 @@ entry:
   ret void
 }

-; FIXME: should only do one b64 load
+; FIXME: should this do a read2_b64?
 ; FUNC-LABEL: {{^}}local_load_v3f32:
-; GCN: ds_read2_b64
+; GCN-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
 ; GCN: s_waitcnt
 ; GCN-DAG: ds_write_b64
 ; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index 6db67d77425..d3c0af469dd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -132,8 +132,7 @@ define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1
 ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32:
 ; GCN-NOT: s_wqm_b64
 ; GCN: s_mov_b32 m0
-; GCN: ds_read_u16
-; GCN: ds_read_u16
+; GCN: ds_read_b32

 ; EG: LDS_USHORT_READ_RET
 ; EG: LDS_USHORT_READ_RET
@@ -147,8 +146,7 @@ define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1
 ; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32:
 ; GCN-NOT: s_wqm_b64
 ; GCN: s_mov_b32 m0
-; GCN: ds_read_i16
-; GCN: ds_read_i16
+; GCN: ds_read_b32

 ; EG-DAG: LDS_USHORT_READ_RET
 ; EG-DAG: LDS_USHORT_READ_RET
@@ -188,10 +186,7 @@ entry:
 ; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32:
 ; GCN-NOT: s_wqm_b64
 ; GCN: s_mov_b32 m0
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
+; GCN: ds_read_b64

 ; EG: LDS_USHORT_READ_RET
 ; EG: LDS_USHORT_READ_RET
@@ -207,10 +202,7 @@ define void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <
 ; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32:
 ; GCN-NOT: s_wqm_b64
 ; GCN: s_mov_b32 m0
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
+; GCN: ds_read_b64

 ; EG-DAG: LDS_USHORT_READ_RET
 ; EG-DAG: LDS_USHORT_READ_RET
@@ -228,14 +220,7 @@ define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1
 }

 ; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32:
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
 define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
   %ext = zext <8 x i16> %load to <8 x i32>
@@ -244,14 +229,7 @@ define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1
 }

 ; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32:
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
+; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
 define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
   %ext = sext <8 x i16> %load to <8 x i32>
@@ -259,23 +237,16 @@ define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1
   ret void
 }

+; FIXME: Should have 2 ds_read_b64
 ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24
+
+; GCN: ds_write2_b64
+; GCN: ds_write2_b64
+; GCN: ds_write2_b64
+; GCN: ds_write2_b64
 define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
   %ext = zext <16 x i16> %load to <16 x i32>
@@ -284,22 +255,9 @@ define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16
 }

 ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:1{{$}}
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}}
 define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
   %ext = sext <16 x i16> %load to <16 x i32>
@@ -308,39 +266,10 @@ define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16
 }

 ; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
 define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
   %ext = zext <32 x i16> %load to <32 x i32>
@@ -349,38 +278,10 @@ define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
 }

 ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
-; GCN: ds_read_i16
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
 define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
   %ext = sext <32 x i16> %load to <32 x i32>
@@ -388,72 +289,17 @@ define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
   ret void
 }

+; FIXME: Missed read2
 ; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-; GCN: ds_read_u16
-
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:15
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
+; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112
 define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
   %ext = zext <64 x i16> %load to <64 x i32>
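A note for reading the new LDS checks (an aside, not from the patch itself): ds_read2_b64's offset0/offset1 operands are scaled by the 8-byte element size, so offset1:1 means two 8-byte reads at byte offsets 0 and 8, covering a 16-byte vector with one instruction. A hypothetical IR shape that fits a single ds_read2_b64 under that reading:

  ; Hypothetical: a 16-byte aligned <8 x i16> LDS load spans bytes 0-15,
  ; i.e. two b64 elements at offsets 0 and 1, matching offset1:1 above.
  define void @sketch_lds_read2(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
    %v = load <8 x i16>, <8 x i16> addrspace(3)* %in, align 16
    %e = zext <8 x i16> %v to <8 x i32>
    store <8 x i32> %e, <8 x i32> addrspace(3)* %out, align 32
    ret void
  }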
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
index 423fddac3cf..be865b078d7 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -122,8 +122,7 @@ define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8>
 }

 ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32:
-; GCN: ds_read_u8
-; GCN: ds_read_u8
+; GCN: ds_read_u16

 ; EG: LDS_UBYTE_READ_RET
 ; EG: LDS_UBYTE_READ_RET
@@ -137,8 +136,9 @@ define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8>
 ; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32:
 ; GCN-NOT: s_wqm_b64
 ; GCN: s_mov_b32 m0
-; GCN: ds_read_i8
-; GCN: ds_read_i8
+; GCN: ds_read_u16
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
+; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8

 ; EG-DAG: LDS_UBYTE_READ_RET
 ; EG-DAG: LDS_UBYTE_READ_RET
@@ -189,10 +189,7 @@ entry:
 ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32:
 ; GCN-NOT: s_wqm_b64
 ; GCN: s_mov_b32 m0
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
-; GCN: ds_read_u8
+; GCN: ds_read_b32

 ; EG: LDS_UBYTE_READ_RET
 ; EG: LDS_UBYTE_READ_RET
@@ -208,10 +205,7 @@ define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8>
 ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32:
 ; GCN-NOT: s_wqm_b64
 ; GCN: s_mov_b32 m0
-; GCN: ds_read_i8
-; GCN: ds_read_i8
-; GCN: ds_read_i8
-; GCN: ds_read_i8
+; GCN: ds_read_b32

 ; EG-DAG: LDS_UBYTE_READ_RET
 ; EG-DAG: LDS_UBYTE_READ_RET
diff --git a/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
new file mode 100644
index 00000000000..b9f7018b810
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}load_i24:
+; SI: {{flat|buffer}}_load_ubyte
+; SI: {{flat|buffer}}_load_ushort
+; SI: {{flat|buffer}}_store_dword
+define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
+  %1 = load i24, i24 addrspace(1)* %in
+  %2 = zext i24 %1 to i32
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}load_i25:
+; SI-NOHSA: buffer_load_dword [[VAL:v[0-9]+]]
+; SI-NOHSA: buffer_store_dword [[VAL]]
+
+; CI-HSA: flat_load_dword [[VAL:v[0-9]+]]
+; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]]
+define void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 {
+  %1 = load i25, i25 addrspace(1)* %in
+  %2 = zext i25 %1 to i32
+  store i32 %2, i32 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
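The two weird-size cases above differ because of in-memory footprint, not bit count: an i24 occupies three bytes, so widening the load to a dword could read past the object and it stays split into a short plus a byte, while an i25 already occupies a full four bytes and folds into a single dword load. Assuming that reading generalizes (an assumption; this case is not in the test), any width with a four-byte store size should behave like i25. The function below is hypothetical:

  ; Hypothetical companion to load_i25: i28 also has a 4-byte store size,
  ; so the zext-consumed load should likewise select to one dword load.
  define void @load_i28(i32 addrspace(1)* %out, i28 addrspace(1)* %in) {
    %val = load i28, i28 addrspace(1)* %in
    %ext = zext i28 %val to i32
    store i32 %ext, i32 addrspace(1)* %out
    ret void
  }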
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 0970e5d3063..55b392a3272 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,15 +1,14 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
+; XXX - Why the packing?
 ; FUNC-LABEL: {{^}}scalar_to_vector_v2i32:
 ; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: s_endpgm
+; SI: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
+; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
+; SI: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
+; SI: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
+; SI: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
 define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tmp1 = load i32, i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %tmp1 to <2 x i16>
@@ -21,11 +20,7 @@ define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(
 ; FUNC-LABEL: {{^}}scalar_to_vector_v2f32:
 ; SI: buffer_load_dword [[VAL:v[0-9]+]],
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: buffer_store_short [[RESULT]]
-; SI: s_endpgm
+; SI: buffer_store_dwordx2
 define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tmp1 = load float, float addrspace(1)* %in, align 4
   %bc = bitcast float %tmp1 to <2 x i16>
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
index a668f8cc5a3..3857de9f10f 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

 declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
@@ -95,17 +95,6 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: LSHL
-; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
-; EG: ASHR [[RES_HI]]
-; EG-NOT: BFE_INT
-; EG: LSHR
-; EG: LSHR
-;; TODO Check address computation, using | with variables in {{}} does not work,
-;; also the _LO/_HI order might be different
 define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 56
@@ -121,16 +110,6 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounw
 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}

-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG: LSHL
-; EG: BFE_INT {{\*?}} [[RES_LO]], {{.*}}, 0.0, literal
-; EG: ASHR [[RES_HI]]
-; EG-NOT: BFE_INT
-; EG: LSHR
-; EG: LSHR
-;; TODO Check address computation, using | with variables in {{}} does not work,
-;; also the _LO/_HI order might be different
 define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 48
@@ -145,17 +124,6 @@ define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_LO:T[0-9]+\.[XYZW]]], [[ADDR_LO:T[0-9]+.[XYZW]]]
-; EG: MEM_{{.*}} STORE_{{.*}} [[RES_HI:T[0-9]+\.[XYZW]]], [[ADDR_HI:T[0-9]+.[XYZW]]]
-; EG-NOT: BFE_INT
-
-; EG: ASHR [[RES_HI]]
-
-; EG: LSHR
-; EG: LSHR
-;; TODO Check address computation, using | with variables in {{}} does not work,
-;; also the _LO/_HI order might be different
 define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 32
diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
index ea7357d78a8..a6555a19738 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -5,11 +5,11 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}

 ; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
-; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
-; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]

-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -26,12 +26,12 @@ define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)*
 ; GCN-LABEL: {{^}}v_uextract_bit_63_i128:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}

-; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]

-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -48,12 +48,12 @@ define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)*
 ; GCN-LABEL: {{^}}v_uextract_bit_95_i128:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

-; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
-; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]

-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -72,10 +72,10 @@ define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)*

 ; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]

-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ZERO0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO0]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -90,18 +90,16 @@ define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)*

 ; Spans more than 2 dword boundaries
 ; GCN-LABEL: {{^}}v_uextract_bit_34_100_i128:
-; GCN: buffer_load_dwordx2 v{{\[}}[[VAL2:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; GCN: buffer_load_dword v[[VAL1:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}

-; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[}}[[VAL2]]:[[VAL3]]{{\]}}, 30
-; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v[[VAL1]]
+; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, 30
+; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[[0-9]+}}
 ; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]]
 ; GCN-DAG: v_or_b32_e32 v[[OR1:[0-9]+]], 0, v[[SHLHI]]{{$}}

-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ELT2PART]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[OR0]]:[[OR1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
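A side observation on the i128 hunks above (illustrative, not from the commit): once the extracted bit and the zero registers sit in four consecutive VGPRs, the two 8-byte stores merge into one buffer_store_dwordx4, which is why the paired dwordx2 checks collapse into a single check. In IR these tests are just a shift-and-mask, e.g. (hypothetical name):

  ; Hypothetical single-bit extract in the style of the tests above: shift
  ; the bit to position 0 and mask; the 16-byte i128 result then leaves in
  ; one dwordx4 store.
  define void @sketch_uextract_bit(i128 addrspace(1)* %out, i128 addrspace(1)* %in) {
    %ld = load i128, i128 addrspace(1)* %in
    %srl = lshr i128 %ld, 95
    %bit = and i128 %srl, 1
    store i128 %bit, i128 addrspace(1)* %out
    ret void
  }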
diff --git a/llvm/test/CodeGen/AMDGPU/store-barrier.ll b/llvm/test/CodeGen/AMDGPU/store-barrier.ll
index 3838b81dee5..57a93ccd250 100644
--- a/llvm/test/CodeGen/AMDGPU/store-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-barrier.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck %s

 ; This test is for a bug in the machine scheduler where stores without
@@ -17,10 +17,10 @@ bb:
   %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9
   %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2
   %tmp14 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp13
-  %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 2
+  %tmp15 = load <2 x i8>, <2 x i8> addrspace(3)* %tmp14, align 1
   %tmp16 = add i32 %tmp13, 1
   %tmp17 = getelementptr inbounds <2 x i8>, <2 x i8> addrspace(3)* %arg, i32 %tmp16
-  store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 2
+  store <2 x i8> %tmp15, <2 x i8> addrspace(3)* %tmp17, align 1
   tail call void @llvm.amdgcn.s.barrier()
   %tmp25 = load i32, i32 addrspace(1)* %tmp10, align 4
   %tmp26 = sext i32 %tmp25 to i64
diff --git a/llvm/test/CodeGen/AMDGPU/store.ll b/llvm/test/CodeGen/AMDGPU/store.ll
index e107fe1a604..294fd498611 100644
--- a/llvm/test/CodeGen/AMDGPU/store.ll
+++ b/llvm/test/CodeGen/AMDGPU/store.ll
@@ -77,12 +77,31 @@ entry:
   ret void
 }

+; FUNC-LABEL: {{^}}store_i24:
+; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_short
+define void @store_i24(i24 addrspace(1)* %out, i24 %in) {
+entry:
+  store i24 %in, i24 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}store_i25:
+; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
+; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
+; SI: buffer_store_dword [[VAND]]
+define void @store_i25(i25 addrspace(1)* %out, i25 %in) {
+entry:
+  store i25 %in, i25 addrspace(1)* %out
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}store_v2i8:
 ; EG: MEM_RAT MSKOR
 ; EG-NOT: MEM_RAT MSKOR

-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_short
 define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
@@ -96,8 +115,7 @@ entry:

 ; CM: MEM_RAT_CACHELESS STORE_DWORD

-; SI: buffer_store_short
-; SI: buffer_store_short
+; SI: buffer_store_dword
 define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
@@ -110,10 +128,7 @@ entry:

 ; CM: MEM_RAT_CACHELESS STORE_DWORD

-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_dword
 define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
@@ -135,17 +150,9 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
 }

 ; FUNC-LABEL: {{^}}store_v4i16:
-; EG: MEM_RAT MSKOR
-; EG: MEM_RAT MSKOR
-; EG: MEM_RAT MSKOR
-; EG: MEM_RAT MSKOR
-; EG-NOT: MEM_RAT MSKOR
+; MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW

-; SI: buffer_store_short
-; SI: buffer_store_short
-; SI: buffer_store_short
-; SI: buffer_store_short
-; SI-NOT: buffer_store_byte
+; SI: buffer_store_dwordx2
 define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i16>
@@ -239,8 +246,7 @@ define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {

 ; CM: LDS_WRITE

-; SI: ds_write_b16
-; SI: ds_write_b16
+; SI: ds_write_b32
 define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(3)* %out
@@ -252,10 +258,7 @@ entry:

 ; CM: LDS_WRITE

-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
-; SI: ds_write_b8
+; SI: ds_write_b32
 define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(3)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 9f9446a4e60..5a026cdf299 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -58,13 +58,11 @@ define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)
 ; SI: s_sub_u32
 ; SI: s_subb_u32

-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: SUB_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: SUB_INT {{[* ]*}}
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-; EG-DAG: SUB_INT {{[* ]*}}[[HI]]
-; EG-NOT: SUB
+; EG-DAG: SUB_INT {{[* ]*}}
 define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
   %result = sub i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -75,13 +73,11 @@ define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind
 ; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32

-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.[XYZW]]]
-; EG: MEM_RAT_CACHELESS STORE_RAW [[HI:T[0-9]+\.[XYZW]]]
-; EG-DAG: SUB_INT {{[* ]*}}[[LO]]
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: SUB_INT {{[* ]*}}
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-; EG-DAG: SUB_INT {{[* ]*}}[[HI]]
-; EG-NOT: SUB
+; EG-DAG: SUB_INT {{[* ]*}}
 define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
   %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
@@ -110,13 +106,13 @@ define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(
 }

 ; FUNC-LABEL: {{^}}v_test_sub_v4i64:
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
 ; SI: v_subb_u32_e32
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
 ; SI: v_subb_u32_e32
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
 ; SI: v_subb_u32_e32
-; SI: v_sub_i32_e32
+; SI: v_subrev_i32_e32
 ; SI: v_subb_u32_e32
 define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
index 20c49e2809c..9e2373c55e3 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -46,9 +46,8 @@ define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace
   ret void
 }

-; FIXME: Don't want load width reduced here.
 ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v4i16:
-; CHECK: buffer_load_ushort [[VAL:v[0-9]+]]
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_short [[VAL]]
 define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
   %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
index 4ba815f2669..cf5c00e65b7 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll
@@ -2,22 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

 ; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8:
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_dwordx4
 define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
   %trunc = trunc <16 x i32> %in to <16 x i8>
   store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
@@ -25,22 +10,7 @@ define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x
 }

 ; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8:
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
-; SI: buffer_store_byte
+; SI: buffer_store_dwordx4
 define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
   %trunc = trunc <16 x i64> %in to <16 x i8>
   store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
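Finally, an IR-level picture of the truncstore widening checked just above (a sketch under the assumption that the store combine behaves like an explicit bitcast; it is not literal compiler output): packing the <16 x i8> result into four dwords lets the whole value leave in one 16-byte store.

  ; Hypothetical equivalent of the widened truncstore: reinterpret the
  ; <16 x i8> payload as <4 x i32> and store it with a single 16-byte access.
  define void @sketch_truncstore_widened(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
    %trunc = trunc <16 x i32> %in to <16 x i8>
    %bc = bitcast <16 x i8> %trunc to <4 x i32>
    %outbc = bitcast <16 x i8> addrspace(1)* %out to <4 x i32> addrspace(1)*
    store <4 x i32> %bc, <4 x i32> addrspace(1)* %outbc, align 16
    ret void
  }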