diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2015-11-24 12:05:03 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2015-11-24 12:05:03 +0000 |
commit | 4d801cd357c74bb7c2a60fedf4030b9fb5b4827f (patch) | |
tree | ab89b79da213d3d358dbdaf7b6ba19ba2fe2d994 /llvm/test/CodeGen/AMDGPU | |
parent | 9d0f44bf8af57cbe992edada1a5351881b1388b2 (diff) | |
download | bcm5719-llvm-4d801cd357c74bb7c2a60fedf4030b9fb5b4827f.tar.gz bcm5719-llvm-4d801cd357c74bb7c2a60fedf4030b9fb5b4827f.zip |
AMDGPU: Split x8 and x16 vector loads instead of scalarize
The one regression in the builtin tests is in the read2 test, which now
(again) has many extra copies, but this should be solved once the pass
is replaced with a DAG combine.
llvm-svn: 253974
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 10 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll | 76 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/global-extload-i32.ll | 145 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/half.ll | 90 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/load.ll | 34 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/merge-stores.ll | 17 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/reorder-stores.ll | 46 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/salu-to-valu.ll | 27 |
8 files changed, 156 insertions, 289 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index cd554ba256b..834922c62cb 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -137,14 +137,8 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> ; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]] ; SI-NOT: bfe ; SI-NOT: lshr -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8 %cvt = uitofp <8 x i8> %load to <8 x float> diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll index 6b95c031d04..5aca2cf77fb 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -116,19 +116,18 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { ret void } +; FIXME: Extra moves shuffling superregister ; CI-LABEL: {{^}}simple_read2_v8f32_superreg: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: 
buffer_store_dword +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}} +; CI: v_mov_b32 +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}} +; CI: v_mov_b32 +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT4:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}} +; CI: v_mov_b32 +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}} +; CI: v_mov_b32 +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 ; CI: s_endpgm define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 @@ -139,41 +138,30 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { ret void } +; FIXME: Extra moves shuffling superregister ; CI-LABEL: {{^}}simple_read2_v16f32_superreg: -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}} -; CI-NOT: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}} -; CI-NOT: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}} -; CI-NOT: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}} -; CI-NOT: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}} -; CI-NOT: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}} -; CI-NOT: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} -; CI-NOT: v_mov_b32 -; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} 
offset0:1{{$}} -; CI-NOT: v_mov_b32 +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}} +; CI: v_mov_b32 +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:15{{$}} +; CI: v_mov_b32 +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}} +; CI: v_mov_b32 +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT9:[0-9]+]]:[[REG_ELT8:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}} +; CI: v_mov_b32 +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:10 offset1:9{{$}} +; CI: v_mov_b32 +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:14 offset1:13{{$}} +; CI: v_mov_b32 +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:12 offset1:8{{$}} +; CI: v_mov_b32 +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}} +; CI: v_mov_b32 ; CI: s_waitcnt lgkmcnt(0) -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword -; CI: buffer_store_dword +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 +; CI: buffer_store_dwordx4 ; CI: s_endpgm define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-i32.ll b/llvm/test/CodeGen/AMDGPU/global-extload-i32.ll index 79b83452939..ef2f64d673d 100644 --- a/llvm/test/CodeGen/AMDGPU/global-extload-i32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global-extload-i32.ll @@ -106,14 +106,8 @@ define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i } ; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 ; SI-DAG: buffer_store_dwordx2 ; SI-DAG: buffer_store_dwordx2 ; SI-DAG: buffer_store_dwordx2 @@ -131,14 +125,8 @@ define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i } ; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 @@ -166,22 +154,10 @@ define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i } ; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 @@ -219,22 +195,10 @@ define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: 
buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 ; SI: buffer_store_dwordx2 ; SI: buffer_store_dwordx2 @@ -262,41 +226,15 @@ define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword ; SI-DAG: v_ashrrev_i32 ; SI-DAG: v_ashrrev_i32 @@ -376,41 +314,14 @@ define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: 
buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 ; SI-DAG: buffer_store_dwordx2 ; SI-DAG: buffer_store_dwordx2 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 4a3ea5e4a7e..a344d213d1f 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -105,6 +105,26 @@ define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x hal } ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort + +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 + +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { %ext = fpext <8 x half> %arg to <8 x float> store <8 x float> %ext, <8 x float> addrspace(1)* %out @@ -298,6 +318,46 @@ define void 
@global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x } ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort + +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 + +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 + +; GCN: s_endpgm define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { %val = load <16 x half>, <16 x half> addrspace(1)* %in %cvt = fpext <16 x half> %val to <16 x float> @@ -426,14 +486,8 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 } ; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 ; GCN: v_cvt_f16_f32_e32 @@ -459,22 +513,10 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 } ; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: -; GCN: 
buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword -; GCN: buffer_load_dword +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 +; GCN: buffer_load_dwordx4 ; GCN-DAG: v_cvt_f16_f32_e32 ; GCN-DAG: v_cvt_f16_f32_e32 ; GCN-DAG: v_cvt_f16_f32_e32 diff --git a/llvm/test/CodeGen/AMDGPU/load.ll b/llvm/test/CodeGen/AMDGPU/load.ll index 93b1b51a0d0..6a04261fe47 100644 --- a/llvm/test/CodeGen/AMDGPU/load.ll +++ b/llvm/test/CodeGen/AMDGPU/load.ll @@ -277,15 +277,9 @@ entry: ; FUNC-LABEL: {{^}}load_v8i32: ; R600: VTX_READ_128 ; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword + +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) { entry: %0 = load <8 x i32>, <8 x i32> addrspace(1)* %in @@ -298,23 +292,11 @@ entry: ; R600: VTX_READ_128 ; R600: VTX_READ_128 ; R600: VTX_READ_128 -; XXX: We should be using DWORDX4 instructions on SI. 
-; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword + +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) { entry: %0 = load <16 x i32>, <16 x i32> addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll index eb38e10353f..73b09251886 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -613,22 +613,9 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { ret void } -; FIXME: This should do 2 dwordx4 loads ; GCN-LABEL: {{^}}merge_global_store_8_constants_i32: - -; GCN-NOAA: buffer_store_dword v -; GCN-NOAA: buffer_store_dword v -; GCN-NOAA: buffer_store_dword v -; GCN-NOAA: buffer_store_dword v -; GCN-NOAA: buffer_store_dword v -; GCN-NOAA: buffer_store_dword v -; GCN-NOAA: buffer_store_dword v -; GCN-NOAA: buffer_store_dword v - -; GCN-AA: buffer_store_dwordx4 -; GCN-AA: buffer_store_dwordx2 -; GCN-AA: buffer_store_dwordx2 - +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 ; GCN: s_endpgm define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/reorder-stores.ll b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll index 187650ff9a5..712205c2ce2 100644 --- a/llvm/test/CodeGen/AMDGPU/reorder-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll @@ -34,46 +34,16 @@ define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace } ; 
SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store: -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 +; SI: buffer_load_dwordx4 -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword -; SI: buffer_load_dword - - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword - -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword -; SI: buffer_store_dword +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 +; SI: buffer_store_dwordx4 ; SI: s_endpgm define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll index b74f31bcdda..e40732ba3af 100644 --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -162,14 +162,8 @@ entry: ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword +; GCN: buffer_store_dwordx4 
+; GCN: buffer_store_dwordx4 define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() #0 @@ -184,7 +178,7 @@ entry: ; FIXME: should use immediate offset instead of using s_add_i32 for adding to constant. ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: -; GCN: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x13480{{$}} +; GCN-DAG: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x13480{{$}} ; SI-DAG: s_add_i32 s[[OFFSET1:[0-9]+]], s[[OFFSET0]], 16 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET0]]:{{[0-9]+}}], 0 addr64{{$}} @@ -197,6 +191,7 @@ entry: ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET2]]:{{[0-9]+}}], 0 addr64{{$}} ; GCN-DAG: s_add_i32 s[[OFFSET3:[0-9]+]], s[[OFFSET2]], 16 ; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET3]]:{{[0-9]+}}], 0 addr64{{$}} + ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} @@ -205,14 +200,12 @@ entry: ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword -; GCN: buffer_store_dword +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 +; GCN: buffer_store_dwordx4 + +; GCN: s_endpgm define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() #0 |